
    sgM                        d Z ddlZddlZddlmZmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZ dd	lmZmZmZmZ dd
l m!Z!  e       rddl"Z"ddl#m$Z$m%Z%m&Z&  e       rddl'Z' ejP                  e)      Z*dZ+d Z,	 	 	 	 	 	 	 	 	 d de-de.de-de-de.de.de.de.dee/   dee-   de$jH                  fdZ0	 d!dejb                  de-deee-e2f      fdZ3 G d de      Z4y)"z%Image processor class for Pix2Struct.    N)DictOptionalUnion)hf_hub_download   )BaseImageProcessorBatchFeature)convert_to_rgb	normalizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputget_image_sizeinfer_channel_dimension_formatmake_list_of_imagesto_numpy_arrayvalid_images)
TensorTypeis_torch_availableis_vision_availablelogging)requires_backends)Image	ImageDraw	ImageFontzybelkada/fontsc                    t        t        dg       | j                  d      } t        j                  j
                  j                  | ||f||f      }|j                  | j                  d      | j                  d      ||d      }|j                  ddddd      j                  | j                  d      |z  | j                  d      |z  | j                  d      |z  |z        }|j                  d      S )	a  
    Utiliy function to extract patches from a given image tensor. Returns a tensor of shape (1, `patch_height`,
    `patch_width`, `num_channels`x `patch_height` x `patch_width`)

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    torchr   )stride         r   )
r   torch_extract_patches	unsqueezer   nn
functionalunfoldreshapesizepermute)image_tensorpatch_heightpatch_widthpatchess       m/var/www/html/venv/lib/python3.12/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr$   r$   4   s     +gY7))!,Lhh!!((k7R\hju[v(wGool//2L4E4Ea4H,XceghGooaAq!,44!,!+!|+k9G
 Q    text	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 T   t        t        d       t        j                  d      }
|
j	                  |       }dj                  |      }||	t        j                  |      }n|	|	}nt        t        d      }t        j                  |d|      }t        j                  t        j                  d	d
|            }|j!                  d||      \  }}}}||z   |z   }||z   |z   }t        j                  d	||f|      }t        j                  |      }|j#                  ||f|||       |S )a  
    Render text. This script is entirely adapted from the original script that can be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    Args:
        text (`str`, *optional*, defaults to ):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.
    visionP   )width)r2   
z	Arial.TTFzUTF-8)encodingr*   RGB)r    r    r   r   )xyr2   fillfont)r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   truetyper   Drawr   newtextbboxr2   )r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   wrapperlineswrapped_textrG   	temp_draw_
text_widthtext_heightimage_widthimage_heightimagedraws                         r0   rH   rH   O   s$   L k8, "",GLLdL#E99U#L)"3zz*%		0+>dW9ED uyy8HIJI$-$6$6v|T$R!Aq*k |+m;K,~=LIIek<8:JKE>>% DII,,<jW[I\Lr1   r]   headerinput_data_formatc                 v   t        t        d       t        | |      } t        |fi |}t	        |j
                  | j
                        }t        | j                  || j
                  z  z        }t        |j                  ||j
                  z  z        }t        j                  d|||z   fd      }|j                  |j                  ||f      d       |j                  | j                  ||f      d|f       t        |      }t        |      t        j                  k(  rt!        |t        j                        }|S )a  
    Renders the input text as a header on the input image.

    Args:
        image (`np.ndarray`):
            The image to render the header on.
        header (`str`):
            The header text.
        data_format (`Union[ChannelDimension, str]`, *optional*):
            The data format of the image. Can be either "ChannelDimension.channels_first" or
            "ChannelDimension.channels_last".

    Returns:
        `np.ndarray`: The image with the header rendered.
    r>   )r`   rC   whiterD   r   )r   render_headerr   rH   maxr@   intheightr   rR   pasteresizer   r   r   LASTr   )	r]   r_   r`   kwargsheader_image	new_width
new_heightnew_header_height	new_images	            r0   rc   rc      s   $ mX. 2CDEv00LL&&4IU\\Y%<=>JL//9|?Q?Q3QRS		%)Z:K-K!LgVIOOL''4E(FGPOOELL)Z!89A?P;QR y)I%i04D4I4II/	;K;P;PQ	r1   c                       e Zd ZdZdgZ	 	 	 	 	 ddededeeef   deded	df fd
Z		 dde
j                  dededeeeef      d	e
j                  f
dZ	 	 dde
j                  deeeef      deeeef      d	e
j                  fdZddddddej$                  dfdedee   dedee   dee   deeeef      deeeef      dedeeeef      d	efdZ xZS )Pix2StructImageProcessoraf  
    Constructs a Pix2Struct image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 2048):
            The maximum number of patches to extract from the image as per the [Pix2Struct
            paper](https://arxiv.org/pdf/2210.03347.pdf).
        is_vqa (`bool`, *optional*, defaults to `False`):
            Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
            rendered onto the input images.
    flattened_patchesNdo_convert_rgbdo_normalize
patch_sizemax_patchesis_vqar<   c                 x    t        |   di | ||nddd| _        || _        || _        || _        || _        y )N   )rf   r@    )super__init__ru   rt   rs   rv   rw   )selfrs   rt   ru   rv   rw   rj   	__class__s          r0   r|   z!Pix2StructImageProcessor.__init__   sH     	"6"(2(>*r\^D_(,&r1   r]   r`   c           	         t        | j                  d       t        |t        j                  |      }t        j                  |      }|d   |d   }}t        |t        j                        \  }}	t        j                  |||z  z  ||	z  z        }
t        t        t        j                  |
|z  |z        |      d      }t        t        t        j                  |
|	z  |z        |      d      }t        ||z  d      }t        ||z  d      }t
        j                  j                  j                  |j!                  d      ||fddd	      j#                  d      }t%        |||      }|j&                  }|d   }|d
   }|d   }|j)                  ||z  |g      }t        j*                  |      j)                  |dg      j-                  d|      j)                  ||z  dg      }t        j*                  |      j)                  d|g      j-                  |d      j)                  ||z  dg      }|dz  }|dz  }|j/                  t
        j0                        }|j/                  t
        j0                        }t        j2                  |||gd      }t
        j                  j                  j5                  |ddd|||z  z
  g      j7                         }t9        |      }|S )a  
        Extract flattened patches from an image.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.

        Returns:
            result (`np.ndarray`):
                A sequence of `max_patches` flattened patches.
        r   rf   r@   r    r   bilinearFT)r*   modealign_corners	antialiasr#   r   r!   )r   extract_flattened_patchesr   r   FIRSTr   
from_numpyr   mathsqrtrd   minfloorr&   r'   interpolater%   squeezer$   shaper)   arangerepeattofloat32catpadfloatr   )r}   r]   rv   ru   r`   rj   r-   r.   r\   r[   scalenum_feasible_rowsnum_feasible_colsresized_heightresized_widthr/   patches_shaperowscolumnsdepthrow_idscol_idsresults                          r0   r   z2Pix2StructImageProcessor.extract_flattened_patches   s   . 	$88'B ,E3C3I3IK\]  '$.x$8*W:Mk$25:J:P:P$Q!k 		+)DEWbIbcdDJJu|/Cl/R$SU` acdeDJJu{/B[/P$QS^ _abc.=qA-;Q?##//OOA -0 0 
 '!* 	 (|[IQ"a  //4'>5"9: ,,t$,,dAY7>>q'JRRTX[bTbdeSfg,,w'//G=DDT1MUUW[^eWeghVij 	11 **U]]+**U]]+ GWg6; $$((!Q;$QX.;Y1Z[aac'r1   data_formatc           	      n   |j                   t        j                  k(  r|j                  t        j                        }t        j
                  |      }t        j                  |      }t        |dt        j                  t        j                  |j                              z        }t        |f||||d|S )a  
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        g      ?)meanstdr   r`   )dtypenpuint8astyper   r   r   rd   r   r   prodr   r   )r}   r]   r   r`   rj   r   r   adjusted_stddevs           r0   r   z"Pix2StructImageProcessor.normalize5  s    , ;;"(("LL,E wwu~ffUmc32775;;3G)H#HI
#/
 
 	
r1   imagesheader_textreturn_tensorsc
           
          ||n| j                   }||n| j                  }||n| j                  }||n| j                  }| j                  }|
j                  dd      t        d      t        |      }t        |      st        d      |r|D cg c]  }t        |       }}|D cg c]  }t        |       }}|	t        |d         }	|r}|t        d      |
j                  dd      }|
j                  dd      }t        |t              r|gt        |      z  }t!        |      D cg c]  \  }}t#        |||   ||	       }}}|r |D cg c]  }| j%                  ||	
       }}|D cg c]  }| j'                  ||||	       }}|D cg c]4  }|j)                  d      dk7  j+                  t,        j.                        6 }}t1        ||d|      }|S c c}w c c}w c c}}w c c}w c c}w c c}w )a  
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
        image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
        images are standardized following the tensorflow implementation of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).


        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            header_text (`Union[List[str], str]`, *optional*):
                Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Nr   z8data_format is not an accepted input as the outputs are zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.r   z.A header text must be provided for VQA models.r:   r;   )r:   r;   )r]   r`   )r]   rv   ru   r`   r!   )axis)rr   attention_mask)datatensor_type)rt   rs   ru   rv   rw   get
ValueErrorr   r   r
   r   r   pop
isinstancestrlen	enumeraterc   r   r   sumr   r   r   r	   )r}   r   r   rs   rt   rv   ru   r   r   r`   rj   rw   r]   r:   r;   iattention_masksencoded_outputss                     r0   
preprocessz#Pix2StructImageProcessor.preprocess\  s?   j (4'?|TEVEV+9+E4K^K^#-#9Zt
%0%<k$BRBR::mT*6WXX$V,F#:  9?@nU+@F@ 6<<E.'<<$ >vay I" !QRRL$7J

;5I+s+*mc&k9 !*& 1Au e[^
V_`F 
 djk[`dnn5DUnVkFk  	
  **_p + 
 
 V\\EEII2I.!3;;BJJG\\&'-Q_m
 S A = l
 ]s$   G!)G&-G+G12G69G;)TTNi   FN)NN)__name__
__module____qualname____doc__model_input_namesboolr   r   re   r|   r   ndarraydictr   r   r   r   r   r   r   r   r   __classcell__)r~   s   @r0   rq   rq      s   ( --  $!%)  cN	
   
* EIOzzO O 	O
 $E#/?*?$@AO 
Oh ?CDH	%
zz%
 eC)9$9:;%
 $E#/?*?$@A	%
 
%
T &*#'+%)/3;?(8(>(>DHqq c]q 	q
 tnq c]q T#s(^,q !sJ!78q &q $E#/?*?$@Aq 
qr1   rq   )	$   blackrb      r   r   r   NNr   )5r   rM   r   typingr   r   r   numpyr   huggingface_hubr   image_processing_utilsr   r	   image_transformsr
   r   r   r   image_utilsr   r   r   r   r   r   r   utilsr   r   r   r   utils.import_utilsr   rI   PILr   r   r   r   
get_loggerr   loggerrO   r$   r   re   bytesrH   r   ChildProcessErrorrc   rq   rz   r1   r0   <module>r      sd   , 	  ( (  + F d d   R Q 3 //			H	%$  : #"&#@
@@ @ 	@
 @ @ @ @ @ }@ [[@J bf'::'"'7?cK\F\@]7^'TP1 Pr1   