
    sg\                     <   d Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlmZmZ ddlmZmZ dd	lmZ  e       rdd
lZ e       rdd
lZdZ G d ded      Z G d ded      Z G d ded      Z ddZ!d Z"d Z#d Z$d Z% G d de      Z&y
)z
Processor class for IDEFICS.
    )CallableDictListOptionalUnion)urlparse   )BatchFeature)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack!_validate_images_text_input_order)PreTokenizedInput	TextInput)is_tf_availableis_torch_available)deprecate_kwargN<image>c                   x    e Zd ZU ee   ed<   eeeef      ed<   ee	e
ee
   f      ed<   ee	e
ee
   f      ed<   y)IdeficsImagesKwargs	transform
image_size
image_mean	image_stdN)__name__
__module____qualname__r   r   __annotations__r   strintr   floatr        a/var/www/html/venv/lib/python3.12/site-packages/transformers/models/idefics/processing_idefics.pyr   r   -   sR    !!c3h((ud5k1233eT%[0122r%   r   F)totalc                   .    e Zd ZU ee   ed<   ee   ed<   y)IdeficsTextKwargsadd_eos_tokenadd_end_of_utterance_tokenN)r   r   r   r   boolr    r$   r%   r&   r)   r)   4   s    D>! (.r%   r)   c                   :    e Zd ZU eed<   eed<   ddddi ddidZy	)
IdeficsProcessorKwargstext_kwargsimages_kwargsFlongest)add_special_tokenspaddingr*   return_tensorspt)r/   r0   common_kwargsN)r   r   r   r)   r    r   	_defaultsr$   r%   r&   r.   r.   9   s2    ""&& #( "

 *D1Ir%   r.   c                    |dk7  r-|dk(  r	d| | |k\  <   n|dk(  rt        j                  | |k\  d|       } |dk(  r@| dk(  }d| |<   t        j                  j                  j                  | |      }d||d d f<   |S |dk(  rt        j                  | d      }t        j                  |d|       } t        j
                  | |      }t        j                  |d      }t        j                  |t        j                  |      |      }S )Nr5   tfr   num_classes)depth)	r:   wheretorchnn
functionalone_hotequalexpand_dims
zeros_like)incremental_maskr4   r<   	negatives	attn_masknegatives_expandeds         r&   $incremental_to_binary_attention_maskrJ   H   s   bT!@B-<=t#!xx(8K(GM]^ $*	&'#HH''//0@k/Z	"#	)Q,  
4	HH-r2	88Iq2BCJJ/{C	^^Ir:HH/y1I9U	r%   c                 H    |dk(  rt        | |      S |dk(  rt        | |      S y )Nr5   r:   ),image_attention_mask_for_packed_input_ids_pt,image_attention_mask_for_packed_input_ids_tf)	input_ids	tokenizerr4   s      r&   )image_attention_mask_for_packed_input_idsrP   b   s1    ;IyQQ	4	;IyQQ 
 r%   c                    t        j                  | d      }t        j                  | d      }|j                  t              }|j                  }t        | j                  d            D ]K  }d}d}t        | |         D ]4  \  }	}
|
|k(  r|dz  }|||   |	<   d}n|||   |	<   |rd||   |	<   |
|k(  s3d}6 M t        | j                  d            D ]  }d}d}t        | |   j                  d      dz
  dd      D ]9  }	| |   |	   }
|
|k(  r|dz  }|||   |	<   d}n|||   |	<   |
|k(  rd}|s2d||   |	<   ; ||   dk7  }||   |xx   |z  cc<   ||   |xx   dz  cc<    ||fS )Nr9   )
fill_valuer   F   T)r?   	full_likeconvert_tokens_to_idsIMAGE_TOKENeos_token_idrangesize	enumerate)rN   rO   image_attention_masknext_image_attention_maskimage_token_ideod_token_id	batch_idxcountseen_eodidxtoken_idnon_negative_indicess               r&   rL   rL   i   s    ??9D %	b I44[AN))L9>>!,-  	&y';< 	 MC>)
7<$Y/4 7<$Y/479$Y/4<'	  " 9>>!,- I	9-221592rB 	?C +C0H>)
<A))4S9 <A))4S9<'<>))4S9	?  9CrI!),-ABeKB!),-ABbHB)I,  !:::r%   c                    |j                  t              }|j                  }t        j                  |       d   }t        j
                  t        j                  |       d      }t        j
                  t        j                  |       d      }t        |      D ]  }d}d}	t        j                  |       d   }
t        |
dz
  dd      D ]  }| ||f   j                         }||k(  r<|dz  }||gg}|g}t        j                  |||      }t        j                  |||      }n*||k(  r%|	s#d}	d}||gg}|g}t        j                  |||      }|	s||k7  s||gg}dg}t        j                  |||      }  ||fS )Nr   r9   FrS   T)	rU   rV   rW   r:   shapefillrX   numpytensor_scatter_nd_update)rN   rO   r]   r^   
batch_sizer[   r\   r_   r`   ra   
seq_lengthrb   rc   indicesupdatess                  r&   rM   rM      s   44[AN))L)$Q'J77288I#6; "(;R @:& u	XXi(+
aR0 	uC C0668H>)
%s+, '')'B'BCWY`bi'j$,.,G,GHacjls,t)\)(%s+, ',.,G,GHacjls,t)H4%s+,$,.,G,GHacjls,t)#	uu.  !:::r%   c                 d    d| v ryt        |       }t        |j                  |j                  g      S )zChecks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
    invalidated the url F)r   allschemenetloc)stringresults     r&   is_urlru      s0     f}fFv}}-..r%   c                        e Zd ZdZddgZddgZdZdZd fd	Z e	d	d
dd      	 	 	 	 dde
eeee   ee   eee      eee      f   dee   defd       Zd Zd Zed        Z xZS )IdeficsProcessora  
    Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`IdeficsImageProcessor`):
            An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
        tokenizer (`LlamaTokenizerFast`):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
    image_processorrO   r   r+   IdeficsImageProcessorLlamaTokenizerFastc                    |t        d      |t        d      t        | 	  ||       | j                  | _        t        |d      r|j                  n|j                  t              | _        | j                  j                  | j                  j                  | j                  j                  f| _        d| j                  j                  j                  dg       v rd| _        y d| _        y )Nz)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.image_token<end_of_utterance>additional_special_tokensTF)
ValueErrorsuper__init__rx   current_processorhasattrr]   rU   rV   image_num_channelsr   default_image_dimsrO   special_tokens_mapget1tokenizer_was_trained_with_end_of_utterance_token)selfrx   rO   r   r+   kwargs	__class__s         r&   r   zIdeficsProcessor.__init__   s    "HIIABB)4!%!5!5 y-0 $$00= 	   33  ++  ++#
 $t~~'H'H'L'LMhjl'mm  	>  	>r%   promptsz5.0.0textT)old_nameversionnew_nameraise_if_both_namesr   returnc                 L  ./ ||t        d      t        ||      \  }}||}n|t        |t        t        f      s|g}t        |t
              r|g}t        |t        t        f      r"t        |      t        |      k7  rt        d      t        d |D              st        d      t        |d   t        t        f      r|D cg c]  }|g }}t        t        ||            } | j                  t        fd| j                  j                  i|}|d   j                  d	d
      }	|d   j                  dd      }
|
| j                  }
t        d D              s|g}d.d/d}./fd}g }g }|D ]&  }| j                  j                    }g }d
}d
}t#        |      D ]  \  }}|dkD  r|sdnd
}t        |t
              rg|j%                  d      }t'        |      r:| j(                  j+                  |      }| ||      z  }|j-                  |       d}w|
r|r||z  }||z  }d
}| ||      z  }|j-                  |       d} |	r|| j                  j.                  z  } | j(                  |fi |d   }|j-                  |       |j-                  |       ) |d   j                  dd      } | j                  |fi |d   }|d   }|d   }t1        d |D              }t1        d|      }t3        d |D              dkD  }g }g }g }t        |||      D ]2  \  }} }!|}"|"j5                  | j6                        }#t9        |#|      }$|!d|$ }%t        |%      dkD  r|dk(  r=t;        j<                  |g|%j?                         dd  }&|%|&d|%j?                  d       n|dk(  rtA        jB                  |%      dd }'tA        jD                  |g|'gd      }(tA        j<                  |(|%jF                        }&tA        jB                  |%      d   })tA        jH                  tA        jJ                  |)      d      }*|%}+tA        jL                  |&|*|+      }&nN|dk(  r!t;        j<                  |g| jN                   }&n(|dk(  r#tA        j<                  |g| jN                        }&|j-                  &       |dk(  rJ|j-                  t;        jP                  |"             |j-                  t;        jP                  |              |dk(  s|j-                  tA        jR                  |"t@        jT                               |j-                  |        5 |dk(  r@t;        jV                  |      }t;        jV                  |      }t;        jV                  |      }nD|dk(  r?tA        jV                  |      }tA        jV                  |      }tA        jV                  |      }|r)tY        || j                  |      \  },}-t[        |,||      },n|dk(  rBt;        j<                  |jB                  d   |jB                  d   dt:        j\                        },nG|dk(  rBtA        j<                  |jB                  d   |jB                  d   dft@        j\                        },t_        |||,d !      S c c}w )"a  This method takes batched or non-batched prompts made of text and images and converts them into prompts that
        the model was trained on and prepares the image pixel values for the model to process.

        Args:
            images (`Union[PIL.Image, str, List[PIL.Image], List[str]]`):
                either a single image or a batched list of images - can be passed in when text contains only text prompts,
                in order to use the image-text-to-text behavior.
            text (`Union[List[TextInput], [List[List[TextInput]]]]`):
                either a single prompt or a batched list of prompts - see the detailed description immediately after
                the end of the arguments doc section.
            return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
                The type of tensors to return. Can be one of:
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.

        Returns:
            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
            directly passed to `model.generate`

        Detailed explanation:

        Each entry in `text` is either a text to be passed as is or an image that will be processed.

        An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
        entry into the prompt.

        Example:

        ```python
        checkpoint = "HuggingFaceM4/idefics-9b"
        processor = AutoProcessor.from_pretrained(checkpoint)
        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
        img = processor.image_processor.fetch_images([url])[0]

        prompts = [
            "User:",
            img,
            "Describe this image.
Assistant: An image of two kittens in grass.
",
            "User:",
            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
            "Describe this image.
Assistant:",
        ]

        inputs = processor(text=prompts, return_tensors="pt")
        generated_ids = model.generate(**inputs, max_length=100)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```

        In this example the `prompts` will be converted into:

        ```
        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant: An image of two kittens in grass.
        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant:'
        ```

        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
        `pixel_values` dict entry of the return value.

        This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
        first image is passed as object and the second one as a url.

        To do training do:

        ```python
        image_transform = transforms.Compose(
            [
                transforms.RandomResizedCrop(
                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                transforms.Normalize(mean=self.image_mean, std=self.image_std),
            ]
        )
        inputs = processor(text=prompts, transform=image_transform, return_tensors="pt")
        ```

        In order to help debug prompt generation enable `debug=True` which will show you what's happening.

        Nz9You need to specify either `text` or `images` and `text`.a  When providing both images and text arguments, the number of text prompts should be the same as the number of images.If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...].c              3   <   K   | ]  }t        |t                y wN)
isinstancer!   .0is     r&   	<genexpr>z,IdeficsProcessor.__call__.<locals>.<genexpr>h  s     8az!S)8s   zQWhen using the image-text-to-text behavior, the prompts should only contain text.r   tokenizer_init_kwargsr/   r*   Fr+   c              3   H   K   | ]  }t        |t        t        f        y wr   )r   listtupler   s     r&   r   z,IdeficsProcessor.__call__.<locals>.<genexpr>|  s     AA:a$/As    "z<fake_token_around_image>r   r}   c                 "    | rz   S z   z   S r   r$   )last_was_image
fake_tokenr|   s    r&   image_tokensz/IdeficsProcessor.__call__.<locals>.image_tokens  s!    "Z//!K/*<<r%   Tro   r0   r4   r5   rN   attention_maskc              3   2   K   | ]  }t        |        y wr   lenr   xs     r&   r   z,IdeficsProcessor.__call__.<locals>.<genexpr>  s     8SV8   rS   c              3   2   K   | ]  }t        |        y wr   r   r   s     r&   r   z,IdeficsProcessor.__call__.<locals>.<genexpr>  s      <AQ <r   r:   )axis)dtype)r9   rS   r;   )rN   r   pixel_valuesr[   )data)0r   r   r   r   r   r!   r   rp   zip_merge_kwargsr.   rO   init_kwargspopr   any	bos_tokenrZ   stripru   rx   fetch_imagesappend	eos_tokenmaxsumr`   r]   minr?   zerosrY   r:   rf   concatr   reshaperX   ri   r   tensorconvert_to_tensorint32stackrP   rJ   r,   r
   )0r   imagesr   audiovideosr   r   r   output_kwargsr*   r+   end_of_utterance_tokenr   all_prompts
all_imagessample	full_textimage_objectsr   last_was_textitemimager4   text_encoding	all_textsall_attention_masksmax_num_imagesat_least_one_imageoutput_input_idsoutput_imagesoutput_attention_maskstext_singler   extracted_imagespadded_input_idsimage_countlocal_max_num_imagescurrent_imagespadded_image_tensorimage_shapepadded_shape
num_imagesrl   rm   r[   _r   r|   s0                                                 @@r&   __call__zIdeficsProcessor.__call__   s   D >dlXYY8F>G ftUm4 $$v$u.3t9F3K q 
 8488 !tuu&)dE]3%)***3vt,-G***"
"&.."<"<
 
 &m488%P%2=%A%E%EFbdh%i" &-)-)_)_&AAAiG0
!5	= 
 %	-F>>334I M"N!M$V, *4q50>DEMdC(::c?Dd| $ 4 4 A A$ G!\.%AA	%,,U3)- 6-%)??I!T)	). n!==I!((.%)N+*. T^^555	0D00a-P_B`aMy)m,K%	-P '}599:JDQ&{SmM6RS!+.	+,<= 8Z88Q/  < <<q@!#=@L_ak=l %	>9K)9**001D1DEK#&{N#C -.C/CDN>"Q&!T)*/++n*a~GZGZG\]^]_G`*a'DR'(@.*=*=a*@A#t+ #%((>":12">K#%99~.>-LST#UL*,((<~G[G[*\'!#.!9!!<J jj*)=wGG,G*,*E*EFY[bdk*l'!T)*/++n*_tG^G^*_'#t+*,((N3]TE\E\3]*^'  !45% ''5E(FG&--ell>.JK4' ''(<(<=MUWU]U](^_&--n=K%	>N T!${{+;<!KK6M%*[[1G%H"t#!xx(89HH]3M%'XX.D%E"&O $...'# ! $H$n.$ 
 %',{{$**1-/?/E/Ea/H!SXS]S]($  4'')xx%++A.0@0F0Fq0I1MUWU\U\($ -"8 -(<	
 	
m +s   
Z!c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )rO   batch_decoder   argsr   s      r&   r   zIdeficsProcessor.batch_decode  s     
 +t~~**D;F;;r%   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )rO   decoder   s      r&   r   zIdeficsProcessor.decode  s     
 %t~~$$d5f55r%   c                     | j                   j                  }| j                  j                  }t        t        j                  ||z               S r   )rO   model_input_namesrx   r   dictfromkeys)r   tokenizer_input_namesimage_processor_input_namess      r&   r   z"IdeficsProcessor.model_input_names  s?     $ @ @&*&:&:&L&L#DMM"7:U"UVWWr%   )N   N)NNNN)r   r   r   __doc__
attributesvalid_kwargsimage_processor_classtokenizer_classr   r   r   r   r   r   r   r.   r
   r   r   r   propertyr   __classcell__)r   s   @r&   rw   rw      s     $[1J ">?L3*O
4 i6_cd  X
 O"#i!'()+
X
 /0X
 
X
 eX
t<6 X Xr%   rw   )r9   )'r   typingr   r   r   r   r   urllib.parser   feature_extraction_utilsr
   processing_utilsr   r   r   r   r   r   tokenization_utils_baser   r   utilsr   r   utils.deprecationr   r?   
tensorflowr:   rV   r   r)   r.   rJ   rP   rL   rM   ru   rw   r$   r%   r&   <module>r      s    9 8 ! 4  D 8 0 3,e 3/
% /
-U 4R,;^;B/[X~ [Xr%   