
"""Processor class for Mllama."""

from typing import List, Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from .image_processing_mllama import make_list_of_images


class MllamaImagesKwargs(ImagesKwargs, total=False):
    max_image_tiles: Optional[int]


class MllamaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MllamaImagesKwargs

    _defaults = {
        "images_kwargs": {
            "max_image_tiles": 4,
        },
    }
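

# A hedged illustration of how the kwargs classes above are consumed (argument
# names follow the usage example in the `MllamaProcessor` docstring further down;
# `img` and `prompt` are hypothetical): per-modality dictionaries passed to the
# processor are validated against `MllamaProcessorKwargs`, so image-processor
# options travel under `images_kwargs`:
#
#     processor(images=img, text=prompt, images_kwargs={"max_image_tiles": 4})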


def get_cross_attention_token_mask(input_ids: List[int], image_token_id: int) -> List[List[int]]:
    """
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (List[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        List[List[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    """
    image_token_locations = [i for i, token in enumerate(input_ids) if token == image_token_id]

    if len(image_token_locations) == 0:
        return []

    # only one image present, unmask until end of sequence
    if len(image_token_locations) == 1:
        return [[image_token_locations[0], -1]]

    vision_masks = [[loc1, loc2] for loc1, loc2 in zip(image_token_locations[:-1], image_token_locations[1:])]

    # last image will attend to all subsequent text
    vision_masks.append([image_token_locations[-1], len(input_ids)])

    # if there are two or more consecutive vision tokens, they should all attend
    # to all subsequent text together
    last_mask_end = vision_masks[-1][1]
    for vision_mask in vision_masks[::-1]:
        if vision_mask[0] == vision_mask[1] - 1:
            vision_mask[1] = last_mask_end
        last_mask_end = vision_mask[1]

    return vision_masks


def convert_sparse_cross_attention_mask_to_dense(
    cross_attention_token_mask: List[List[List[int]]],
    num_tiles: List[List[int]],
    max_num_tiles: int,
    length: int,
) -> np.ndarray:
    """
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (List[List[List[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (List[List[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles).
            The array contains `1` where attention is allowed and `0` where it is not.

    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    """
    batch_size = len(cross_attention_token_mask)
    max_num_images = max([len(masks) for masks in cross_attention_token_mask])

    cross_attention_mask = np.zeros(
        shape=(batch_size, length, max_num_images, max_num_tiles),
        dtype=np.int64,
    )

    for sample_idx, (sample_masks, sample_num_tiles) in enumerate(zip(cross_attention_token_mask, num_tiles)):
        for mask_idx, (locations, mask_num_tiles) in enumerate(zip(sample_masks, sample_num_tiles)):
            if len(locations) == 2:
                start, end = locations
                end = min(end, length)
                # an end position of -1 means "attend until the end of the sequence"
                if end == -1:
                    end = length
                cross_attention_mask[sample_idx, start:end, mask_idx, :mask_num_tiles] = 1

    return cross_attention_mask


def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> str:
    """
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    """
    if bos_token in prompt:
        return prompt

    # keep any image tokens that open the prompt in front of the bos token
    num_image_tokens_on_start = 0
    while prompt.startswith(image_token):
        prompt = prompt[len(image_token) :]
        num_image_tokens_on_start += 1

    return f"{image_token * num_image_tokens_on_start}{bos_token}{prompt}"


class MllamaProcessor(ProcessorMixin):
    r"""
    Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
    [`PreTrainedTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~MllamaProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import MllamaProcessor
        from PIL import Image

        processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

        processor(
            images=your_pil_image,
            text=["<|image|>If I had to write a haiku for this one"],
            images_kwargs={"size": {"height": 448, "width": 448}},
            text_kwargs={"padding": "right"},
            common_kwargs={"return_tensors": "pt"},
        )
        ```

    Args:
        image_processor ([`MllamaImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "MllamaImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer):
        if not hasattr(tokenizer, "image_token"):
            self.image_token = "<|image|>"
            self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        else:
            self.image_token = tokenizer.image_token
            self.image_token_id = tokenizer.image_token_id

        self.python_token = "<|python_tag|>"
        self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
        self.bos_token = tokenizer.bos_token
        self.chat_template = tokenizer.chat_template
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[MllamaProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **aspect_ratio_ids** -- Aspect ratio ids of the preprocessed image tiles. Returned when `images` is not `None`.
            - **aspect_ratio_mask** -- Mask for the valid tiles of each preprocessed image. Returned when `images` is not `None`.
            - **cross_attention_mask** -- Mask linking each image token to the tiles of the corresponding image. Returned
              when both `text` and `images` are not `None`.
        """
        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            MllamaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        text_kwargs = output_kwargs["text_kwargs"]
        images_kwargs = output_kwargs["images_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        data = {}
        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
            n_images_in_text = [t.count(self.image_token) for t in text]
            text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
            _ = text_kwargs.pop("padding_side", None)  # hack until padding_side is an accepted kwarg by tokenizers
            encoding = self.tokenizer(text, **text_kwargs)
            data.update(encoding)

        n_images_in_images = [0]
        if images is not None:
            images = make_list_of_images(images)
            n_images_in_images = [len(sample) for sample in images]

        if text is not None:
            if any(batch_img == 0 for batch_img in n_images_in_text) and not all(
                batch_img == 0 for batch_img in n_images_in_text
            ):
                raise ValueError(
                    "If a batch of text is provided, there should be either no images or at least one image per sample"
                )
            if sum(n_images_in_images) != sum(n_images_in_text):
                if images is None:
                    raise ValueError("No images were provided, but there are image tokens in the prompt")
                else:
                    raise ValueError(
                        f"The number of image tokens ({sum(n_images_in_text)}) should match the number of "
                        f"provided images ({sum(n_images_in_images)})"
                    )

        if images is not None:
            image_features = self.image_processor(images, **images_kwargs)
            num_tiles = image_features.pop("num_tiles")
            data.update(image_features)

        # Create the cross-attention mask linking image tokens to image tiles.
        if images is not None and text is not None:
            cross_attention_token_mask = [
                get_cross_attention_token_mask(token_ids, self.image_token_id) for token_ids in encoding["input_ids"]
            ]
            cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles=num_tiles,
                max_num_tiles=self.image_processor.max_image_tiles,
                length=max(len(input_ids) for input_ids in encoding["input_ids"]),
            )
            data["cross_attention_mask"] = cross_attention_mask

        return_tensors = common_kwargs.pop("return_tensors", None)
        batch_feature = BatchFeature(data=data, tensor_type=return_tensors)

        return batch_feature

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(self, generated_outputs):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.

        Returns:
            `List[str]`: The decoded text.
        TF)skip_special_tokensclean_up_tokenization_spacesr   )rd   generated_outputss     r   post_process_image_text_to_textz/MllamaProcessor.post_process_image_text_to_text^  s(     ~~**4V[ + 
 	
r   c                 ~    | j                   j                  }| j                  j                  }t        ||z   dgz         S )NrE   )rY   model_input_namesrX   r   )rd   tokenizer_input_namesimage_processor_input_namess      r   r   z!MllamaProcessor.model_input_namesn  s?     $ @ @&*&:&:&L&L#),GGKaJbbccr   )NNNN)r   r   r   __doc__
attributesimage_processor_classtokenizer_classrc   r   r   r   r   r   r   r   r   r   r   r   r   r   propertyr   __classcell__)re   s   @r   rW   rW      s    : $[1J2/O5  (,hli$i uY(94	?DQbLccdei ./i 
iV<6
  d dr   rW   )r   typingr   r   r   numpyr>   feature_extraction_utilsr   image_utilsr   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   image_processing_mllamar   r   r   r   r4   ndarrayrN   rp   rU   rW   r   r   r   <module>r      s     " ( (  4 % V V 9#U #,E -d3i - -QUVZ[^V_Q` -`-  $T$s)_ 5- DI-  -  	- 
 ZZ- `"KC "KC "Kc "Kc "KJ}dn }dr   