import enum
from typing import Dict, List, Optional, Union

from ..processing_utils import ProcessingKwargs, Unpack
from ..utils import (
    add_end_docstrings,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_images, valid_images

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
    from .pt_utils import KeyDataset

logger = logging.get_logger(__name__)

IMAGE_TOKEN = "<image>"


class ReturnType(enum.Enum):
    TENSORS = 0
    NEW_TEXT = 1
    FULL_TEXT = 2


class Chat:
    """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
    to this format because the rest of the pipeline code tends to assume that lists of messages are
    actually a batch of samples rather than messages in the same conversation."""

    def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image", List["Image.Image"]]):
        for message in messages:
            if not ("role" in message and "content" in message):
                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
        images = retrieve_images_in_messages(messages, images)

        self.messages = messages
        self.images = images


def retrieve_images_in_messages(
    messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]]
):
    """
    Retrieve and combine images from the chat and the images passed as input.
    """
    if images is None:
        images = []
    # `idx_images` indexes the images passed directly to the pipeline; they are consumed in order by
    # image placeholders in the chat that don't carry their own image.
    idx_images = 0
    retrieved_images = []
    for message in messages:
        for content in message["content"]:
            if isinstance(content, dict):
                if content.get("type") == "image":
                    for key in ["image", "url", "path", "base64"]:
                        if key in content:
                            retrieved_images.append(content[key])
                            break
                    else:
                        if idx_images < len(images):
                            retrieved_images.append(images[idx_images])
                            idx_images += 1
                        else:
                            raise ValueError(
                                "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
                            )
                # Support the OpenAI-style chat format as well
                elif content.get("type") == "image_url":
                    if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
                        retrieved_images.append(content["image_url"]["url"])
                        # Rewrite the content to be in the Transformers chat format
                        content["type"] = "image"
                        content["image"] = content["image_url"]["url"]
                        del content["image_url"]
                    else:
                        raise ValueError(
                            "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
                        )

    # The number of images passed directly should match the number of image placeholders without their own image
    if idx_images != len(images):
        raise ValueError(
            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
        )

    return retrieved_images


@add_end_docstrings(build_pipeline_init_args(has_processor=True))
class ImageTextToTextPipeline(Pipeline):
    """
    Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text.
    When the underlying model is a conversational model, it can also accept one or more chats,
    in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s).
    Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")
    >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
    [{'generated_text': 'a photo of two birds'}]
    ```
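
    A batch can be passed as parallel lists of images and prompts, in which case the two lists should have the
    same length (a minimal sketch reusing the pipeline above; output omitted):

    ```python
    >>> url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
    >>> outputs = pipe([url, url], text=["A photo of", "In this image,"])
    ```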

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
    >>> messages = [
    >>>     {
    >>>         "role": "user",
    >>>         "content": [
    >>>             {
    >>>                 "type": "image",
    >>>                 "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
    >>>             },
    >>>             {"type": "text", "text": "Describe this image."},
    >>>         ],
    >>>     },
    >>>     {
    >>>         "role": "assistant",
    >>>         "content": [
    >>>             {"type": "text", "text": "There is a dog and"},
    >>>         ],
    >>>     },
    >>> ]
    >>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
    [{'input_text': [{'role': 'user',
        'content': [{'type': 'image',
        'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
        {'type': 'text', 'text': 'Describe this image.'}]},
    {'role': 'assistant',
        'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
    'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
    ```
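
    Chats that use the OpenAI-style `image_url` content type are also accepted and are converted to the format
    above internally. A minimal sketch, reusing `pipe` from the previous example (output omitted):

    ```python
    >>> messages = [
    >>>     {
    >>>         "role": "user",
    >>>         "content": [
    >>>             {
    >>>                 "type": "image_url",
    >>>                 "image_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
    >>>             },
    >>>             {"type": "text", "text": "Describe this image."},
    >>>         ],
    >>>     },
    >>> ]
    >>> outputs = pipe(text=messages, max_new_tokens=20)
    ```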

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image-text-to-text pipeline can currently be loaded from pipeline() using the following task identifier:
    "image-text-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text).
    """

    _load_processor = True
    _load_image_processor = False
    _load_feature_extractor = False
    _load_tokenizer = False

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)

    def _sanitize_parameters(
        self,
        max_new_tokens=None,
        generate_kwargs=None,
        timeout=None,
        return_full_text=None,
        return_tensors=None,
        return_type=None,
        continue_final_message=None,
        **kwargs: Unpack[ProcessingKwargs],
    ):
        forward_kwargs = {}
        preprocess_params = {}
        postprocess_params = {}

        preprocess_params["processing_kwargs"] = kwargs

        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if continue_final_message is not None:
            preprocess_params["continue_final_message"] = continue_final_message

        if generate_kwargs is not None:
            forward_kwargs["generate_kwargs"] = generate_kwargs

        if max_new_tokens is not None:
            if "generate_kwargs" not in forward_kwargs:
                forward_kwargs["generate_kwargs"] = {}
            if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
                raise ValueError(
                    "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
                    " please use only one"
                )
            forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens

        if return_full_text is not None and return_type is None:
            if return_tensors is not None:
                raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
            return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
        if return_tensors is not None and return_type is None:
            return_type = ReturnType.TENSORS
        if return_type is not None:
            postprocess_params["return_type"] = return_type
        if continue_final_message is not None:
            postprocess_params["continue_final_message"] = continue_final_message

        return preprocess_params, forward_kwargs, postprocess_params

    def __call__(
        self,
        images: Optional[
            Union[str, List[str], List[List[str]], "Image.Image", List["Image.Image"], List[List["Image.Image"]]]
        ] = None,
        text: Optional[Union[str, List[str], List[dict]]] = None,
        **kwargs,
    ):
        """
        Generate a text given text and the image(s) passed as inputs.

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.
            text (`str`, `List[str]`, `List[Dict[str, Union[str, PIL.Image]]]`):
                The text to be used for generation. If a list of strings is passed, the length of the list should be the
                same as the number of images. Text can also follow the chat format: a list of dictionaries where each
                dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and
                'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of dictionaries
                containing the text of the message and the type of the message. The type of the message can be either
                'text' or 'image'. If the type is 'image', no text is needed.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Returns the tensors of predictions (as token indices) in the outputs. If set to
                `True`, the decoded text is not returned.
            return_text (`bool`, *optional*):
                Returns the decoded texts in the outputs.
            return_full_text (`bool`, *optional*, defaults to `True`):
                If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
                specified at the same time as `return_text`.
            continue_final_message (`bool`, *optional*): This indicates that you want the model to continue the
                last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
                By default this is `True` when the final message in the input chat has the `assistant` role and
                `False` otherwise, but you can manually override that behaviour by setting this flag.
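
                A schematic prefill example, with `pipe` built as in the class-level examples and `img` standing
                for any supported image input (output omitted):

                ```python
                >>> chat = [
                >>>     {"role": "user", "content": [{"type": "image", "image": img}, {"type": "text", "text": "Describe this image."}]},
                >>>     {"role": "assistant", "content": [{"type": "text", "text": "This image shows"}]},
                >>> ]
                >>> outputs = pipe(text=chat, continue_final_message=True)
                ```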

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key (cannot return a combination
            of both `generated_text` and `generated_token_ids`):

            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
            - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True`) -- The token
                ids of the generated text.
            - **input_text** (`str`) -- The input text.
        """
        if text is None and images is None:
            raise ValueError("You must at least provide either text or images.")

        if text is None and not valid_images(images):
            # `images` does not contain plain images (it may be a dataset or a list of chats), so we let
            # the base pipeline logic iterate over it.
            return super().__call__(images, **kwargs)

        if isinstance(text, (list, tuple, KeyDataset)) and isinstance(text[0], (list, tuple, dict)):
            # We have one or more prompts in list-of-dicts format, so this is chat mode
            if isinstance(text[0], dict):
                return super().__call__(Chat(text, images), **kwargs)
            else:
                if images is None:
                    images = [None] * len(text)
                chats = [Chat(chat, image) for chat, image in zip(text, images)]
                return super().__call__(chats, **kwargs)

        # Encourage the user to use the chat format if the model supports it
        if getattr(self.processor, "chat_template", None) is not None:
            logger.warning_once(
                "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even "
                "though this model supports chat. Consider using the chat format for better results. For more "
                "information, see https://huggingface.co/docs/transformers/en/chat_templating"
            )

        if images is None:
            # Text-only generation
            return super().__call__(text, **kwargs)
        if text is None:
            raise ValueError("You must provide text for this pipeline.")

        return super().__call__({"images": images, "text": text}, **kwargs)

    def preprocess(self, inputs=None, timeout=None, continue_final_message=None, processing_kwargs=None):
        processing_kwargs = {} if processing_kwargs is None else processing_kwargs
        # In case we only have text inputs
        if isinstance(inputs, (list, tuple, str)):
            images = None
            text = inputs
            inputs_text = inputs
        else:
            if isinstance(inputs, Chat):
                # If the user passes a chat that ends in an assistant message, we treat it as a prefill by
                # default because very few models support multiple separate, consecutive assistant messages
                if continue_final_message is None:
                    continue_final_message = inputs.messages[-1]["role"] == "assistant"
                text = self.processor.apply_chat_template(
                    inputs.messages,
                    add_generation_prompt=not continue_final_message,
                    continue_final_message=continue_final_message,
                    return_tensors=self.framework,
                )
                inputs_text = inputs
                images = inputs.images
            else:
                text = inputs["text"]
                inputs_text = inputs["text"]
                images = inputs["images"]

            images = load_images(images, timeout=timeout)

        # If batched text inputs, we set padding to True unless specified otherwise
        if isinstance(text, (list, tuple)) and len(text) > 1:
            processing_kwargs.setdefault("padding", True)
        model_inputs = self.processor(
            images=images, text=text, return_tensors=self.framework, legacy=False, **processing_kwargs
        ).to(dtype=self.torch_dtype)

        model_inputs["text"] = inputs_text

        return model_inputs

    def _forward(self, model_inputs, generate_kwargs=None):
        generate_kwargs = {} if generate_kwargs is None else generate_kwargs
        prompt_text = model_inputs.pop("text")
        input_ids = (
            model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"]
        )  # encoder-decoder models use `decoder_input_ids`
        generated_sequence = self.model.generate(**model_inputs, **generate_kwargs)

        return {"generated_sequence": generated_sequence, "prompt_text": prompt_text, "input_ids": input_ids}

    def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, continue_final_message=None):
        input_texts = model_outputs["prompt_text"]
        input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts
        generated_sequence = model_outputs["generated_sequence"]
        input_ids = model_outputs["input_ids"]
        if return_type == ReturnType.TENSORS:
            return [
                {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]}
                for i in range(len(input_texts))
            ]

        # Decode the inputs and the generated outputs the same way, so the input text can be located in
        # the generated text and stripped when only the new text is requested
        generated_texts = self.processor.post_process_image_text_to_text(generated_sequence)
        decoded_inputs = self.processor.post_process_image_text_to_text(input_ids)

        if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
            # Remove the input text from the generated text if the generated text starts with the input text
            # (accounting for the possibility of a space between the input and generated text)
            new_generated_texts = []
            for text_generated, decoded_input in zip(generated_texts, decoded_inputs):
                # There can be added characters before the input text, so we need to find the beginning of
                # the input text in the generated text
                index_input_text = text_generated.find(decoded_input)
                # Limit the search to 2 leading characters, as the generated text can have a bos token and
                # a space added before the input text
                if 0 <= index_input_text <= 2:
                    # If the input text is found, we remove it
                    new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :])
                else:
                    new_generated_texts.append(text_generated)
            generated_texts = new_generated_texts

        if return_type == ReturnType.FULL_TEXT:
            full_texts = []
            for prompt_text, generated_text in zip(input_texts, generated_texts):
                if isinstance(prompt_text, str):
                    generated_text = prompt_text + generated_text
                elif isinstance(prompt_text, Chat):
                    if continue_final_message is None:
                        # If the user passes a chat ending in an assistant message, we treat it as a prefill by
                        # default because very few models support multiple separate, consecutive assistant messages
                        continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
                    if continue_final_message:
                        # With assistant prefill, concatenate the new text onto the end of the last message
                        new_text = dict(prompt_text.messages[-1]["content"][-1].items())
                        new_text["text"] += generated_text
                        generated_text = list(prompt_text.messages)[:-1] + [
                            {
                                "role": prompt_text.messages[-1]["role"],
                                "content": prompt_text.messages[-1]["content"][:-1] + [new_text],
                            }
                        ]
                    else:
                        generated_text = list(prompt_text.messages) + [
                            {"role": "assistant", "content": generated_text}
                        ]
                full_texts.append(generated_text)
            generated_texts = full_texts

        records = [
            {
                "input_text": input_text.messages if isinstance(input_text, Chat) else input_text,
                "generated_text": generated_text,
            }
            for input_text, generated_text in zip(input_texts, generated_texts)
        ]

        return records