
    sgd+                         d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZmZmZ dd	lmZmZ d
dlmZ  ej0                  e      Z G d de      Zy)zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
    N)ListOptionalUnion   )BatchFeature)
VideoInput)ProcessorMixin)
AddedTokenBatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypelogging   )AutoTokenizerc            $       :    e Zd ZdZg dZdgZdZdZdZd fd	Z		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
deeeee   ee   f   d	ed
eeeef   deeeef   dee   dedee   dee   dededededededeeeef      def"dZd Zd Zed        Z fdZe fd       Z xZS ) InstructBlipVideoProcessora  
    Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single
    processor.

    [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoImageProcessor`] and [`AutoTokenizer`]. See the
    docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information.

    Args:
        image_processor (`InstructBlipVideoImageProcessor`):
            An instance of [`InstructBlipVideoImageProcessor`]. The image processor is a required input.
        tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
    )image_processor	tokenizerqformer_tokenizernum_query_tokensInstructBlipVideoImageProcessorr   c                     t        |d      s2t        ddd      | _        |j                  | j                  gd       n|j                  | _        || _        t
        |   |||       y )Nvideo_tokenz<video>FT)
normalizedspecial)special_tokens)hasattrr
   r   
add_tokensr   super__init__)selfr   r   r   r   kwargs	__class__s         u/var/www/html/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr#   z#InstructBlipVideoProcessor.__init__A   sb    y-0))tTD  $"2"2!3D I(44D 0)5FG    imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                 t   ||t        d      t               }|ot        |t              r|g}n.t        |t              st        |d   t              st        d       | j
                  d||||||||	|
|||||dd|}| j                  ~||i }| j                  j                  | j                  z  dz  }| j                  |gt        |      z  dd      }|D ]-  }t        ||   ||         D cg c]
  \  }}||z    c}}||<   / n|}|t        j                  d	       t        ||
      }|j                  |        | j                  d||||||||	|
||||||d|}|j!                  d      |d<   |j!                  d      |d<   |$| j#                  ||      }|j                  |       |S c c}}w )a%  
        This method uses [`InstructBlipVideoImageProcessor.__call__`] method to prepare image(s) or video(s) for the model, and
        [`BertTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.
        Nz3You have to specify at least one of images or text.r   zAInvalid input text. Please provide a string, or a list of strings)r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8      F)r+   r8   aK  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.)tensor_type	input_idsqformer_input_idsattention_maskqformer_attention_mask)r8    )
ValueErrorr   
isinstancestrlistr   r   r   contentlenziploggerwarning_oncer   updater   popr   )r$   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r%   encoding_text_encodingtext_encodingvideo_tokensvideo_token_encodingkimg_encodingtxt_encodingqformer_text_encodingimage_encodings                               r'   __call__z#InstructBlipVideoProcessor.__call__J   sE   4 >dlRSS>$$vd+JtAw4L !dee+T^^ #5%%#5&;*C+E'=&;+#  !N* $$0V5G "$$,,t/D/DDqH  (,~~!NSY.5Y] (6 ($ ( A ;>>RST>UWefgWh:i(6L, %|3(M!$ !/%''B *-^TMOOM*$:D$:$: %#5%%#5&;*C+E'=&;+-%  !%!$ -B,E,Ek,RH()1F1J1JK[1\H-.!11&1XNOON+S(s   2F4c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r   batch_decoder$   argsr%   s      r'   rY   z'InstructBlipVideoProcessor.batch_decode   s     
 +t~~**D;F;;r(   c                 :     | j                   j                  |i |S )z
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoderZ   s      r'   r]   z!InstructBlipVideoProcessor.decode   s     
 %t~~$$d5f55r(   c                     | j                   j                  }| j                  j                  }t        t        j                  ||z               S N)r   model_input_namesr   rE   dictfromkeys)r$   tokenizer_input_namesimage_processor_input_namess      r'   r`   z,InstructBlipVideoProcessor.model_input_names   sA     !% @ @&*&:&:&L&L#DMM"7:U"UVWWr(   c                    t         j                  j                  |      rt        d| d      t        j                  |d       t         j                  j                  |d      }| j                  j                  |       d| j                  v }|r| j                  j                  d       t        |   |fi |}|r| xj                  dgz  c_        |S )NzProvided path (z#) should be a directory, not a fileT)exist_okr   )ospathisfilerB   makedirsjoinr   save_pretrained
attributesremover"   )r$   save_directoryr%   qformer_tokenizer_pathqformer_presentoutputsr&   s         r'   rl   z*InstructBlipVideoProcessor.save_pretrained   s    77>>.)~.>>abcc
NT2!#n>Q!R../EF .@OO""#67').CFCOO 344Or(   c                     t        |   |fi |}t        |t              r|d   }t	        j                  |d      }||_        |S )Nr   r   )	subfolder)r"   from_pretrainedrC   tupler   r   )clspretrained_model_name_or_pathr%   	processorr   r&   s        r'   ru   z*InstructBlipVideoProcessor.from_pretrained   sP    G+,ITVT	 i'!!I)99:Wcvw&7	#r(   r_   )NNTFNNr   NNFFFFFTN) __name__
__module____qualname____doc__rm   valid_kwargsimage_processor_classtokenizer_classqformer_tokenizer_classr#   r   r   r   r   r   boolrD   r   r   r   intr   r   rW   rY   r]   propertyr`   rl   classmethodru   __classcell__)r&   s   @r'   r   r   (   s   $ GJ&'L=%O-H "^b#'5:;?$(,004*/+0',&+#;?#ll I0$y/4HYCZZ[l !	l
 tS/12l $%778l SMl l %SMl  (~l $(l %)l !%l  $l l  !l" !sJ!78#l& 
'l^<6 X X&  r(   r   )r}   rg   typingr   r   r   image_processing_utilsr   image_utilsr   processing_utilsr	   tokenization_utils_baser
   r   r   r   r   r   utilsr   r   autor   
get_loggerrz   rI   r   rA   r(   r'   <module>r      sR    
 ( ( 2 % .  )   
		H	%D Dr(   