
"""TensorFlow BLIP model."""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import tensorflow as tf

from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    get_initializer,
    get_tf_activation,
    keras,
    keras_serializable,
    shape_list,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, stable_softmax
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"


def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
    return tf.math.reduce_mean(
        keras.metrics.sparse_categorical_crossentropy(
            y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
        )
    )


def blip_loss(similarity: tf.Tensor) -> tf.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(tf.transpose(similarity))
    return (caption_loss + image_loss) / 2.0
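
# Illustrative sketch (not exercised anywhere in this module): `blip_loss` applies the
# contrastive cross-entropy along both axes of a square image-text similarity matrix and
# averages the two directions, so a matrix with a dominant diagonal yields a small loss:
#
#     similarity = tf.constant([[5.0, 0.0], [0.0, 5.0]])
#     loss = blip_loss(similarity)  # caption->image and image->caption losses, averaged
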
ed<   dZ
d
ed<   ed        Zy))TFBlipForConditionalGenerationModelOutputa  
    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
    last hidden states. This class also adds the loss term from the text decoder.

    Args:
        loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
            Language modeling loss from the text decoder.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
            Prediction scores of the language modeling head of the text decoder model.
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*):
            The image embeddings obtained after applying the Vision Transformer model to the input image.
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.`
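
    Example (illustrative; `model`, `processor` and `inputs` as in the [`TFBlipForConditionalGeneration`] docstring):

    ```python
    >>> outputs = model(**inputs, labels=inputs.input_ids)
    >>> outputs.loss  # language-modeling loss of the text decoder
    >>> outputs.logits  # per-token vocabulary scores
    ```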
    NTuple[tf.Tensor] | Nonelossr+   tf.Tensor | Noneimage_embeds	tf.Tensorlast_hidden_stateTuple[tf.Tensor, ...] | Nonehidden_states
attentionsc                N    t        j                  dt               | j                  S )Nz`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers. Please use the `logits` attribute to retrieve the final output instead.)warningswarnFutureWarningr+   selfs    r,   decoder_logitsz8TFBlipForConditionalGenerationModelOutput.decoder_logitsg   s#    W	

 {{r.   )__name__
__module____qualname____doc__r8   __annotations__r+   r:   r<   r>   r?   propertyrF    r.   r,   r6   r6   D   s]    4 %)D
!(&*F#*%)L")#'y'26M/6/3J,3 r.   r6   c                  X    e Zd ZU dZdZded<   dZded<   dZded<   dZded	<   dZ	ded
<   y)TFBlipTextVisionModelOutputa  
    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
    last hidden states. This class also adds the loss term from the text decoder.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: tf.Tensor | None = None
    image_embeds: tf.Tensor | None = None
    last_hidden_state: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None


@dataclass
class TFBlipImageTextMatchingModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
    last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
    scores.

    Args:
        itm_score (`tf.Tensor`):
            The image-text similarity scores.
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*):
            Last layer hidden-state of the vision of the vision-only branch of the model.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        question_embeds (`tf.Tensor`):
            The question embeddings obtained by the text projection layer.
    """

    itm_score: tf.Tensor | None = None
    loss: tf.Tensor | None = None
    image_embeds: tf.Tensor | None = None
    last_hidden_state: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    vision_pooler_output: tf.Tensor | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None
    question_embeds: Tuple[tf.Tensor] | None = None


@dataclass
class TFBlipOutput(ModelOutput):
    """
    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
        text_model_output(`BaseModelOutputWithPooling`):
            The output of the [`BlipTextModel`].
        vision_model_output(`BaseModelOutputWithPooling`):
            The output of the [`BlipVisionModel`].
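
    Example (illustrative; `outputs` as returned by the [`TFBlipModel`] docstring example):

    ```python
    >>> probs = tf.nn.softmax(outputs.logits_per_image, axis=1)  # image-to-text matching probabilities
    ```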
    """

    loss: tf.Tensor | None = None
    logits_per_image: tf.Tensor = None
    logits_per_text: tf.Tensor = None
    text_embeds: tf.Tensor | None = None
    image_embeds: tf.Tensor | None = None
    text_model_output: TFBaseModelOutputWithPooling = None
    vision_model_output: TFBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class TFBlipVisionEmbeddings(keras.layers.Layer):
    def __init__(self, config: BlipVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = keras.layers.Conv2D(
            filters=self.embed_dim,
            kernel_size=self.patch_size,
            strides=self.patch_size,
            kernel_initializer=get_initializer(self.config.initializer_range),
            data_format="channels_last",
            name="patch_embedding",
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

    def build(self, input_shape=None):
        self.class_embedding = self.add_weight(
            shape=(1, 1, self.embed_dim),
            initializer=get_initializer(self.config.initializer_range),
            trainable=True,
            name="class_embedding",
        )

        self.position_embedding = self.add_weight(
            shape=(1, self.num_positions, self.embed_dim),
            initializer=get_initializer(self.config.initializer_range),
            trainable=True,
            name="position_embedding",
        )

        if self.built:
            return
        self.built = True
        if getattr(self, "patch_embedding", None) is not None:
            with tf.name_scope(self.patch_embedding.name):
                self.patch_embedding.build([None, None, None, 3])

    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        # Input is channels-first (PyTorch layout); transpose to channels-last for the Conv2D patch embedding.
        batch_size = tf.shape(pixel_values)[0]
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
        patch_embeds = self.patch_embedding(pixel_values)
        patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1))

        class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim))
        embeddings = tf.concat([class_embeds, patch_embeds], axis=1)
        embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :]
        return embeddings


class TFBlipTextEmbeddings(keras.layers.Layer):
    def __init__(self, config: BlipTextConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.hidden_size

        self.config = config

    def build(self, input_shape: tf.TensorShape = None):
        with tf.name_scope("token_embedding"):
            self.weight = self.add_weight(
                shape=(self.config.vocab_size, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="weight",
            )

        with tf.name_scope("position_embedding"):
            self.position_embedding = self.add_weight(
                shape=(self.config.max_position_embeddings, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="embeddings",
            )

        super().build(input_shape)

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
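
        Example (illustrative sketch, assuming a default `BlipTextConfig`):

        ```python
        >>> layer = TFBlipTextEmbeddings(BlipTextConfig())
        >>> embeddings = layer(input_ids=tf.constant([[101, 2023, 102]]))  # shape (1, 3, hidden_size)
        ```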
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
        final_embeddings = inputs_embeds + position_embeds

        return final_embeddings


class TFBlipAttention(keras.layers.Layer):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")

        self.qkv = keras.layers.Dense(
            3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
        )

        self.projection = keras.layers.Dense(
            self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
        )

    def call(
        self,
        hidden_states: tf.Tensor,
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: Optional[bool] = None,
    ) -> Tuple[tf.Tensor, tf.Tensor | None, Tuple[tf.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = shape_list(hidden_states)

        # Project to query/key/value in a single dense layer, then split heads.
        mixed_qkv = self.qkv(hidden_states)
        mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim))
        mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4))

        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2))

        attention_scores = attention_scores * self.scale

        # Normalize the attention scores to probabilities.
        attention_probs = stable_softmax(attention_scores, axis=-1)

        attention_probs = self.dropout(attention_probs, training=training)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3))

        new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim]
        context_layer = tf.reshape(context_layer, new_context_layer_shape)

        output = self.projection(context_layer)

        outputs = (output, attention_probs) if output_attentions else (output, None)

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
                self.dropout.build(None)
        if getattr(self, "qkv", None) is not None:
            with tf.name_scope(self.qkv.name):
                self.qkv.build([None, None, self.embed_dim])
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, self.embed_dim])


class TFBlipMLP(keras.layers.Layer):
    def __init__(self, config: BlipConfig, **kwargs):
        super().__init__(**kwargs)

        self.activation_fn = get_tf_activation(config.hidden_act)

        in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
        fc_std = (2 * config.hidden_size) ** -0.5

        self.fc1 = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
        )
        self.fc2 = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.fc1(inputs=hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(inputs=hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.config.hidden_size])
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.intermediate_size])


class TFBlipEncoderLayer(keras.layers.Layer):
    def __init__(self, config: BlipConfig, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = config.hidden_size
        self.self_attn = TFBlipAttention(config, name="self_attn")
        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
        self.mlp = TFBlipMLP(config, name="mlp")
        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        output_attentions: Optional[bool] = False,
        training: Optional[bool] = None,
    ) -> Tuple[tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            output_attentions=output_attentions,
            training=training,
        )
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)
        if getattr(self, "layer_norm1", None) is not None:
            with tf.name_scope(self.layer_norm1.name):
                self.layer_norm1.build([None, None, self.embed_dim])
        if getattr(self, "mlp", None) is not None:
            with tf.name_scope(self.mlp.name):
                self.mlp.build(None)
        if getattr(self, "layer_norm2", None) is not None:
            with tf.name_scope(self.layer_norm2.name):
                self.layer_norm2.build([None, None, self.embed_dim])


class TFBlipPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BlipConfig
    base_model_prefix = "blip"
    _keys_to_ignore_on_load_missing = [r"position_ids"]


BLIP_START_DOCSTRING = r"""
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BlipConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
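
    Example (illustrative):

    ```python
    >>> from transformers import BlipConfig, TFBlipModel

    >>> model = TFBlipModel(BlipConfig())  # randomly initialized; use `from_pretrained(...)` for trained weights
    ```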
"""

BLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

BLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@keras_serializable
class TFBlipEncoder(keras.layers.Layer):
    config_class = BlipConfig

    def __init__(self, config: BlipConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]

    @unpack_inputs
    def call(
        self,
        inputs_embeds,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
                training=training,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)


class TFBlipVisionModel(TFBlipPreTrainedModel):
    main_input_name = "pixel_values"
    config_class = BlipVisionConfig

    def __init__(self, config: BlipVisionConfig, *args, **kwargs):
        super().__init__(config, *args, **kwargs)
        self.config = config

        self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
        self.encoder = TFBlipEncoder(config, name="encoder")
        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
        self.embed_dim = config.hidden_size

    def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

        return TFBaseModelOutputWithPooling(
            last_hidden_state=output.last_hidden_state,
            pooler_output=output.pooler_output,
            hidden_states=hs,
            attentions=attns,
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig)
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        # TF gets confused if we call the layer with inputs of different ranks, so insert a singleton dimension
        pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1))
        pooled_output = tf.squeeze(pooled_output, 1)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.embeddings

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "post_layernorm", None) is not None:
            with tf.name_scope(self.post_layernorm.name):
                self.post_layernorm.build([None, None, self.embed_dim])


@keras_serializable
class TFBlipMainLayer(keras.layers.Layer):
    config_class = BlipConfig

    def __init__(self, config: BlipConfig, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if not isinstance(config.text_config, BlipTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type BlipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, BlipVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type BlipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = TFBlipTextModel(text_config, name="text_model")
        self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")

        self.visual_projection = keras.layers.Dense(
            self.projection_dim,
            use_bias=False,
            kernel_initializer=get_initializer(config.initializer_range),
            name="visual_projection",
        )
        self.text_projection = keras.layers.Dense(
            self.projection_dim,
            use_bias=False,
            kernel_initializer=get_initializer(config.initializer_range),
            name="text_projection",
        )

        self.config = config

    def build(self, input_shape=None):
        self.logit_scale = self.add_weight(
            name="logit_scale",
            shape=[],
            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
            trainable=True,
        )

        if self.built:
            return
        self.built = True
        if getattr(self, "text_model", None) is not None:
            with tf.name_scope(self.text_model.name):
                self.text_model.build(None)
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        if getattr(self, "visual_projection", None) is not None:
            with tf.name_scope(self.visual_projection.name):
                self.visual_projection.build([None, None, self.vision_embed_dim])
        if getattr(self, "text_projection", None) is not None:
            with tf.name_scope(self.text_projection.name):
                self.text_projection.build([None, None, self.text_embed_dim])

    @unpack_inputs
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        pixel_values: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBlipOutput]:
        # Use BLIP's own config for these flags (if set) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True)
        text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True)

        # cosine similarity as logits
        logit_scale = tf.exp(self.logit_scale)
        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
        logits_per_image = tf.transpose(logits_per_text)

        loss = None
        if return_loss:
            loss = blip_loss(logits_per_text)
            loss = tf.reshape(loss, (1,))

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return TFBlipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class TFBlipModel(TFBlipPreTrainedModel):
    config_class = BlipConfig
    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
    main_input_name = "input_ids"

    def __init__(self, config: BlipConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.blip = TFBlipMainLayer(config, name="blip")

    def serving_output(self, output: TFBlipOutput) -> TFBlipOutput:
        return TFBlipOutput(
            logits_per_image=output.logits_per_image,
            logits_per_text=output.logits_per_text,
            text_embeds=output.text_embeds,
            image_embeds=output.image_embeds,
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig)
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        pixel_values: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBlipOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipModel

        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = tf.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
        ```"""
        outputs = self.blip(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_loss=return_loss,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
    ) -> tf.Tensor:
        r"""
        Returns:
            text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
            the projection layer to the pooled output of [`TFBlipTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, TFBlipModel

        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.blip.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.blip.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
    ) -> tf.Tensor:
        r"""
        Returns:
            image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`TFBlipVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipModel

        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict)

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.blip.visual_projection(pooled_output)

        return image_features

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "blip", None) is not None:
            with tf.name_scope(self.blip.name):
                self.blip.build(None)


@add_start_docstrings(
    """
    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally
    pass `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt;
    otherwise, the decoder starts generating the caption from the [BOS] (beginning-of-sequence) token only.
    """,
    BLIP_START_DOCSTRING,
)
class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
    config_class = BlipConfig
    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
    main_input_name = "pixel_values"

    def __init__(self, config: BlipConfig, *args, **kwargs):
        super().__init__(config, *args, **kwargs)

        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")

        self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")

        self.decoder_input_ids = config.text_config.bos_token_id
        self.decoder_pad_token_id = config.text_config.pad_token_id

    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.vision_model.embeddings.patch_embedding

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig)
    def call(
        self,
        pixel_values: tf.Tensor,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBlipForConditionalGenerationModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A picture of"

        >>> inputs = processor(images=image, text=text, return_tensors="tf")

        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        image_embeds = vision_outputs[0]

        outputs = self.text_decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            labels=labels,
            return_dict=False,
            training=training,
        )

        if not return_dict:
            outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        if labels is not None:
            loss = outputs[0]
            logits = outputs[1]
        else:
            loss = None
            logits = outputs[0]

        if loss is not None and loss.shape.rank == 0:
            loss = tf.reshape(loss, (1,))

        return TFBlipForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )

    def generate(
        self,
        pixel_values: tf.Tensor,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        **generate_kwargs,
    ) -> tf.Tensor:
        r"""
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
                Input image to be processed
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration

        >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        two cats sleeping on a couch
        ```
        r   r   Nr   dtyper   r   )r   eos_token_idrm  r   rs  encoder_attention_maskrM   )r   r8  r%   onesr   int32r<  listr$  rl  rt   r=  r}  r   r   rk  rj  generatesep_token_idrm  )
rE   r   r   r   generate_kwargsr   rT  r:   image_attention_maskr   s
             r,   r  z'TFBlipForConditionalGeneration.generate  s   N "''*
***E%a(!wwz,'?'DBHHUi&,,YbhhGI,,(($++*A*A*N*NOPXZX`X`I 	J?;I IIWWj!_BHH58O8O8\8\\^ghiklkmhm^novw
	 4B3M3B3/SW,$##,, 
3B3'00==00==)".#7
 
 r.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr8  rj  )r   r^   r%   r   r8  rp   r   rj  r   s     r,   r   z$TFBlipForConditionalGeneration.build  s    ::
4.:t00556 .!!''-.4.:t00556 .!!''-. . ;. .. .s   C%CCC r   re   zkeras.layers.Layer)NNNNNNN)r   r;   r   r9   r   r9   r   r   r  r   rt  r9   r  r   r   r   re   z7Union[Tuple, TFBlipForConditionalGenerationModelOutput]re  )r   r;   r   r9   r   r9   re   r;   r   )rG   rH   rI   r   r  r  r1  rs   r/  r   r   r2  r   r6   r   r  r   r   r   s   @r,   rh  rh  <  s    L'S&T#$OD< *+GH+Tcmn '++/,0/3#'&*#'J
J
 $J
 )	J

 *J
 -J
 !J
 $J
 !J
 
AJ
 o I J
^ '++/	GG $G )	G 
GR	.r.   rh  aS  
    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.
class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
    config_class = BlipConfig
    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]

    def __init__(self, config: BlipConfig, *args, **kwargs):
        super().__init__(config, *args, **kwargs)

        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
        self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
        self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")

        self.decoder_pad_token_id = config.text_config.pad_token_id
        self.decoder_start_token_id = config.text_config.bos_token_id

    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.vision_model.embeddings.patch_embedding

    # Shift the labels one position to the right and prepend the decoder start token
    def _shift_right(self, input_ids):
        decoder_start_token_id = self.decoder_start_token_id
        pad_token_id = self.decoder_pad_token_id

        if decoder_start_token_id is None or pad_token_id is None:
            raise ValueError("decoder_start_token_id and pad_token_id must be defined!")

        start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
        start_tokens = tf.cast(start_tokens, input_ids.dtype)  # Ensure compatible dtypes for concatenation
        shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)

        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids = tf.where(
            shifted_input_ids == -100,
            tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype),
            shifted_input_ids,
        )

        # Verify that `labels` has only positive values and -100
        tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype))

        return shifted_input_ids

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig)
    def call(
        self,
        input_ids: tf.Tensor,
        pixel_values: tf.Tensor | None = None,
        decoder_input_ids: tf.Tensor | None = None,
        decoder_attention_mask: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        labels: tf.Tensor | None = None,
        return_dict: bool | None = None,
        training: bool | None = None,
    ) -> Union[Tuple, TFBlipTextVisionModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering

        >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # training
        >>> text = "How many cats are in the picture?"
        >>> label = "2"
        >>> inputs = processor(images=image, text=text, return_tensors="tf")
        >>> labels = processor(text=label, return_tensors="tf").input_ids

        >>> inputs["labels"] = labels
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss

        >>> # inference
        >>> text = "How many cats are in the picture?"
        >>> inputs = processor(images=image, text=text, return_tensors="tf")
        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```"""
        if labels is None and decoder_input_ids is None:
            raise ValueError(
                "Either `decoder_input_ids` or `labels` should be passed when calling"
                " `TFBlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you"
                " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`"
            )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        image_embeds = vision_outputs[0]
        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64)

        question_embeds = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=return_dict,
            training=training,
        )

        question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

        if labels is not None and decoder_input_ids is None:
            # When only `labels` are given, the decoder is trained directly on them
            decoder_input_ids = labels

        answer_output = self.text_decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=attention_mask,
            labels=labels,
            return_dict=return_dict,
            training=training,
        )

        if labels is not None:
            decoder_loss = tf.reduce_mean(answer_output.loss) if return_dict else tf.reduce_mean(answer_output[0])
        else:
            decoder_loss = None

        if not return_dict:
            outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return TFBlipTextVisionModelOutput(
            loss=decoder_loss,
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )

    def generate(
        self,
        input_ids: tf.Tensor,
        pixel_values: tf.Tensor,
        attention_mask: tf.Tensor | None = None,
        **generate_kwargs,
    ) -> tf.Tensor:
        r"""
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`):
                Input image to be processed.
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
                tokens that are NOT MASKED, `0` for MASKED tokens.
            generate_kwargs (dict, *optional*):
                Additional arguments passed to the `generate` function of the decoder


        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering

        >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are in the picture?"

        >>> inputs = processor(images=image, text=text, return_tensors="tf")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```
        """
        vision_outputs = self.vision_model(pixel_values=pixel_values)

        image_embeds = vision_outputs[0]

        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64)

        if isinstance(input_ids, list):
            input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)

        question_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=False,
        )

        question_embeds = question_outputs[0]

        question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32)

        bos_ids = tf.fill(
            (tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype)
        )

        outputs = self.text_decoder.generate(
            input_ids=bos_ids,
            eos_token_id=self.config.text_config.sep_token_id,
            pad_token_id=self.config.text_config.pad_token_id,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=question_attention_mask,
            **generate_kwargs,
        )

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        if getattr(self, "text_encoder", None) is not None:
            with tf.name_scope(self.text_encoder.name):
                self.text_encoder.build(None)
        if getattr(self, "text_decoder", None) is not None:
            with tf.name_scope(self.text_decoder.name):
                self.text_decoder.build(None)

@add_start_docstrings(
    """
    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
    the image.
    """,
    BLIP_START_DOCSTRING,
)
class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
    config_class = BlipConfig

    def __init__(self, config: BlipConfig, *args, **kwargs):
        super().__init__(config, *args, **kwargs)

        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
        self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)

        # vision projection layer
        self.vision_proj = keras.layers.Dense(
            config.image_text_hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="vision_proj",
        )

        # text projection layer
        self.text_proj = keras.layers.Dense(
            config.image_text_hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="text_proj",
        )

        # image-text matching head
        self.itm_head = keras.layers.Dense(
            2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
        )

        self.decoder_pad_token_id = (
            config.text_config.pad_token_id if not hasattr(config, "pad_token_id") else config.pad_token_id
        )
        self.decoder_start_token_id = (
            config.text_config.bos_token_id
            if not hasattr(config, "decoder_start_token_id")
            else config.decoder_start_token_id
        )

    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.vision_model.embeddings.patch_embedding

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig)
    def call(
        self,
        input_ids: tf.Tensor,
        pixel_values: tf.Tensor | None = None,
        use_itm_head: bool | None = True,
        attention_mask: tf.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        training: bool | None = None,
    ) -> Union[Tuple, TFBlipImageTextMatchingModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForImageTextRetrieval

        >>> model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "an image of a cat"

        >>> inputs = processor(images=image, text=text, return_tensors="tf")
        >>> outputs = model(**inputs)
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        image_embeds = vision_outputs[0]
        image_atts = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64)

        # Image-text matching branch: the text encoder cross-attends to the image features
        itm_question_embeds = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=return_dict,
            training=training,
        )
        itm_question_embeds = itm_question_embeds[0] if not return_dict else itm_question_embeds.last_hidden_state

        itm_output = self.itm_head(itm_question_embeds[:, 0, :])

        # Contrastive branch: unimodal text encoding compared with the projected image features
        no_itm_question_embeds = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=return_dict,
            training=training,
        )
        no_itm_question_embeds = (
            no_itm_question_embeds[0] if not return_dict else no_itm_question_embeds.last_hidden_state
        )

        image_feat, _ = tf.linalg.normalize(self.vision_proj(image_embeds[:, 0, :]), ord=2, axis=-1)
        text_feat, _ = tf.linalg.normalize(self.text_proj(no_itm_question_embeds[:, 0, :]), ord=2, axis=-1)

        no_itm_output = tf.matmul(image_feat, text_feat, transpose_b=True)

        if use_itm_head:
            output = itm_output
            question_embeds = itm_question_embeds
        else:
            output = no_itm_output
            question_embeds = no_itm_question_embeds

        if not return_dict:
            outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,)
            return tuple(output for output in outputs if output is not None)

        return TFBlipImageTextMatchingModelOutput(
            itm_score=output,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
            question_embeds=question_embeds,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        if getattr(self, "text_encoder", None) is not None:
            with tf.name_scope(self.text_encoder.name):
                self.text_encoder.build(None)
        if getattr(self, "vision_proj", None) is not None:
            with tf.name_scope(self.vision_proj.name):
                self.vision_proj.build([None, None, self.config.vision_config.hidden_size])
        if getattr(self, "text_proj", None) is not None:
            with tf.name_scope(self.text_proj.name):
                self.text_proj.build([None, None, self.config.text_config.hidden_size])
        if getattr(self, "itm_head", None) is not None:
            with tf.name_scope(self.itm_head.name):
                self.itm_head.build([None, None, self.config.text_config.hidden_size])
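
# A minimal sketch contrasting the two scoring modes exposed by `use_itm_head`: the two-way
# image-text matching classifier head versus the raw similarity of the projected image and text
# features. It assumes the "Salesforce/blip-itm-base-coco" checkpoint from the docstring example
# above; the helper name is illustrative only and not part of the library API.
def _example_itm_scores(image, text):
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
    model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
    inputs = processor(images=image, text=text, return_tensors="tf")

    # Probability that the text matches the image, taken from the ITM classification head.
    itm_logits = model(**inputs, use_itm_head=True)[0]
    match_probability = tf.nn.softmax(itm_logits, axis=-1)[:, 1]

    # Unnormalized image-text similarity from the projection layers (cheaper, useful for ranking).
    similarity = model(**inputs, use_itm_head=False)[0]
    return match_probability, similarity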