
"""TF 2.0 CLIP model."""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import tensorflow as tf

from ...activations_tf import get_tf_activation
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"

LARGE_NEGATIVE = -1e8


def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    src_len = shape_list(mask)[1]
    tgt_len = tgt_len if tgt_len is not None else src_len
    one_cst = tf.constant(1.0)
    mask = tf.cast(mask, dtype=one_cst.dtype)
    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))

    return (one_cst - expanded_mask) * LARGE_NEGATIVE


def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
    return tf.math.reduce_mean(
        keras.metrics.sparse_categorical_crossentropy(
            y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
        )
    )


def clip_loss(similarity: tf.Tensor) -> tf.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(tf.transpose(similarity))
    return (caption_loss + image_loss) / 2.0

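# Illustrative note (not from the original module): `clip_loss` above averages two
# cross-entropy terms over the same similarity matrix -- once per row (caption -> image)
# and once per column (image -> caption). For a batch of N matched pairs the target for
# row/column i is simply i, i.e. `tf.range(N)`. A toy check, with a hypothetical 2x2
# similarity matrix whose diagonal already dominates:
#
#     sim = tf.constant([[10.0, 0.1], [0.2, 8.0]])
#     loss = clip_loss(sim)  # small scalar; permuting the rows would make it much larger
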
@dataclass
class TFCLIPOutput(ModelOutput):
    """
    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`TFCLIPTextModel`].
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`TFCLIPVisionModel`].
        text_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
            The output of the [`TFCLIPTextModel`].
        vision_model_output([`~modeling_tf_utils.TFBaseModelOutputWithPooling`]):
            The output of the [`TFCLIPVisionModel`].
    """

    loss: tf.Tensor | None = None
    logits_per_image: tf.Tensor = None
    logits_per_text: tf.Tensor = None
    text_embeds: tf.Tensor = None
    image_embeds: tf.Tensor = None
    text_model_output: TFBaseModelOutputWithPooling = None
    vision_model_output: TFBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class TFCLIPVisionEmbeddings(keras.layers.Layer):
    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.config = config

        self.patch_embedding = keras.layers.Conv2D(
            filters=self.embed_dim,
            kernel_size=self.patch_size,
            strides=self.patch_size,
            padding="valid",
            data_format="channels_last",
            use_bias=False,
            kernel_initializer=get_initializer(self.config.initializer_range * self.config.initializer_factor),
            name="patch_embedding",
        )

    def build(self, input_shape: tf.TensorShape = None):
        factor = self.config.initializer_factor

        self.class_embedding = self.add_weight(
            shape=(self.embed_dim,),
            initializer=get_initializer(self.embed_dim**-0.5 * factor),
            trainable=True,
            name="class_embedding",
        )

        with tf.name_scope("position_embedding"):
            self.position_embedding = self.add_weight(
                shape=(self.num_positions, self.embed_dim),
                initializer=get_initializer(self.config.initializer_range * factor),
                trainable=True,
                name="embeddings",
            )

        if self.built:
            return
        self.built = True
        if getattr(self, "patch_embedding", None) is not None:
            with tf.name_scope(self.patch_embedding.name):
                self.patch_embedding.build([None, None, None, self.config.num_channels])

    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        """`pixel_values` is expected to be of NCHW format."""

        batch_size, num_channels, height, width = shape_list(pixel_values)

        # `tf.nn.conv2d` on CPU does not support `NCHW`, so move the channel dimension last (NHWC).
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))

        patch_embeds = self.patch_embedding(pixel_values)

        # Collapse the 2D spatial grid into a single sequence dimension.
        patch_embeds = tf.reshape(tensor=patch_embeds, shape=(batch_size, self.num_patches, -1))

        # Prepend the [CLS] token to the embedded patch tokens.
        class_embeds = tf.broadcast_to(self.class_embedding, shape=(batch_size, 1, self.embed_dim))
        embeddings = tf.concat((class_embeds, patch_embeds), axis=1)

        embeddings = embeddings + self.position_embedding

        return embeddings


class TFCLIPTextEmbeddings(keras.layers.Layer):
    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.config = config

    def build(self, input_shape: tf.TensorShape = None):
        with tf.name_scope("token_embedding"):
            self.weight = self.add_weight(
                shape=(self.config.vocab_size, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="weight",
            )

        with tf.name_scope("position_embedding"):
            self.position_embedding = self.add_weight(
                shape=(self.config.max_position_embeddings, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="embeddings",
            )

        super().build(input_shape)

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)

        input_shape = shape_list(inputs_embeds)[:-1]

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)

        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
        final_embeddings = inputs_embeds + position_embeds

        return final_embeddings


class TFCLIPAttention(keras.layers.Layer):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = self.embed_dim // self.num_attention_heads
        if self.attention_head_size * self.num_attention_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_attention_heads})."
            )

        factor = config.initializer_factor
        in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
        out_proj_std = (self.embed_dim**-0.5) * factor

        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        self.q_proj = keras.layers.Dense(
            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
        )
        self.k_proj = keras.layers.Dense(
            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
        )
        self.v_proj = keras.layers.Dense(
            units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
        )

        self.dropout = keras.layers.Dropout(rate=config.attention_dropout)

        self.out_proj = keras.layers.Dense(
            units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
        )

    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # [batch_size, seq_length, all_head_size] -> [batch_size, num_attention_heads, seq_length, attention_head_size]
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """Input shape: Batch x Time x Channel"""

        batch_size = shape_list(hidden_states)[0]
        mixed_query_layer = self.q_proj(inputs=hidden_states)
        mixed_key_layer = self.k_proj(inputs=hidden_states)
        mixed_value_layer = self.v_proj(inputs=hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
        key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
        value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)

        # Dot product between "query" and "key" gives the raw attention scores,
        # shape (batch_size, num_heads, seq_len_q, seq_len_k), scaled by sqrt(head_size).
        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
        attention_scores = tf.divide(attention_scores, dk)

        # Apply the (additive) causal attention mask first, then the padding mask.
        if causal_attention_mask is not None:
            attention_scores = tf.add(attention_scores, causal_attention_mask)

        if attention_mask is not None:
            attention_scores = tf.add(attention_scores, attention_mask)

        # Normalize the attention scores to probabilities.
        _attention_probs = stable_softmax(logits=attention_scores, axis=-1)

        # Dropout on the attention probabilities, as in the original Transformer.
        attention_probs = self.dropout(inputs=_attention_probs, training=training)

        attention_output = tf.matmul(attention_probs, value_layer)
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # (batch_size, seq_len_q, embed_dim)
        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))

        attention_output = self.out_proj(attention_output, training=training)
        # The attention weights are returned before dropout.
        outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "q_proj", None) is not None:
            with tf.name_scope(self.q_proj.name):
                self.q_proj.build([None, None, self.embed_dim])
        if getattr(self, "k_proj", None) is not None:
            with tf.name_scope(self.k_proj.name):
                self.k_proj.build([None, None, self.embed_dim])
        if getattr(self, "v_proj", None) is not None:
            with tf.name_scope(self.v_proj.name):
                self.v_proj.build([None, None, self.embed_dim])
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.embed_dim])


class TFCLIPMLP(keras.layers.Layer):
    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.activation_fn = get_tf_activation(config.hidden_act)

        factor = config.initializer_factor
        in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
        fc_std = (2 * config.hidden_size) ** -0.5 * factor

        self.fc1 = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
        )
        self.fc2 = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.fc1(inputs=hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(inputs=hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.config.hidden_size])
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.intermediate_size])

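# The encoder block below follows the pre-LayerNorm residual pattern; roughly (sketch only,
# with `attn`, `mlp`, `ln1`, `ln2` standing in for the sub-layers defined in the class):
#
#     x = x + attn(ln1(x))
#     x = x + mlp(ln2(x))
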
class TFCLIPEncoderLayer(keras.layers.Layer):
    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size
        self.self_attn = TFCLIPAttention(config, name="self_attn")
        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
        self.mlp = TFCLIPMLP(config, name="mlp")
        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`tf.Tensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`):
                Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
                tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(inputs=hidden_states)
        attention_outputs = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            training=training,
        )
        hidden_states = attention_outputs[0]
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(inputs=hidden_states)
        hidden_states = self.mlp(hidden_states=hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,) + attention_outputs[1:]  # add attentions if we output them

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)
        if getattr(self, "layer_norm1", None) is not None:
            with tf.name_scope(self.layer_norm1.name):
                self.layer_norm1.build([None, None, self.embed_dim])
        if getattr(self, "mlp", None) is not None:
            with tf.name_scope(self.mlp.name):
                self.mlp.build(None)
        if getattr(self, "layer_norm2", None) is not None:
            with tf.name_scope(self.layer_norm2.name):
                self.layer_norm2.build([None, None, self.embed_dim])


class TFCLIPEncoder(keras.layers.Layer):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`TFCLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    """

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        self.layers = [TFCLIPEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        causal_attention_mask: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                causal_attention_mask=causal_attention_mask,
                output_attentions=output_attentions,
                training=training,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Add the last layer's hidden states.
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None)

        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)


class TFCLIPTextTransformer(keras.layers.Layer):
    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)

        self.embeddings = TFCLIPTextEmbeddings(config, name="embeddings")
        self.encoder = TFCLIPEncoder(config, name="encoder")
        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm")

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id
        self.embed_dim = config.hidden_size

    def call(
        self,
        input_ids: TFModelInputType,
        attention_mask: tf.Tensor,
        position_ids: tf.Tensor,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        input_shape = shape_list(input_ids)

        embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        batch_size, seq_length = input_shape
        # CLIP's text model uses a causal mask; prepare it here.
        causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)

        # Expand the padding mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        attention_mask = _expand_mask(attention_mask)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.final_layer_norm(inputs=sequence_output)

        if self.eos_token_id == 2:
            # Legacy behaviour kept for backward compatibility: pool the feature at the position
            # of the highest token id in each sequence (the end-of-sequence token).
            pooled_output = tf.gather_nd(
                params=sequence_output,
                indices=tf.stack(
                    values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
                ),
            )
        else:
            # The config may carry an updated `eos_token_id`, so locate the first EOS token explicitly.
            pooled_output = tf.gather_nd(
                params=sequence_output,
                indices=tf.stack(
                    values=(
                        tf.range(input_shape[0], dtype=tf.int64),
                        tf.math.argmax(tf.cast(input_ids == self.eos_token_id, dtype=tf.int8), axis=-1),
                    ),
                    axis=1,
                ),
            )

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
        # `seq_length` can be a runtime value, which `tf.constant` does not support,
        # so use `tf.fill` to build the masks.
        diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)

        # An additive 2D attention mask with all positions masked ...
        to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)

        # ... then unmask the diagonal and lower-triangular part (positions a query may attend to).
        to_mask = tf.linalg.band_part(to_mask, 0, -1)
        to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)

        return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "final_layer_norm", None) is not None:
            with tf.name_scope(self.final_layer_norm.name):
                self.final_layer_norm.build([None, None, self.embed_dim])


@keras_serializable
class TFCLIPTextMainLayer(keras.layers.Layer):
    config_class = CLIPTextConfig

    def __init__(self, config: CLIPTextConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.text_model = TFCLIPTextTransformer(config, name="text_model")

    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.text_model.embeddings

    def set_input_embeddings(self, value: tf.Variable):
        self.text_model.embeddings.weight = value
        self.text_model.embeddings.vocab_size = shape_list(value)[0]

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = shape_list(input_ids)

        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        text_model_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return text_model_outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "text_model", None) is not None:
            with tf.name_scope(self.text_model.name):
                self.text_model.build(None)


class TFCLIPVisionTransformer(keras.layers.Layer):
    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)

        self.embeddings = TFCLIPVisionEmbeddings(config, name="embeddings")
        self.pre_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="pre_layrnorm")
        self.encoder = TFCLIPEncoder(config, name="encoder")
        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
        self.embed_dim = config.hidden_size

    def call(
        self,
        pixel_values: TFModelInputType,
        output_attentions: bool,
        output_hidden_states: bool,
        return_dict: bool,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        embedding_output = self.embeddings(pixel_values=pixel_values)
        embedding_output = self.pre_layernorm(inputs=embedding_output)

        encoder_outputs = self.encoder(
            hidden_states=embedding_output,
            attention_mask=None,
            causal_attention_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]
        pooled_output = sequence_output[:, 0, :]
        pooled_output = self.post_layernorm(inputs=pooled_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return TFBaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        if getattr(self, "pre_layernorm", None) is not None:
            with tf.name_scope(self.pre_layernorm.name):
                self.pre_layernorm.build([None, None, self.embed_dim])
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        if getattr(self, "post_layernorm", None) is not None:
            with tf.name_scope(self.post_layernorm.name):
                self.post_layernorm.build([None, self.embed_dim])


@keras_serializable
class TFCLIPVisionMainLayer(keras.layers.Layer):
    config_class = CLIPVisionConfig

    def __init__(self, config: CLIPVisionConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.vision_model = TFCLIPVisionTransformer(config, name="vision_model")

    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.vision_model.embeddings

    @unpack_inputs
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        vision_model_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return vision_model_outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)


@keras_serializable
class TFCLIPMainLayer(keras.layers.Layer):
    config_class = CLIPConfig

    def __init__(self, config: CLIPConfig, **kwargs):
        super().__init__(**kwargs)

        if not isinstance(config.text_config, CLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        self.config = config

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim

        self.text_model = TFCLIPTextTransformer(text_config, name="text_model")
        self.vision_model = TFCLIPVisionTransformer(vision_config, name="vision_model")

        self.visual_projection = keras.layers.Dense(
            units=self.projection_dim,
            kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="visual_projection",
        )

        self.text_projection = keras.layers.Dense(
            units=self.projection_dim,
            kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor),
            use_bias=False,
            name="text_projection",
        )
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

    def build(self, input_shape: tf.TensorShape = None):
        self.logit_scale = self.add_weight(
            shape=(1,),
            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
            trainable=True,
            name="logit_scale",
        )

        if self.built:
            return
        self.built = True
        if getattr(self, "text_model", None) is not None:
            with tf.name_scope(self.text_model.name):
                self.text_model.build(None)
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        if getattr(self, "visual_projection", None) is not None:
            with tf.name_scope(self.visual_projection.name):
                self.visual_projection.build([None, None, self.vision_embed_dim])
        if getattr(self, "text_projection", None) is not None:
            with tf.name_scope(self.text_projection.name):
                self.text_projection.build([None, None, self.text_embed_dim])

    @unpack_inputs
    def get_text_features(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        if input_ids is None:
            raise ValueError("You have to specify either input_ids")

        input_shape = shape_list(input_ids)

        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(inputs=pooled_output)

        return text_features

    @unpack_inputs
    def get_image_features(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> tf.Tensor:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(inputs=pooled_output)

        return image_features

    @unpack_inputs
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        pixel_values: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]:
        if input_ids is None:
            raise ValueError("You have to specify either input_ids")
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        input_shape = shape_list(input_ids)

        if attention_mask is None:
            attention_mask = tf.fill(dims=input_shape, value=1)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(inputs=image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(inputs=text_embeds)

        # normalized features
        image_embeds = image_embeds / tf.norm(tensor=image_embeds, ord="euclidean", axis=-1, keepdims=True)
        text_embeds = text_embeds / tf.norm(tensor=text_embeds, ord="euclidean", axis=-1, keepdims=True)

        # cosine similarity as logits
        logit_scale = tf.math.exp(self.logit_scale)
        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
        logits_per_image = tf.transpose(logits_per_text)

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)
            loss = tf.reshape(loss, (1,))

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return (loss,) + output if loss is not None else output

        return TFCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )

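# The similarity produced by `TFCLIPMainLayer.call` above is a temperature-scaled cosine
# similarity. A rough standalone sketch of the same computation (the tensor names here are
# illustrative only, not part of the module):
#
#     img = img / tf.norm(img, axis=-1, keepdims=True)
#     txt = txt / tf.norm(txt, axis=-1, keepdims=True)
#     logits_per_text = tf.matmul(txt, img, transpose_b=True) * tf.math.exp(logit_scale)
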
class TFCLIPPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CLIPConfig
    base_model_prefix = "clip"
    _keys_to_ignore_on_load_missing = [r"position_ids"]
    _keys_to_ignore_on_load_unexpected = [r"position_ids"]


CLIP_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
a  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details. output_attentions (`bool`, *optional*): Whether or not to
            return the attentions tensors of all attention layers. See `attentions` under returned tensors for more
            detail. This argument can be used only in eager mode, in graph mode the value in the config will be used
            instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
al
  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
c                       e Zd ZeZd fdZe eej                  d             e
ee      	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFCLIPTextModelc                P    t        |   |g|i | t        |d      | _        y Nr  r  )rk   rl   rE  r  rP   rs   r   ry   rz   s       r,   rl   zTFCLIPTextModel.__init__  s(    3&3F3'V<	r.   batch_size, sequence_lengthoutput_typerY  c           	     6    | j                  |||||||      }|S )aO  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFCLIPTextModel

        >>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""

        outputs = self.clip(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        return outputs

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "clip", None) is not None:
            with tf.name_scope(self.clip.name):
                self.clip.build(None)


class TFCLIPVisionModel(TFCLIPPreTrainedModel):
    config_class = CLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPVisionConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.clip = TFCLIPVisionMainLayer(config, name="clip")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=CLIPVisionConfig)
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
        r"""

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFCLIPVisionModel

        >>> model = TFCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```ri  r  )rP   r   r   r  r  r   r   s          r,   r   zTFCLIPVisionModel.call  s.    D ))%/!5#  
 r.   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY wr  r  r   s     r,   r   zTFCLIPVisionModel.build  r  rS  r   rl  )r   rV  r   rX  r  rX  r  rX  r   rX  rU   rB  r   )rV   rW   rX   r   rY  main_input_namerl   r   r   CLIP_VISION_INPUTS_DOCSTRINGr   r   r   r   r   r   s   @r,   r  r    s    #L$O?
 *+GH+GVfg 15,0/3&*#('-' *' -	'
 $' !' 
?' h I 'R&r.   r  c                      e Zd ZeZd	 fdZe eej                  d            	 	 	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd              Z
e ee      	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 dd              Ze eej                  d             eee      	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     ZddZddZ xZS )TFCLIPModelc                P    t        |   |g|i | t        |d      | _        y r  )rk   rl   rn  r  r  s       r,   rl   zTFCLIPModel.__init__  s(    3&3F3#F8	r.   r  c                H    | j                   j                  ||||||      }|S )a  
        Returns:
            text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying
            the projection layer to the pooled output of [`TFCLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFCLIPModel

        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
        >>> text_features = model.get_text_features(**inputs)
        ```)r   r   r   r   r  r  )r  r  )	rP   r   r   r   r   r  r  r   r  s	            r,   r  zTFCLIPModel.get_text_features  s7    : 		33)%/!5# 4 
 r.   c                D    | j                   j                  ||||      }|S )aB  
        Returns:
            image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying
            the projection layer to the pooled output of [`TFCLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFCLIPModel

        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> image_features = model.get_image_features(**inputs)
        ```)r   r   r  r  )r  r  )rP   r   r   r  r  r   r  s          r,   r  zTFCLIPModel.get_image_featuresE  s2    B 55%/!5#	 6 
 r.   r  c
           
     8    | j                  ||||||||      }
|
S )a  
        Returns:

        Examples:

        ```python
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFCLIPModel

        >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = tf.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
        ```)r   r   r   r   r  r   r  r  r  )rP   r   r   r   r   r  r   r  r  r   r   s              r,   r   zTFCLIPModel.callo  s7    R ))%)%#/!5#  	
 r.   c                    |S r   r[   )rP   r  s     r,   serving_outputzTFCLIPModel.serving_output  s	     r.   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY wr  r  r   s     r,   r   zTFCLIPModel.build  r  rS  r   rU  r  rl  r  r  r  )r  rA   rU   rA   r   )rV   rW   rX   r   rY  rl   r   r   r  r  r  r  r  CLIP_INPUTS_DOCSTRINGr   rA   r   r  r   r   r   s   @r,   r  r    s   L9
 *+E+L+LMj+kl .28<6:,0/3&*$*$ 6$ 4	$
 *$ -$ $$ $ 
$ m $L *+GH 15,0/3&*&-& *& -	&
 $& & 
& I &P *+@+G+GHe+fg<jQ .2048<6:&*,0/3&*1*1 .1 6	1
 41 $1 *1 -1 $1 1 
/1 R h 1f&r.   r  r   )r'   rC   r(   zOptional[int])r8   rC   rU   rC   )r<   rC   rU   rC   )GrY   
__future__r   r3   dataclassesr   typingr   r   r   r   numpynp
tensorflowr"   activations_tfr
   modeling_tf_outputsr   r   modeling_tf_utilsr   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   r   configuration_clipr   r   r   
get_loggerrV   logger_CHECKPOINT_FOR_DOCr&   r-   r9   r?   rA   rt   Layerr]   r   r   r   r  r  r$  rE  r[  rc  rn  r  CLIP_START_DOCSTRINGr  r  r  r  r  r  r[   r.   r,   <module>r     sG    "  ! . .   / R  S R  M L 
		H	%4  
6- "
; "
 "
JGU\\// GT7 5<<-- 7 tqBell(( qBh!L"" !LHBE++ BEJ=&ELL&& =&@qJELL.. qJh 4,%,,,, 4, 4,n<Bell00 <B~ '.ELL.. '. '.T H
ell(( H
 H
V	;- 	;( T# J  (( V9&+ 9&x;&- ;&| *+\&' \& ,\&r.   