from typing import Any, Optional, Tuple, Union

import flax
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
from ...utils import ModelOutput, add_start_docstrings, logging
from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig


logger = logging.get_logger(__name__)

CLIP_START_DOCSTRING = r"""
    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matters related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
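
    Example (a half-precision loading sketch; the Flax `from_pretrained` accepts the `dtype` argument described
    above):

    ```python
    >>> import jax.numpy as jnp
    >>> from transformers import FlaxCLIPModel

    >>> # parameters stay in float32; only the computation runs in float16
    >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32", dtype=jnp.float16)
    ```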
"""

CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@flax.struct.dataclass
class FlaxCLIPTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPTextModel`].
        last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Ntext_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r    jnpndarray__annotations__r!   r"   r   r   r#        ^/var/www/html/venv/lib/python3.12/site-packages/transformers/models/clip/modeling_flax_clip.pyr   r      s`    ,  $K#%)s{{)7;M8E#++s"234;48Js{{C/018r,   r   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	ej                  ed<   dZ
ej                  ed<   dZeed<   dZeed<   d	ee   fd
Zy)FlaxCLIPOutputah  
    Args:
        logits_per_image (`jnp.ndarray` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`jnp.ndarray` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPTextModel`].
        image_embeds (`jnp.ndarray` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`FlaxCLIPVisionModel`].
        text_model_output (`FlaxBaseModelOutputWithPooling`):
            The output of the [`FlaxCLIPTextModel`].
        vision_model_output (`FlaxBaseModelOutputWithPooling`):
            The output of the [`FlaxCLIPVisionModel`].
    Nlogits_per_imagelogits_per_textr    image_embedstext_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r3   r4   N)getattrto_tuple).0kselfs     r-   	<genexpr>z*FlaxCLIPOutput.to_tuple.<locals>.<genexpr>   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysr<   s   `r-   r9   zFlaxCLIPOutput.to_tuple   s#     
YY[
 
 	
r,   )r$   r%   r&   r'   r0   r(   r)   r*   r1   r    r2   r3   r   r4   r   r   r9   r+   r,   r-   r/   r/      sj    ( %)ckk(#'OS[['#K# $L#++$8<5<:>7>
%* 
r,   r/   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxCLIPVisionEmbeddingsconfigdtypec           
         | j                   j                  }| j                   j                  }| j                   j                  }| j	                  dt
        j                  j                  j                  d      |f      | _	        t        j                  |||f||fdd| j                  t
        j                  j                  j                               | _        ||z  dz  | _        | j                  dz   }t        j                  ||t
        j                  j                  j                         	      | _        t!        j"                  t!        j$                  d
|d      d
      | _        y )Nclass_embedding{Gz?)stddevVALIDF)kernel_sizestridespaddinguse_biasrD   kernel_init   r   embedding_initr   i4rD   axis)rC   hidden_size
image_size
patch_sizeparamjaxnninitializersnormalrF   ConvrD   patch_embeddingnum_patchesEmbedposition_embeddingr(   expand_dimsarangeposition_ids)r<   	embed_dimrW   rX   num_positionss        r-   setupzFlaxCLIPVisionEmbeddings.setup   s   KK++	[[++
[[++
#zz*;SVV=P=P=W=W_c=W=dgpfrs!ww#Z0,**++224 
 '*4:((1,"$((=)TWTZTZTgTgTnTnTp"qOOCJJq-t,T[\]r,   c                 d   | j                  |      }|j                  \  }}}}t        j                  ||||z  |f      }t        j                  | j
                  d      }t        j                  ||ddf      }t        j                  ||gd      }|| j                  | j                        z   }|S )Nr   r   rT   r   )
r_   shaper(   reshaperc   rF   tileconcatenaterb   re   )	r<   pixel_valuespatch_embeds
batch_sizeheightwidthchannelsclass_embeds
embeddingss	            r-   __call__z!FlaxCLIPVisionEmbeddings.__call__   s    ++L9.:.@.@+
FE8{{<*funh1WXt';';&Ixxz1a.@A__lL%AJ
$"9"9$:K:K"LL
r,   N)
r$   r%   r&   r   r*   r(   float32rD   rh   rw   r+   r,   r-   rB   rB      s%    {{E399"^,	r,   rB   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxCLIPTextEmbeddingsrC   rD   c                    | j                   j                  }t        j                  | j                   j                  |t
        j                  j                  j                               | _        t        j                  | j                   j                  |t
        j                  j                  j                               | _
        t        j                  t        j                  d| j                   j                  d      d      | _        y )NrP   r   rR   rS   rj   rT   )rC   rV   r[   ra   
vocab_sizerZ   r\   r]   token_embeddingmax_position_embeddingsrb   r(   rc   rd   re   )r<   rf   s     r-   rh   zFlaxCLIPTextEmbeddings.setup  s    KK++	!xx(>(>	Z]Z`Z`ZmZmZtZtZvw"$((KK//366K^K^KeKeKg#
  OOJJq$++==TJQW
r,   c                     | j                  |j                  d            }| j                  |j                  d            }||z   }|S )NrR   )r}   astyperb   )r<   	input_idsre   input_embedsposition_embedsrv   s         r-   rw   zFlaxCLIPTextEmbeddings.__call__  sH    ++I,<,<T,BC11,2E2Ed2KL!O3
r,   N)
r$   r%   r&   r   r*   r(   rx   rD   rh   rw   r+   r,   r-   rz   rz     s$    {{E399"	
r,   rz   c                       e Zd ZU eeef   ed<   ej                  Z	ej                  ed<   d Z
d Zd Z	 	 	 d
dedefd	Zy)FlaxCLIPAttentionrC   rD   c                 0   | j                   j                  | _        | j                   j                  | _        | j                  | j                  z  | _        | j
                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j
                  dz  | _        | j                   j                  | _	        t        j                  | j                  | j                  t        j                  j                  j                  d            | _        t        j                  | j                  | j                  t        j                  j                  j                  d            | _        t        j                  | j                  | j                  t        j                  j                  j                  d            | _        t        j                  | j                  | j                  t        j                  j                  j                  d            | _        t)        | j                   t*              | _        | j,                  r<t/        t1        j2                  d| j                   j4                  fd	            | _        y y )
Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      {Gz?rD   rN   r   rR   rS   )rC   rV   rf   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutdropoutr[   DenserD   rZ   r\   r]   k_projv_projq_projout_proj
isinstancer   causalr   r(   onesr~   causal_maskr@   s    r-   rh   zFlaxCLIPAttention.setup"  s   0088$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
{{44hht~~TZZSVVM`M`MgMghlMmnhht~~TZZSVVM`M`MgMghlMmnhht~~TZZSVVM`M`MgMghlMmntzzsvvObObOiOijnOop n=;;/!T[[=`=`9aim0noD r,   c                 p    |j                  |j                  d d | j                  | j                  fz         S NrO   )rl   rk   r   r   r<   r"   s     r-   _split_headszFlaxCLIPAttention._split_heads7  s5    $$]%8%8!%<PTP]P]?^%^__r,   c                 Z    |j                  |j                  d d | j                  fz         S r   )rl   rk   rf   r   s     r-   _merge_headszFlaxCLIPAttention._merge_heads:  s,    $$]%8%8!%<?P%PQQr,   Ndeterministicoutput_attentionsc           
      |   | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }d }| j                  r<|j
                  d   |j
                  d   }
}	| j                  d d d d |
|	z
  |
d |
f   }|(|&t        j                  |d      }t        ||d      }n||}n|t        j                  |d      }|t        j                  |dkD  t        j                  |j
                  d      j                  | j                        t        j                  |j
                  t        j                  | j                        j                         j                  | j                              }nd }d }|s | j"                  dkD  r| j%                  d      }t'        ||||| j"                  || j                  d 	      }t        j(                  d
||      }| j+                  |      }| j-                  |      }|r||f}|S |f}|S )Nr   )rT   rR   rS   r   g        r   )biasdropout_rngdropout_rater   rD   	precisionz...hqk,...khd->...qhd)r   r   r   r   r   rk   r   r(   rc   r
   r   selectfullr   rD   finfominr   make_rngr   einsumr   r   )r<   r"   attention_maskr   r   querykeyvaluecausal_attention_maskquery_length
key_lengthattention_biasr   attn_weightsattn_outputoutputss                   r-   rw   zFlaxCLIPAttention.__call__=  s    M*kk-(M*!!%($!!%( $;;',{{1~syy|*L$($4$4Q:;TWa;acndncn5n$o!%*?*K __^(KN*>;PX\]N".2N' __^(KN% ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!3--	2K4#'**	
 jj!8,N''4mmK01B;- JUr,   )NTF)r$   r%   r&   r   r   r   r*   r(   rx   rD   rh   r   r   boolrw   r+   r,   r-   r   r     s[    ."2233{{E399"p*`R ""'9 	9
  9r,   r   c                   d    e Zd ZU eeef   ed<   ej                  Z	ej                  ed<   d Z
d Zy)FlaxCLIPMLPrC   rD   c                    t         | j                  j                     | _        t	        j
                  | j                  j                  | j                  t        j                  j                  j                  d            | _        t	        j
                  | j                  j                  | j                  t        j                  j                  j                  d            | _        y )Nr   r   )r   rC   
hidden_actactivation_fnr[   r   intermediate_sizerD   rZ   r\   r]   fc1rV   fc2r@   s    r-   rh   zFlaxCLIPMLP.setup}  s    #DKK$:$:;88KK))**++2248

 88DKK334::SVSYSYSfSfSmSmnrSstr,   c                 l    | j                  |      }| j                  |      }| j                  |      }|S N)r   r   r   r   s     r-   rw   zFlaxCLIPMLP.__call__  s4    /**=9/r,   N)r$   r%   r&   r   r   r   r*   r(   rx   rD   rh   rw   r+   r,   r-   r   r   y  s0    ."2233{{E399"ur,   r   c                   t    e Zd ZU eeef   ed<   ej                  Z	ej                  ed<   d Z
	 	 ddedefdZy)	FlaxCLIPEncoderLayerrC   rD   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                        | _        t        | j                  | j                        | _	        t	        j
                  | j                  j                  | j                        | _
        y NrS   )epsilonrD   )r   rC   rD   	self_attnr[   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r@   s    r-   rh   zFlaxCLIPEncoderLayer.setup  sv    *4;;djjI<<0J0JRVR\R\]t{{$**=<<0J0JRVR\R\]r,   r   r   c                     |}| j                  |      }| j                  ||||      }|d   }||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||dd  z  }|S )N)r"   r   r   r   r   r   )r   r   r   r   )r<   r"   r   r   r   residualattn_outputsr   s           r-   rw   zFlaxCLIPEncoderLayer.__call__  s     !((7~~')'/	 & 
 %Q =0 ((7/ =0 "|AB''Gr,   N)TFr$   r%   r&   r   r   r   r*   r(   rx   rD   rh   r   rw   r+   r,   r-   r   r     sL    ."2233{{E399"^ #"' 	
  r,   r   c            	           e Zd ZU eeef   ed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 	 d
dedededefd	Zy)FlaxCLIPLayerCollectionrC   rD   c           	          t        | j                  j                        D cg c]-  }t        | j                  t	        |      | j
                        / c}| _        y c c}w )N)namerD   )rangerC   num_hidden_layersr   strrD   layers)r<   is     r-   rh   zFlaxCLIPLayerCollection.setup  sG     4;;889
 !3q6L
 
s   2ANr   r   output_hidden_statesreturn_dictc                     |rdnd }|rdnd }| j                   D ]'  }	|r||fz  } |	||||      }
|
d   }|s||
d   fz  }) |r||fz  }|f}|st        d |D              S t        |||      S )Nr+   )r   r   r   r   c              3   &   K   | ]	  }||  y wr   r+   )r:   vs     r-   r=   z3FlaxCLIPLayerCollection.__call__.<locals>.<genexpr>  s     =qq}=s   )r!   r"   r#   )r   r>   r   )r<   r"   r   r   r   r   r   all_attentionsall_hidden_stateslayerlayer_outputsr   s               r-   rw   z FlaxCLIPLayerCollection.__call__  s      1d"6BD[[ 
	6E#!m%55!!~]^oM *!,M =#3"55
	6  -!11 "=G==="+;LYg
 	
r,   NTFFTr   r+   r,   r-   r   r     sh    ."2233{{E399"
 ""'%* "
 	"

  "
 #"
 "
r,   r   c            	           e Zd ZU eeef   ed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 	 d
dedededefd	Zy)FlaxCLIPEncoderrC   rD   c                 P    t        | j                  | j                        | _        y NrS   )r   rC   rD   r   r@   s    r-   rh   zFlaxCLIPEncoder.setup  s    -dkkLr,   Nr   r   r   r   c                 0    | j                  ||||||      S )N)r"   r   r   r   r   r   )r   )r<   inputs_embedsr   r   r   r   r   s          r-   rw   zFlaxCLIPEncoder.__call__  s,     {{')'/!5#  
 	
r,   r   r   r+   r,   r-   r   r     si    ."2233{{E399"M ""'%* 
 	

  
 #
 
r,   r   c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxCLIPTextTransformerrC   rD   c                 F   t        | j                  | j                        | _        t	        | j                  | j                        | _        t        j                  | j                  j                  | j                        | _	        | j                  j                  | _
        y r   )rz   rC   rD   rv   r   encoderr[   r   r   final_layer_normeos_token_idr@   s    r-   rh   zFlaxCLIPTextTransformer.setup  sf    0DJJO&t{{$**E "T[[5O5OW[WaWa b !KK44r,   r   r   r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  ||||||      }	|	d   }
| j                  |
      }
| j                  dk(  r8|
t        j                  |
j                  d         |j                  d      f   }nD|
t        j                  |
j                  d         || j                  k(  j                  d      f   }|s
|
|f|	dd  z   S t        |
||	j                  |	j                        S )	N)r   re   )r   r   r   r   r   r   r   rO   rT   r   r!   pooler_outputr"   r#   )rC   r   r   use_return_dictrv   r   r   r   r(   rd   rk   argmaxr   r"   r#   )r<   r   r   re   r   r   r   r   r"   encoder_outputsr!   pooled_outputs               r-   rw   z FlaxCLIPTextTransformer.__call__  sq    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]),W,,')'/!5# ' 
 ,A. 112CD! .cjj9J9P9PQR9S.TV_VfVflnVfVo.opM .

,22156dFWFW9W8_8_eg8_8hhM %}58KKK-/')77&11	
 	
r,   NTFFTr$   r%   r&   r   r*   r(   rx   rD   rh   r   rw   r+   r,   r-   r   r      sZ    {{E399"5 #"'%* 3

 3
  3
 #3
 3
r,   r   c                   p    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 dde	de	fdZ
y)	FlaxCLIPVisionTransformerrC   rD   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                        | _        t        | j                  | j                        | _	        t	        j
                  | j                  j                  | j                        | _
        y r   )rB   rC   rD   rv   r[   r   r   pre_layrnormr   r   post_layernormr@   s    r-   rh   zFlaxCLIPVisionTransformer.setupF  sv    24;;djjQLL1K1KSWS]S]^&t{{$**E ll4;;3M3MUYU_U_`r,   Nr   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }| j                  |      }| j                  |||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )N)r   r   r   r   r   r   r   r   )rC   r   r   r   rv   r   r   r   r   r"   r#   )
r<   ro   r   r   r   r   r"   r   r!   r   s
             r-   rw   z"FlaxCLIPVisionTransformer.__call__L  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]5))-8,,''/!5# ' 
 ,A.)!Q'2++M:%}58KKK-/')77&11	
 	
r,   )NTNNTr$   r%   r&   r   r*   r(   rx   rD   rh   r   rw   r+   r,   r-   r   r   B  sJ    {{E399"a "! %
 %
 %
r,   r   c                   8    e Zd ZU eZdZej                  ed<   dde	j                  dfdedede	j                  d	ef fd
Zddej                   j"                  dededefdZ	 	 	 	 	 	 	 	 ddedej                   j"                  dedee   dee   dee   fdZ xZS )FlaxCLIPTextPreTrainedModelNmodule_classr   r   r   TrC   seedrD   _do_initc                 Z     | j                   d||d|}t        | 	  ||||||       y )NrC   rD   input_shaper  rD   r  r+   )r  super__init__	r<   rC   r  r  rD   r  kwargsmodule	__class__s	           r-   r  z$FlaxCLIPTextPreTrainedModel.__init__x  s=     #""H&HH[tSXcklr,   rngr  paramsr5   c                 L   t        j                  |d      }t        j                  t        j                  t        j                  |      j
                  d         |      }t        j                  |      }t        j                  j                  |      \  }}||d}	| j                  j                  |	|||      d   }
|dt        t        |
            }
t        t        |            }| j                  D ]
  }|
|   ||<    t               | _        t!        t#        |            S |
S )NrR   rS   r   r  r   r  )r(   zerosbroadcast_tord   
atleast_2drk   	ones_likerZ   randomsplitr  initr   r	   _missing_keyssetr   r   )r<   r  r  r  r   re   r   
params_rngr   rngsrandom_paramsmissing_keys               r-   init_weightsz(FlaxCLIPTextPreTrainedModel.init_weights  s    IIk6	''

3>>)3L3R3RSU3V(WYdey1"%**"2"23"7
K$=((y.,WX`a(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r,   r   trainr   r   r   c
                 p   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|St	        j
                  t	        j                  t	        j                  |      j                  d         |j                        }|t	        j                  |      }i }
|||
d<   | j                  j                  d|xs | j                  it	        j                  |d      t	        j                  |d      t	        j                  |d      | |||	|
	      S )Nr   r   r  rR   rS   r!  )rC   r   r   r   r(   r  rd   r  rk   r  r  applyr  array)r<   r   r   re   r  r   r%  r   r   r   r!  s              r-   rw   z$FlaxCLIPTextPreTrainedModel.__call__  s!    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N ")DO{{  v,-IIit,IInD1IIl$/I  ! 

 
	
r,   r   NNNNFNNN)r$   r%   r&   r   config_classr  r[   Moduler*   r(   rx   intrD   r   r  rZ   r  PRNGKeyr   r   r$  dictr   rw   __classcell__r  s   @r-   r  r  t  s    !L"L"))"
 ;;
m
m 	
m
 yy
m 
m!

 2 2 ! !PZ !fp !0 *.,0/3&*'

 '
 ZZ'''
 '
 $D>'
 'tn'
 d^'
r,   r  c                   B    e Zd ZU eZdZdZej                  e	d<   dde
j                  dfdedee   ded	e
j                  d
ef
 fdZddej&                  j(                  dededefdZ	 	 	 	 	 	 ddedej&                  j(                  dedee   dee   dee   fdZ xZS )FlaxCLIPVisionPreTrainedModelro   Nr  r   TrC   r  r  rD   r  c                     |d|j                   |j                   df} | j                  d||d|}t        |   ||||||       y )Nr   r   r
  r  r+   )rW   r  r  r  r  s	           r-   r  z&FlaxCLIPVisionPreTrainedModel.__init__  s]     f//1B1BAFK"""H&HH[tSXcklr,   r  r  r5   c                    t         j                  j                  ||      }t         j                  j                  |      \  }}||d}| j                  j                  ||      d   }|dt        t        |            }t        t        |            }| j                  D ]
  }	||	   ||	<    t               | _        t        t        |            S |S )Nr  r  )rZ   r  r]   r  r  r  r   r	   r  r  r   r   )
r<   r  r  r  ro   r   r   r!  r"  r#  s
             r-   r$  z*FlaxCLIPVisionPreTrainedModel.init_weights  s    zz((k:"%**"2"23"7
K$=((|<XF(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r,   r   r%  r   r   r   c           	         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }t	        j
                  |d      }i }|||d<   | j                  j                  d|xs | j                  it	        j                  |t        j                        | ||||      S )Nr   rO   r   r   r   r  rS   r'  )rC   r   r   r   r(   	transposer  r(  r  r)  rx   )	r<   ro   r  r   r%  r   r   r   r!  s	            r-   rw   z&FlaxCLIPVisionPreTrainedModel.__call__  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY}}\<@ ")DO{{  v,-IIl#++6I  ! 
 	
r,   r   )NNFNNN)r$   r%   r&   r   r+  main_input_namer  r[   r,  r*   r(   rx   r   r   r-  rD   r   r  rZ   r  r.  r   r$  r/  rw   r0  r1  s   @r-   r3  r3    s   #L$O"L"))"
 (,;;m m e_m 	m
 yym m!

 2 2 ! !PZ !fp !, *.,0/3&*
 
 ZZ''	

 
 $D>
 'tn
 d^
r,   r3  c                       e Zd ZU eZdZej                  ed<   dde	j                  dfdedee   dede	j                  d	ef
 fd
Zddej$                  j&                  dededefdZ	 	 	 	 	 	 	 	 ddedej$                  j&                  dedee   dee   dee   fdZ	 	 	 	 	 ddedej$                  j&                  fdZ	 ddedej$                  j&                  fdZ xZS )FlaxCLIPPreTrainedModelNr  r   TrC   r  r  rD   r  c                     |0dd|j                   j                  |j                   j                  dff} | j                  d||d|}t        |   ||||||       y )Nr  r   r   r
  r  r+   )vision_configrW   r  r  r  r  s	           r-   r  z FlaxCLIPPreTrainedModel.__init__  so     !Av';';'F'FH\H\HgHgij#klK"""H&HH[tSXcklr,   r  r  r5   c                    t        j                  |d   d      }t        j                  t        j                  t        j                  |      j
                  d         |d         }t        j                  |      }t        j                  j                  ||d         }t        j                  j                  |      \  }}	||	d}
| j                  j                  |
||||      d   }|dt        t        |            }t        t        |            }| j                  D ]
  }||   ||<    t!               | _        t#        t%        |            S |S )Nr   rR   rS   r   r   r  r  )r(   r  r  rd   r  rk   r  rZ   r  r]   r  r  r  r   r	   r  r  r   r   )r<   r  r  r  r   re   r   ro   r   r   r!  r"  r#  s                r-   r$  z$FlaxCLIPPreTrainedModel.init_weights  s%   IIk!nD9	''

3>>)3L3R3RSU3V(WYdefYghy1zz((k!n="%**"2"23"7
K$=((y,Xdefno(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r,   r   r%  r   r   r   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|St	        j
                  t	        j                  t	        j                  |      j                  d         |j                        }|t	        j                  |      }t	        j                  |d      }i }|||d<   | j                  j                  d|xs | j                  it	        j                  |d      t	        j                  |t        j                        t	        j                  |d      t	        j                  |d      | ||	|
|
      S )Nr   r7  r   r  rR   rS   r'  )rC   r   r   r   r(   r  rd   r  rk   r  r8  r  r(  r  r)  rx   )r<   r   ro   r   re   r  r   r%  r   r   r   r!  s               r-   rw   z FlaxCLIPPreTrainedModel.__call__4  sC    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N}}\<@ ")DO{{  v,-IIit,IIl#++6IInD1IIl$/I  ! 
 	
r,   c           	         |St        j                  t        j                  t        j                  |      j                  d         |j                        }|t        j
                  |      }i }|||d<   d }| j                  j                  d|xs | j                  it        j                  |d      t        j                  |d      t        j                  |d      | ||      S )at  
        Args:
            input_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)

        Returns:
            text_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
            the projection layer to the pooled output of [`FlaxCLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, FlaxCLIPModel

        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
        >>> text_features = model.get_text_features(**inputs)
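
        >>> # `text_features` is not L2-normalized; a normalization sketch, mirroring what `FlaxCLIPModule`
        >>> # does internally before computing similarity logits:
        >>> import jax.numpy as jnp

        >>> text_features = text_features / jnp.linalg.norm(text_features, axis=-1, keepdims=True)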
        ```"""
        if position_ids is None:
            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _get_features(module, input_ids, attention_mask, position_ids, deterministic):
            text_outputs = module.text_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                deterministic=deterministic,
            )
            pooled_output = text_outputs[1]
            text_features = module.text_projection(pooled_output)
            return text_features

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(input_ids, dtype="i4"),
            jnp.array(attention_mask, dtype="i4"),
            jnp.array(position_ids, dtype="i4"),
            not train,
            method=_get_features,
            rngs=rngs,
        )

    def get_image_features(
        self, pixel_values, params: dict = None, dropout_rng: jax.random.PRNGKey = None, train=False
    ):
        r"""
        Args:
            pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained
                using [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.

        Returns:
            image_features (`jnp.ndarray` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`FlaxCLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlaxCLIPModel

        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="np")

        >>> image_features = model.get_image_features(**inputs)
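
        >>> # `image_features` is not L2-normalized; a normalization sketch, mirroring what `FlaxCLIPModule`
        >>> # does internally before computing similarity logits:
        >>> import jax.numpy as jnp

        >>> image_features = image_features / jnp.linalg.norm(image_features, axis=-1, keepdims=True)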
        ```"""
        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _get_features(module, pixel_values, deterministic):
            vision_outputs = module.vision_model(pixel_values=pixel_values, deterministic=deterministic)
            pooled_output = vision_outputs[1]  # pooled_output
            image_features = module.visual_projection(pooled_output)
            return image_features

        return self.module.apply(
            {"params": params or self.params},
            jnp.array(pixel_values, dtype=jnp.float32),
            not train,
            method=_get_features,
            rngs=rngs,
        )


class FlaxCLIPTextModule(nn.Module):
    config: CLIPTextConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        self.text_model = FlaxCLIPTextTransformer(self.config, dtype=self.dtype)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel):
    module_class = FlaxCLIPTextModule


FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxCLIPTextModel

    >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state
    >>> pooler_output = outputs.pooler_output  # pooled (EOS token) states
    ```
)output_typer+  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)%FlaxCLIPTextModelWithProjectionModulerC   rD   c                     t        | j                  | j                        | _        t	        j
                  | j                  j                  d| j                        | _        y )NrS   F)rM   rD   )r   rC   rD   rB  r[   r   projection_dimrC  r@   s    r-   rh   z+FlaxCLIPTextModelWithProjectionModule.setup  s>    1$++TZZP!xx(B(BUZ^ZdZder,   r   r   r   r   c           	          | j                  |||||||      }|d   }	| j                  |	      }
|s|
|d   f|dd  z   S t        |
|j                  |j                  |j
                        S )NrU  r   r   rO   )r    r!   r"   r#   )rB  rC  r   r!   r"   r#   )r<   r   r   re   r   r   r   r   rD  r   r    s              r-   rw   z.FlaxCLIPTextModelWithProjectionModule.__call__  s     )%'/!5# ' 
 %Q**=9a1L4DDD&#*<<&44#..	
 	
r,   Nr   r   r+   r,   r-   rZ  rZ    s[    {{E399"f #"'%* 

 
  
 #
 
r,   rZ  c                       e Zd ZeZy)FlaxCLIPTextModelWithProjectionN)r$   r%   r&   rZ  r  r+   r,   r-   r_  r_  ;  s    8Lr,   r_  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxCLIPTextModelWithProjection

    >>> model = FlaxCLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
    >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> text_embeds = outputs.text_embeds
    ```
c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxCLIPVisionModulerC   rD   c                 P    t        | j                  | j                        | _        y r   )r   rC   rD   rL  r@   s    r-   rh   zFlaxCLIPVisionModule.setup]  s    5dkkTr,   r   r   r   r   c                 .    | j                  |||||      S )Nro   r   r   r   r   )rL  )r<   ro   r   r   r   r   s         r-   rw   zFlaxCLIPVisionModule.__call__`  s+       %'/!5# ! 
 	
r,   Nr   r  r+   r,   r-   ra  ra  Y  s[    {{E399"U #"'%* 
 
  	

 #
 
r,   ra  c                       e Zd ZeZy)FlaxCLIPVisionModelN)r$   r%   r&   ra  r  r+   r,   r-   rf  rf  q  s    'Lr,   rf  a  
    Returns:

    Example:

    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlaxCLIPVisionModel

    >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(images=image, return_tensors="np")

    >>> outputs = model(**inputs)
    >>> last_hidden_state = outputs.last_hidden_state
    >>> pooler_output = outputs.pooler_output  # pooled CLS states
    ```
c                   r    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 	 	 	 dde	fdZ
y)FlaxCLIPModulerC   rD   c                      j                   j                  } j                   j                  } j                   j                   _        |j                   _        |j                   _        t        | j                         _	        t        | j                         _        t        j                   j                   j                  t        j                  j                  j!                  d      d       _        t        j                   j                   j                  t        j                  j                  j!                  d      d       _         j'                  d fdg        _        y )NrS   rG   F)rD   rN   rM   logit_scalec                 \    t        j                  |      j                  j                  z  S r   )r(   r   rC   logit_scale_init_value)_rk   r<   s     r-   <lambda>z&FlaxCLIPModule.setup.<locals>.<lambda>  s    CHHUOdkk>`>`,` r,   )rC   text_configr=  r\  rV   text_embed_dimvision_embed_dimr   rD   rB  r   rL  r[   r   rZ   r\   r]   rM  rC  rY   rj  )r<   ro  r=  s   `  r-   rh   zFlaxCLIPModule.setup  s
   kk--11"kk88)55 - 9 91+TZZP5m4::V!#**++2248	"
  "xx**++2248	 
  ::`bd
r,   Nr   c	           	      P   ||n| j                   j                  }| j                  |||||      }	| j                  |||||||      }
|	d   }| j	                  |      }|
d   }| j                  |      }|t        j                  j                  |dd      z  }|t        j                  j                  |dd      z  }t        j                  | j                        }t        j                  ||j                        |z  }|j                  }|s|||||
|	fS t        |||||
|	      S )Nrd  rU  r   r   T)rU   keepdims)r0   r1   r    r2   r3   r4   )rC   r   rL  rB  rM  rC  r(   linalgnormexprj  matmulTr/   )r<   r   ro   r   re   r   r   r   r   rN  rD  r2   r    rj  r1   r0   s                   r-   rw   zFlaxCLIPModule.__call__  sM    &1%<k$++BYBY**%'/!5# + 
 )%'/!5# ' 
 &a(--l;"1o**;7 $cjjoolVZo&[[!CJJOOKbSWO$XX ggd../**[,..AKO*,,$o{LR^`noo-+#%* .
 	
r,   )NNNNTNNN)r$   r%   r&   r   r*   r(   rx   rD   rh   r   rw   r+   r,   r-   rh  rh    sH    {{E399"
< "!8
 8
r,   rh  c                       e Zd ZeZy)FlaxCLIPModelN)r$   r%   r&   rh  r  r+   r,   r-   rz  rz    s    !Lr,   rz  ai  
    Returns:

    Example:

    ```python
    >>> import jax
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, FlaxCLIPModel

    >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)

    >>> inputs = processor(
    ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True
    ... )

    >>> outputs = model(**inputs)
    >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
    >>> probs = jax.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
    ```
)Mtypingr   r   r   r   flax
flax.linenlinenr[   rZ   	jax.numpynumpyr(   flax.core.frozen_dictr   r   r	   r
   r   flax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_clipr   r   r   
get_loggerr$   loggerCLIP_START_DOCSTRINGCLIP_TEXT_INPUTS_DOCSTRINGCLIP_VISION_INPUTS_DOCSTRINGCLIP_INPUTS_DOCSTRINGstruct	dataclassr   r/   r,  rB   rz   r   r   r   r   r   r   r   r  r3  r;  rR  rW  FLAX_CLIP_TEXT_MODEL_DOCSTRINGrZ  r_  .FLAX_CLIP_TEXT_MODEL_WITH_PROJECTION_DOCSTRINGra  rf   FLAX_CLIP_VISION_MODEL_DOCSTRINGrh  rz  FLAX_CLIP_MODEL_DOCSTRINGr+   r,   r-   <module>r     s    / .   
  > > 6 > ;  X  @ ? L L 
		H	%! F @  ! H 9k 9 9:  
[  
  
F#ryy #LRYY .X		 Xv")) ('299 'T,
bii ,
^
bii 
4?
bii ?
D/
		 /
dL
"5 L
^E
$7 E
PJ
1 J
Z
 
8&3 &" & *,FIg,g h  #AP^
'
BII '
T9&A 92 .$ #%?Bp%p !#1HWe

299 
0(7 ($  0 ,.JMm.m n  %CRb
X
RYY X
v *+"+ " ," 6 (=@Y(Y Z  NYc dr,   