
    sgǩ                       d Z ddlmZ ddlZddlmZ ddlmZm	Z	m
Z
 ddlZddlmZ ddlmZmZmZmZmZmZmZ dd	lmZmZ dd
lmZmZmZmZmZ ddl m!Z!  ejD                  e#      Z$dZ%e G d de             Z& G d dejN                  jP                        Z) G d dejN                  jP                        Z* G d dejN                  jP                        Z+ G d dejN                  jP                        Z, G d dejN                  jP                        Z- G d dejN                  jP                        Z. G d dejN                  jP                        Z/ G d dejN                  jP                        Z0 G d  d!ejN                  jP                        Z1 G d" d#ejN                  jP                        Z2 G d$ d%ejN                  jP                        Z3 G d& d'ejN                  jP                        Z4 G d( d)ejN                  jP                        Z5 G d* d+ejN                  jP                        Z6e G d, d-ejN                  jP                               Z7 G d. d/e      Z8d0Z9d1Z: ed2e9       G d3 d4e8             Z; ed5e9       G d6 d7e8e             Z<y)8zTF 2.0 Cvt model.    )annotationsN)	dataclass)OptionalTupleUnion   )&TFImageClassifierOutputWithNoAttention)TFModelInputTypeTFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_liststable_softmax)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	CvtConfigr   c                  <    e Zd ZU dZdZded<   dZded<   dZded<   y)TFBaseModelOutputWithCLSTokena2  
    Base class for model's outputs.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
    N	tf.Tensorlast_hidden_statecls_token_valuezTuple[tf.Tensor, ...] | Nonehidden_states)__name__
__module____qualname____doc__r   __annotations__r   r        Z/var/www/html/venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.pyr   r   4   s)     $(y'!%OY%26M/6r&   r   c                  .     e Zd ZdZd fdZdddZ xZS )TFCvtDropPathzDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    References:
        (1) github.com:rwightman/pytorch-image-models
    c                2    t        |   di | || _        y )Nr%   )super__init__	drop_prob)selfr-   kwargs	__class__s      r'   r,   zTFCvtDropPath.__init__O   s    "6""r&   c                \   | j                   dk(  s|s|S d| j                   z
  }t        j                  |      d   fdt        t        j                  |            dz
  z  z   }|t        j                  j                  |dd| j                        z   }t        j                  |      }||z  |z  S )N        r   r   )r   )dtype)r-   tfshapelenrandomuniformcompute_dtypefloor)r.   xtraining	keep_probr5   random_tensors         r'   callzTFCvtDropPath.callS   s    >>S H&	!Q!DC,<q,@$AA!BII$5$5eQI[I[$5$\\/I..r&   )r-   floatN)r;   r   )r    r!   r"   r#   r,   r?   __classcell__r0   s   @r'   r)   r)   I   s    
#/ /r&   r)   c                  R     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZdddZddZ xZS )	TFCvtEmbeddingsz-Construct the Convolutional Token Embeddings.c           	         t        	|   di | t        ||||||d      | _        t        j
                  j                  |      | _        y )Nconvolution_embeddings)
patch_sizenum_channels	embed_dimstridepaddingnamer%   )r+   r,   TFCvtConvEmbeddingsrG   r   layersDropoutdropout)
r.   configrH   rI   rJ   rK   rL   dropout_rater/   r0   s
            r'   r,   zTFCvtEmbeddings.__init__`   sO     	"6"&9!%)'
# ||++L9r&   c                N    | j                  |      }| j                  ||      }|S Nr<   )rG   rQ   )r.   pixel_valuesr<   hidden_states       r'   r?   zTFCvtEmbeddings.callw   s*    22<@||L8|Dr&   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrG   )builtgetattrr4   
name_scoperG   rM   buildr.   input_shapes     r'   r]   zTFCvtEmbeddings.build|   o    ::
4148Dt::??@ 8++11$78 8 E8 8   A11A:)rR   r   rH   intrI   rb   rJ   rb   rK   rb   rL   rb   rS   r@   F)rW   r   r<   boolreturnr   rA   r    r!   r"   r#   r,   r?   r]   rB   rC   s   @r'   rE   rE   ]   sY    7:: : 	:
 : : : :.
8r&   rE   c                  L     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZ xZS )rN   zcImage to Convolution Embeddings. This convolutional operation aims to model local spatial contexts.c           
        t        |   d	i | t        j                  j	                  |      | _        t        |t        j                  j                        r|n||f| _
        t        j                  j                  |||ddt        |j                        d      | _        t        j                  j                  dd      | _        || _        || _        y )
NrL   validchannels_last
projection)filterskernel_sizestridesrL   data_formatkernel_initializerrM   h㈵>normalizationepsilonrM   r%   )r+   r,   r   rO   ZeroPadding2DrL   
isinstancecollectionsabcIterablerH   Conv2Dr   initializer_rangerl   LayerNormalizationrs   rI   rJ   )	r.   rR   rH   rI   rJ   rK   rL   r/   r0   s	           r'   r,   zTFCvtConvEmbeddings.__init__   s     	"6"||11'1B(2:{?W?W(X*_iku^v,,--"'.v/G/GH . 
 #\\<<TP_<`("r&   c                &   t        |t              r|d   }| j                  | j                  |            }t	        |      \  }}}}||z  }t        j                  ||||f      }| j                  |      }t        j                  |||||f      }|S )NrW   r5   )rw   dictrl   rL   r   r4   reshapers   )r.   rW   
batch_sizeheightwidthrI   hidden_sizes          r'   r?   zTFCvtConvEmbeddings.call   s    lD)'7Lt||L'AB 3=\2J/
FE<unzz,z;P\6]^)),7 zz,z65R^6_`r&   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   rxY w# 1 sw Y   y xY w)NTrl   rs   )
rZ   r[   r4   r\   rl   rM   r]   rI   rs   rJ   r^   s     r'   r]   zTFCvtConvEmbeddings.build   s    ::
4t,8t334 M%%tT49J9J&KLM4$/;t11667 G""(($dnn)EFG G <M MG Gs   *C&3)C2&C/2C;)rR   r   rH   rb   rI   rb   rJ   rb   rK   rb   rL   rb   )rW   r   re   r   rA   rf   rC   s   @r'   rN   rN      sP    m## # 	#
 # # #6 	Gr&   rN   c                  6     e Zd ZdZd fdZdddZddZ xZS )	 TFCvtSelfAttentionConvProjectionzConvolutional projection layer.c           
     H   t        |   d
i | t        j                  j	                  |      | _        t        j                  j                  ||t        |j                        d|dd|      | _	        t        j                  j                  ddd	      | _        || _        y )Nri   rj   Fconvolution)rm   rn   rq   rL   ro   use_biasrM   groupsrr   g?rs   )ru   momentumrM   r%   )r+   r,   r   rO   rv   rL   r{   r   r|   r   BatchNormalizationrs   rJ   )r.   rR   rJ   rn   rK   rL   r/   r0   s          r'   r,   z)TFCvtSelfAttentionConvProjection.__init__   s    "6"||11'1B <<..#.v/G/GH / 	
 #\\<<TTW^m<n"r&   c                l    | j                  | j                  |            }| j                  ||      }|S rU   )r   rL   rs   r.   rX   r<   s      r'   r?   z%TFCvtSelfAttentionConvProjection.call   s6    ''\(BC)),)Jr&   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d d | j                  g       d d d        t        | dd       \t        j                  | j                  j
                        5  | j                  j                  d d d | j                  g       d d d        y y # 1 sw Y   sxY w# 1 sw Y   y xY w)NTr   rs   )	rZ   r[   r4   r\   r   rM   r]   rJ   rs   r^   s     r'   r]   z&TFCvtSelfAttentionConvProjection.build   s    ::
4-9t//445 K  &&dD$..'IJK4$/;t11667 M""(($dDNN)KLM M <K KM Ms   *C'3*C3'C03C<)
rR   r   rJ   rb   rn   rb   rK   rb   rL   rb   rc   rX   r   r<   rd   re   r   rA   rf   rC   s   @r'   r   r      s    )#"
	Mr&   r   c                      e Zd ZdZddZy)"TFCvtSelfAttentionLinearProjectionz7Linear projection layer used to flatten tokens into 1D.c                d    t        |      \  }}}}||z  }t        j                  ||||f      }|S )Nr   )r   r4   r   )r.   rX   r   r   r   rI   r   s          r'   r?   z'TFCvtSelfAttentionLinearProjection.call   s<    2<\2J/
FE<unzz,z;P\6]^r&   NrX   r   re   r   )r    r!   r"   r#   r?   r%   r&   r'   r   r      s
    Ar&   r   c                  P     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 d fdZdddZd	dZ xZS )
TFCvtSelfAttentionProjectionz'Convolutional Projection for Attention.c                x    t        |   di | |dk(  rt        |||||d      | _        t	               | _        y )Ndw_bnconvolution_projectionrM   r%   )r+   r,   r   r   r   linear_projection)	r.   rR   rJ   rn   rK   rL   projection_methodr/   r0   s	           r'   r,   z%TFCvtSelfAttentionProjection.__init__   sF     	"6"'*J	;F^+D' "D!Er&   c                N    | j                  ||      }| j                  |      }|S rU   )r   r   r   s      r'   r?   z!TFCvtSelfAttentionProjection.call  s-    22<(2S--l;r&   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr   )rZ   r[   r4   r\   r   rM   r]   r^   s     r'   r]   z"TFCvtSelfAttentionProjection.build  r`   ra   )r   )rR   r   rJ   rb   rn   rb   rK   rb   rL   rb   r   strrc   r   rA   rf   rC   s   @r'   r   r      s[    1 ")FF F 	F
 F F F"
8r&   r   c                  p     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZd	d
dZddZ xZS )TFCvtSelfAttentionz
    Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
    query, key, and value embeddings.
    c           	        t        |   di | |dz  | _        || _        || _        || _        t        ||||||	dk(  rdn|	d      | _        t        ||||||	d      | _        t        ||||||	d      | _	        t        j                  j                  |t        |j                        |
dd	
      | _        t        j                  j                  |t        |j                        |
dd
      | _        t        j                  j                  |t        |j                        |
dd
      | _        t        j                  j%                  |      | _        y )Ng      avglinearconvolution_projection_query)r   rM   convolution_projection_keyconvolution_projection_valuezerosprojection_queryunitsrq   r   bias_initializerrM   projection_keyprojection_valuer%   )r+   r,   scalewith_cls_tokenrJ   	num_headsr   r   r   r   r   rO   Denser   r|   r   r   r   rP   rQ   )r.   rR   r   rJ   rn   stride_q	stride_kv	padding_q
padding_kvqkv_projection_methodqkv_biasattention_drop_rater   r/   r0   s                 r'   r,   zTFCvtSelfAttention.__init__  sm     	"6"_
,"",H*?5*HhNc/-
) +G3-+
' -I3/-
) !& 2 2.v/G/GH$# !3 !
 $ll00.v/G/GH$! 1 
 !& 2 2.v/G/GH$# !3 !
 ||++,?@r&   c                    t        |      \  }}}| j                  | j                  z  }t        j                  |||| j                  |f      }t        j
                  |d      }|S )Nr   r      r   r   perm)r   rJ   r   r4   r   	transpose)r.   rX   r   r   _head_dims         r'   "rearrange_for_multi_head_attentionz5TFCvtSelfAttention.rearrange_for_multi_head_attention`  s\    %/%="
K>>T^^3zz,z;PTP^P^`h6ij||L|Dr&   c                   | j                   rt        j                  |d||z  gd      \  }}t        |      \  }}}t        j                  |||||f      }| j                  ||      }	| j                  ||      }
| j                  ||      }| j                   rKt        j                  |
fd      }
t        j                  ||	fd      }	t        j                  ||fd      }| j                  | j                  z  }| j                  | j                  |
            }
| j                  | j                  |	            }	| j                  | j                  |            }t        j                  |
|	d      | j                   z  }t#        |d      }| j%                  ||      }t        j                  ||      }t        |      \  }}}}t        j&                  |d	
      }t        j                  |||| j                  |z  f      }|S )Nr   r   rV   axisT)transpose_b)logitsr   r   r   )r   r4   splitr   r   r   r   r   concatrJ   r   r   r   r   r   matmulr   r   rQ   r   )r.   rX   r   r   r<   	cls_tokenr   r   rI   keyqueryvaluer   attention_scoreattention_probscontextr   s                    r'   r?   zTFCvtSelfAttention.callg  s   &(hh|a%=PRS&T#I| 1;<0H-
Kzz,z65R^6_`--lX-N11,1R11,1RIIy%0q9E))Y,15CIIy%0q9E>>T^^3778M8Me8TU55d6I6I#6NO778M8Me8TU))E3DADJJN(bI,,,J))OU3)'21k1,,w\:**Wz;QY@Y&Z[r&   c                L   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   [xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr   r   r   r   r   r   )rZ   r[   r4   r\   r   rM   r]   r   r   r   rJ   r   r   r^   s     r'   r]   zTFCvtSelfAttention.build  s1   ::
47>Jt@@EEF >1177=>45t<Ht>>CCD <//55d;<47>Jt@@EEF >1177=>4+T2>t4499: J%%++T4,HIJ4)40<t22778 H##))4t~~*FGH4+T2>t4499: J%%++T4,HIJ J ?> >< <> >J JH HJ JsH   I%I'?I4)J )J')JI$'I14I>JJJ#T)rR   r   r   rb   rJ   rb   rn   rb   r   rb   r   rb   r   rb   r   rb   r   r   r   rd   r   r@   r   rd   r   rc   
rX   r   r   rb   r   rb   r<   rd   re   r   rA   )	r    r!   r"   r#   r,   r   r?   r]   rB   rC   s   @r'   r   r     s    $  $GAGA GA 	GA
 GA GA GA GA GA  #GA GA #GA GAR DJr&   r   c                  6     e Zd ZdZd fdZdddZddZ xZS )	TFCvtSelfOutputzOutput of the Attention layer .c                    t        |   di | t        j                  j	                  |t        |j                        d      | _        t        j                  j                  |      | _	        || _
        y Ndense)r   rq   rM   r%   )r+   r,   r   rO   r   r   r|   r   rP   rQ   rJ   )r.   rR   rJ   	drop_rater/   r0   s        r'   r,   zTFCvtSelfOutput.__init__  s`    "6"\\''@X@X0Y`g ( 

 ||++I6"r&   c                P    | j                  |      }| j                  ||      }|S N)inputs)r   r<   r   rQ   r   s      r'   r?   zTFCvtSelfOutput.call  s*    zzz6||<(|Kr&   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   y xY wNTr   rZ   r[   r4   r\   r   rM   r]   rJ   r^   s     r'   r]   zTFCvtSelfOutput.build  r    ::
4$'3tzz/ ?

  $dnn!=>? ? 4? ?   )A>>B)rR   r   rJ   rb   r   r@   rc   r   rA   rf   rC   s   @r'   r   r     s    )#
?r&   r   c                  r     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Zdd	dZd
dZ xZS )TFCvtAttentionzDAttention layer. First chunk of the convolutional transformer block.c                    t        |   di | t        |||||||||	|
||d      | _        t	        |||d      | _        y )N	attentionr   outputr%   )r+   r,   r   r   r   dense_output)r.   rR   r   rJ   rn   r   r   r   r   r   r   r   r   r   r/   r0   s                  r'   r,   zTFCvtAttention.__init__  s]    " 	"6"+!
 ,FIyxXr&   c                    t         rA   )NotImplementedError)r.   headss     r'   prune_headszTFCvtAttention.prune_heads  s    !!r&   c                V    | j                  ||||      }| j                  ||      }|S rU   )r   r   )r.   rX   r   r   r<   self_outputattention_outputs          r'   r?   zTFCvtAttention.call  s4    nn\658nT,,[8,Lr&   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr   r   )rZ   r[   r4   r\   r   rM   r]   r   r^   s     r'   r]   zTFCvtAttention.build  s    ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-. . ;+ +. .s   C%CCC r   )rR   r   r   rb   rJ   rb   rn   rb   r   rb   r   rb   r   rb   r   rb   r   r   r   rd   r   r@   r   r@   r   rd   rc   )rX   r   r   rb   r   rb   r<   rd   rA   )	r    r!   r"   r#   r,   r   r?   r]   rB   rC   s   @r'   r   r     s    N   $!Y!Y !Y 	!Y
 !Y !Y !Y !Y !Y  #!Y !Y #!Y !Y !YF" 
	.r&   r   c                  4     e Zd ZdZd fdZddZddZ xZS )TFCvtIntermediatezNIntermediate dense layer. Second chunk of the convolutional transformer block.c                    t        |   di | t        j                  j	                  t        ||z        t        |j                        dd      | _        || _	        y )Ngelur   )r   rq   
activationrM   r%   )
r+   r,   r   rO   r   rb   r   r|   r   rJ   )r.   rR   rJ   	mlp_ratior/   r0   s        r'   r,   zTFCvtIntermediate.__init__  sX    "6"\\''i)+,.v/G/GH	 ( 

 #r&   c                (    | j                  |      }|S rA   )r   )r.   rX   s     r'   r?   zTFCvtIntermediate.call  s    zz,/r&   c                   | j                   ry d| _         t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   y xY wr   r   r^   s     r'   r]   zTFCvtIntermediate.build  r   r   )rR   r   rJ   rb   r   rb   r   rA   rf   rC   s   @r'   r   r     s    X#?r&   r   c                  6     e Zd ZdZd fdZdddZddZ xZS )	TFCvtOutputzu
    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
    c                    t        |   di | t        j                  j	                  |t        |j                        d      | _        t        j                  j                  |      | _	        || _
        || _        y r   )r+   r,   r   rO   r   r   r|   r   rP   rQ   rJ   r   )r.   rR   rJ   r   r   r/   r0   s         r'   r,   zTFCvtOutput.__init__  sg    "6"\\''@X@X0Y`g ( 

 ||++I6""r&   c                Z    | j                  |      }| j                  ||      }||z   }|S r   r   )r.   rX   input_tensorr<   s       r'   r?   zTFCvtOutput.call  s4    zzz6||<(|K#l2r&   c           	     @   | j                   ry d| _         t        | dd       qt        j                  | j                  j
                        5  | j                  j                  d d t        | j                  | j                  z        g       d d d        y y # 1 sw Y   y xY wr   )
rZ   r[   r4   r\   r   rM   r]   rb   rJ   r   r^   s     r'   r]   zTFCvtOutput.build"  s    ::
4$'3tzz/ U

  $c$..4>>2Q.R!STU U 4U Us   ?BB)rR   r   rJ   rb   r   rb   r   rb   rc   )rX   r   r  r   r<   rd   re   r   rA   rf   rC   s   @r'   r   r     s    #Ur&   r   c                  t     e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZdddZd	dZ xZS )

TFCvtLayera&  
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
    consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
    `Block` class in the original implementation.
    c                   t        |   di | t        |||||||||	|
|||d      | _        t	        |||d      | _        t        ||||d      | _        |dkD  rt        |d      n t        j                  j                  dd      | _        t        j                  j                  dd	
      | _        t        j                  j                  dd
      | _        || _        y )Nr   r   intermediater   r2   	drop_pathr   rr   layernorm_beforert   layernorm_afterr%   )r+   r,   r   r   r   r  r   r   r)   r   rO   
Activationr  r}   r  r	  rJ   )r.   rR   r   rJ   rn   r   r   r   r   r   r   r   r   r   drop_path_rater   r/   r0   s                    r'   r,   zTFCvtLayer.__init__2  s    & 	"6"'!
  .fiQ_`'	9iV^_ # .{;(((D 	 !& ? ?Se ? f$||>>tRc>d"r&   c                   | j                  | j                  |      |||      }| j                  ||      }||z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  ||      }|S rU   )r   r  r  r	  r  r   )r.   rX   r   r   r<   r   layer_outputs          r'   r?   zTFCvtLayer.callc  s    >>$*?*?*MvW\go>p>>*:X>N (,6 ++L9((6 ((|D~~lX~Fr&   c                2   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   NxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr   r  r   r  r  r	  )rZ   r[   r4   r\   r   rM   r]   r  r   r  r  rJ   r	  r^   s     r'   r]   zTFCvtLayer.buildt  s   ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-.4.:t00556 .!!''-.4d+7t~~223 +$$T*+4+T2>t4499: J%%++T4,HIJ4*D1=t33889 I$$**D$+GHI I >+ +. .. .+ +J JI IsH   I%I?I'I43)J)JII$'I14I>J
Jr   )rR   r   r   rb   rJ   rb   rn   rb   r   rb   r   rb   r   rb   r   rb   r   r   r   rd   r   r@   r   r@   r   r@   r  r@   r   rd   rc   r   rA   rf   rC   s   @r'   r  r  +  s    ,  $!/#/# /# 	/#
 /# /# /# /# /#  #/# /# #/# /# /# /#  !/#b"Ir&   r  c                  6     e Zd ZdZd fdZdddZddZ xZS )	
TFCvtStageaK  
    Cvt stage (encoder block). Each stage has 2 parts :
    - (1) A Convolutional Token Embedding layer
    - (2) A Convolutional Transformer Block (layer).
    The classification token is added only in the last stage.

    Args:
        config ([`CvtConfig`]): Model configuration class.
        stage (`int`): Stage number.
    c                   t        |   di | || _        || _        | j                  j                  | j                     rQ| j                  dd| j                  j                  d   ft        | j                  j                        dd      | _        t        | j                  |j                  | j                     | j                  dk(  r|j                  n|j                  | j                  dz
     |j                  | j                     |j                  | j                     |j                  | j                     |j                  | j                     d      | _        t!        j"                  d	|j$                  | j                     |j&                  |         }|D cg c]   }|j)                         j+                         " }}t-        |j&                  | j                           D cg c]X  }t/        |f|j0                  | j                     |j                  | j                     |j2                  | j                     |j4                  | j                     |j6                  | j                     |j8                  | j                     |j:                  | j                     |j<                  | j                     |j>                  | j                     |j@                  | j                     |j                  | j                     |jB                  | j                     || j                     |j                  | j                     d
| d[ c}| _"        y c c}w c c}w )Nr   r   Tzcvt.encoder.stages.2.cls_token)r5   initializer	trainablerM   r   	embedding)rH   rI   rK   rJ   rL   rS   rM   r2   zlayers.)r   rJ   rn   r   r   r   r   r   r   r   r   r   r  r   rM   r%   )#r+   r,   rR   stager   
add_weightrJ   r   r|   rE   patch_sizesrI   patch_stridepatch_paddingr   r  r4   linspacer  depthnumpyitemranger  r   
kernel_qkvr   r   r   r   r   r   r   r   rO   )r.   rR   r  r/   drop_path_ratesr;   jr0   s          r'   r,   zTFCvtStage.__init__  s   "6"
;;  ,!__!T[[22267+DKK,I,IJ5	 - DN )KK))$**504

a,,VEUEUVZV`V`cdVdEe&&tzz2&&tzz2((4))$**5	
 ++c6+@+@+Lfll[`Nab5DE1779>>+EE( 6<<

34'
& %  **4::6 **4::6"--djj94 **4::6 **4::6!,,TZZ8&,&B&B4::&N4$*$>$>tzz$J **4::6 **4::6.tzz:%//

;qc]!
 F
s   %M'EMc                D   d }| j                  ||      }t        |      \  }}}}||z  }t        j                  ||||f      }| j                  j
                  | j                     r;t        j                  | j
                  |d      }t        j                  ||fd      }| j                  D ]  }	 |	||||      }
|
} | j                  j
                  | j                     rt        j                  |d||z  gd      \  }}t        j                  |||||f      }||fS )Nr   r   )repeatsr   r   r   rV   )r  r   r4   r   rR   r   r  repeatr   rO   r   )r.   rX   r<   r   r   r   r   rI   r   layerlayer_outputss              r'   r?   zTFCvtStage.call  s   	~~lH= 3=\2J/
FE<unzz,z;P\6]^;;  ,		$..*1MI99i%>QGL[[ 	)E!,QM(L	) ;;  ,&(hh|a%=PRS&T#I| zz,z65R^6_`Y&&r&   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   bxY w# 1 sw Y   UxY w)NTr  rO   )rZ   r[   r4   r\   r  rM   r]   rO   r.   r_   r%  s      r'   r]   zTFCvtStage.build  s    ::
4d+7t~~223 +$$T*+44(4 &]]5::. &KK%& && 5+ +& &s   C*CCC	)rR   r   r  rb   rc   )rX   r   r<   rd   rA   rf   rC   s   @r'   r  r    s    	-
^'0
&r&   r  c                  R     e Zd ZdZeZd fdZ	 	 	 d	 	 	 	 	 	 	 	 	 ddZddZ xZ	S )	TFCvtEncoderz
    Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
    (depth) being 1, 2 and 10.

    Args:
        config ([`CvtConfig`]): Model configuration class.
    c           	         t        |   di | || _        t        t	        |j
                              D cg c]  }t        ||d|        c}| _        y c c}w )Nzstages.r   r%   )r+   r,   rR   r  r6   r  r  stages)r.   rR   r/   	stage_idxr0   s       r'   r,   zTFCvtEncoder.__init__  sX    "6"W\]`agamam]nWo
JSJvy/DE
 
s   Ac           	        |rdnd }|}t        j                  |d      }d }t        | j                        D ]  \  }}	 |	||      \  }}|s||fz   } t        j                  |d      }|r.t	        |D 
cg c]  }
t        j                  |
d       c}
      }|st	        d |||fD              S t        |||      S c c}
w )Nr%   )r   r   r   r   r   rV   )r   r   r   r   c              3  &   K   | ]	  }||  y wrA   r%   ).0vs     r'   	<genexpr>z$TFCvtEncoder.call.<locals>.<genexpr>  s     bqTUTabs   r   r   r   )r4   r   	enumerater,  tupler   )r.   rW   output_hidden_statesreturn_dictr<   all_hidden_statesrX   r   r   stage_modulehss              r'   r?   zTFCvtEncoder.call  s     #7BD# ||L|D	!*4;;!7 	HA&2<(&S#L)#$5$G!	H ||L|D %Uf&grr||B\'J&g hb\9>O$Pbbb,*%+
 	
 'hs   7Cc                    | j                   ry d| _         t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   IxY w)NTr,  )rZ   r[   r,  r4   r\   rM   r]   r(  s      r'   r]   zTFCvtEncoder.build  sp    ::
44(4 &]]5::. &KK%& && 5& &s   A..A7	rR   r   )FTF)
rW   r
   r6  Optional[bool]r7  r=  r<   r=  re   6Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]rA   )
r    r!   r"   r#   r   config_classr,   r?   r]   rB   rC   s   @r'   r*  r*    s[     L
 05&*#(
&
 -
 $	

 !
 
@
B&r&   r*  c                  ^     e Zd ZdZeZd fdZe	 	 	 	 d	 	 	 	 	 	 	 	 	 dd       ZddZ	 xZ
S )	TFCvtMainLayerzConstruct the Cvt model.c                V    t        |   di | || _        t        |d      | _        y )Nencoderr   r%   )r+   r,   rR   r*  rC  )r.   rR   r/   r0   s      r'   r,   zTFCvtMainLayer.__init__/  s(    "6"#F;r&   c                    |t        d      | j                  ||||      }|d   }|s	|f|dd  z   S t        ||j                  |j                        S )N You have to specify pixel_valuesr6  r7  r<   r   r   r3  )
ValueErrorrC  r   r   r   )r.   rW   r6  r7  r<   encoder_outputssequence_outputs          r'   r?   zTFCvtMainLayer.call4  s{     ?@@,,!5#	 ' 
 *!,#%(;;;,-+;;)77
 	
r&   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrC  )rZ   r[   r4   r\   rC  rM   r]   r^   s     r'   r]   zTFCvtMainLayer.buildQ  si    ::
4D)5t||001 )""4() ) 6) )ra   r<  NNNF)
rW   zTFModelInputType | Noner6  r=  r7  r=  r<   r=  re   r>  rA   )r    r!   r"   r#   r   r?  r,   r   r?   r]   rB   rC   s   @r'   rA  rA  )  sh    "L<
  15/3&*#(
-
 -
 $	

 !
 
@
 
8)r&   rA  c                      e Zd ZdZeZdZdZy)TFCvtPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    cvtrW   N)r    r!   r"   r#   r   r?  base_model_prefixmain_input_namer%   r&   r'   rM  rM  Z  s    
 L$Or&   rM  a  

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TF 2.0 models accepts two formats as inputs:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.

    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.

    </Tip>

    Args:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
al  
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False``):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
z]The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fdZe ee       eee	      	 	 	 	 d	 	 	 	 	 	 	 	 	 dd                     Z
ddZ xZS )	
TFCvtModelc                P    t        |   |g|i | t        |d      | _        y )NrN  r   )r+   r,   rA  rN  r.   rR   r   r/   r0   s       r'   r,   zTFCvtModel.__init__  s(    3&3F3!&u5r&   output_typer?  c                    |t        d      | j                  ||||      }|s|d   f|dd z   S t        |j                  |j                  |j
                        S )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NrE  )rW   r6  r7  r<   r   r   r3  )rG  rN  r   r   r   r   )r.   rW   r6  r7  r<   outputss         r'   r?   zTFCvtModel.call  sy    > ?@@((%!5#	  
 AJ=712;..,%77#33!//
 	
r&   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrN  )rZ   r[   r4   r\   rN  rM   r]   r^   s     r'   r]   zTFCvtModel.build  se    ::
4%1txx}}- %t$% % 2% %ra   r<  rK  )
rW   tf.Tensor | Noner6  r=  r7  r=  r<   r=  re   r>  rA   )r    r!   r"   r,   r   r   TFCVT_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr?   r]   rB   rC   s   @r'   rR  rR    s    
6
 *+AB+HWfg *./3&*#(-
&-
 --
 $	-

 !-
 
@-
 h C -
^%r&   rR  z
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       e Zd Zd fdZe ee       eee	      	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 dd                     Z
ddZ xZS )	TFCvtForImageClassificationc                X   t        |   |g|i | |j                  | _        t        |d      | _        t
        j                  j                  dd      | _        t
        j                  j                  |j                  t        |j                        ddd	      | _        || _        y )
NrN  r   rr   	layernormrt   Tr   
classifierr   )r+   r,   
num_labelsrA  rN  r   rO   r}   r`  r   r   r|   ra  rR   rT  s       r'   r,   z$TFCvtForImageClassification.__init__  s    3&3F3 ++!&u588K8X  ,,,,##.v/G/GH$ - 
 r&   rU  c                   | j                  ||||      }|d   }|d   }| j                  j                  d   r| j                  |      }nUt	        |      \  }	}
}}t        j                  ||	|
||z  f      }t        j                  |d      }| j                  |      }t        j                  |d      }| j                  |      }|d	n| j                  ||
      }|s|f|dd	 z   }||f|z   S |S t        |||j                        S )a+  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        ```rF  r   r   r   r   )r   r   r   r   r   N)labelsr   r   )lossr   r   )rN  rR   r   r`  r   r4   r   r   reduce_meanra  hf_compute_lossr	   r   )r.   rW   rd  r6  r7  r<   rX  rI  r   r   rI   r   r   sequence_output_meanr   re  r   s                    r'   r?   z TFCvtForImageClassification.call  s(   R ((!5#	  
 "!*AJ	;;  $"nnY7O 7A6Q3Jfe jj\[adi[i@jkO ll?KO"nn_=O!~~oAF!56~t4+?+?vV\+?+]Y,F)-)9TGf$EvE54^e^s^sttr&   c                *   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       gt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  d   g       d d d        t        | dd       t        | j                  d      rht        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  d   g       d d d        y y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTrN  r`  r   ra  rM   )rZ   r[   r4   r\   rN  rM   r]   r`  rR   rJ   hasattrra  r^   s     r'   r]   z!TFCvtForImageClassification.build8  sE   ::
4%1txx}}- %t$%4d+7t~~223 N$$dD$++2G2G2K%LMN4t,8t/]]4??#7#78 SOO))4t{{7L7LR7P*QRS S 0 9% %N NS Ss$   E0%6E=/6F	0E:=F	Fr<  )NNNNF)rW   rZ  rd  rZ  r6  r=  r7  r=  r<   r=  re   z?Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]rA   )r    r!   r"   r,   r   r   r[  r   r	   r\  r?   r]   rB   rC   s   @r'   r^  r^    s    $ *+AB+Q`op *.#'/3&*#(@u&@u !@u -	@u
 $@u !@u 
I@u q C @uDSr&   r^  )=r#   
__future__r   collections.abcrx   dataclassesr   typingr   r   r   
tensorflowr4   modeling_tf_outputsr	   modeling_tf_utilsr
   r   r   r   r   r   r   tf_utilsr   r   utilsr   r   r   r   r   configuration_cvtr   
get_loggerr    loggerr\  r   rO   Layerr)   rE   rN   r   r   r   r   r   r   r   r   r  r  r*  rA  rM  TFCVT_START_DOCSTRINGr[  rR  r^  r%   r&   r'   <module>ry     sd    "  ! ) )  I   3  ) 
		H	%  7K 7 7(/ELL&& /(%8ell(( %8P7G%,,,, 7Gt"Mu||'9'9 "MJ);); 85<<#5#5 8DMJ++ MJ`?ell(( ?27.U\\'' 7.t?** ?4U%,,$$ U:^I## ^IB]&## ]&@:&5<<%% :&z -)U\\'' -) -)`%, % 8 & c>%% >%	>%B  eS"68T eSeSr&   