
    sg              	       B   d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z#  ejH                  e%      Z&dZ'dZ(g dZ)dZ*dZ+dBde,de,dee,   de,fdZ- G d de
j\                        Z/ G d de
j\                        Z0 G d de
j\                        Z1 G d de
j\                        Z2 G d d e
j\                        Z3 G d! d"e
j\                        Z4 G d# d$e
j\                        Z5 G d% d&e
j\                        Z6 G d' d(e
j\                        Z7 G d) d*e
j\                        Z8 G d+ d,e
j\                        Z9 G d- d.e
j\                        Z: G d/ d0e      Z;d1Z<d2Z= ed3e<       G d4 d5e;             Z> ed6e<       G d7 d8e;             Z? G d9 d:e
j\                        Z@ G d; d<e
j\                        ZA G d= d>e
j\                        ZB ed?e<       G d@ dAe;             ZCy)CzPyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )MobileViTConfigr   zapple/mobilevit-small)r   i     r   ztabby, tabby catvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r    	new_values       c/var/www/html/venv/lib/python3.12/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler(   >   sS     	Is57Q;#677BWLMI3;W	y>    c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr!   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r#   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r-   r.   r/   r0   paddingr3   r1   r2   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r%   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r8   	__class__s               r'   r@   zMobileViTConvLayer.__init__N   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr)   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S N)rC   rE   rH   )rJ   rL   s     r'   forwardzMobileViTConvLayer.forward   sK    ##H-)))(3H??&x0Hr)   )r   r   Fr   TT)__name__
__module____qualname__r   r%   boolr   rG   r@   torchTensorrO   __classcell__rK   s   @r'   r+   r+   M   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r)   r+   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTInvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r,   r-   r.   r0   r3   r!   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )Nr   )r   r#   zInvalid stride .r   r-   r.   r/   r   )r-   r.   r/   r0   r1   r3   Fr-   r.   r/   r5   )r?   r@   r(   r%   roundexpand_ratiorA   use_residualr+   
expand_1x1conv_3x3
reduce_1x1)rJ   r,   r-   r.   r0   r3   expanded_channelsrK   s          r'   r@   z"MobileViTInvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J,:KYZ
 +)*$
 -)% 
r)   rL   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S rN   )ra   rb   rc   r`   )rJ   rL   residuals      r'   rO   z!MobileViTInvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr)   r   )rP   rQ   rR   __doc__r   r%   r@   rT   rU   rO   rV   rW   s   @r'   rY   rY      sc    
 jk
%
47
GJ
TW
cf
	
BF F Fr)   rY   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTMobileNetLayerr,   r-   r.   r0   
num_stagesr!   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r-   r.   r0   )r?   r@   r   
ModuleListlayerrangerY   append)	rJ   r,   r-   r.   r0   rk   irn   rK   s	           r'   r@   z MobileViTMobileNetLayer.__init__   sh     	]]_
z" 	'A-')!"avQ	E JJe$&K	'r)   rL   c                 8    | j                   D ]
  } ||      } |S rN   rn   )rJ   rL   layer_modules      r'   rO   zMobileViTMobileNetLayer.forward   s$     JJ 	.L#H-H	.r)   )r   r   
rP   rQ   rR   r   r%   r@   rT   rU   rO   rV   rW   s   @r'   rj   rj      sV    op'%'47'GJ'TW'il'	'   r)   rj   c                        e Zd Zdededdf fdZdej                  dej                  fdZdej                  dej                  fd	Z	 xZ
S )
MobileViTSelfAttentionr,   hidden_sizer!   Nc                    t         |           ||j                  z  dk7  rt        d|f d|j                   d      |j                  | _        t	        ||j                  z        | _        | j                  | j
                  z  | _        t        j                  || j                  |j                        | _
        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  |j                        | _        y )Nr   zThe hidden size z4 is not a multiple of the number of attention heads r[   )r2   )r?   r@   num_attention_headsrA   r%   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrJ   r,   rx   rK   s      r'   r@   zMobileViTSelfAttention.__init__   s    333q8";<. 1334A7 
 $*#=#= #&{V5O5O'O#P !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
zz&"E"EFr)   xc                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr   r#   r   r   )sizerz   r{   viewpermute)rJ   r   new_x_shapes      r'   transpose_for_scoresz+MobileViTSelfAttention.transpose_for_scores   sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$r)   hidden_statesc                    | j                  |      }| j                  | j                  |            }| j                  | j                  |            }| j                  |      }t	        j
                  ||j                  dd            }|t        j                  | j                        z  }t        j                  j                  |d      }| j                  |      }t	        j
                  ||      }|j                  dddd      j                         }|j!                         d d | j"                  fz   }	 |j$                  |	 }|S )Nr   dimr   r#   r   r   )r   r   r   r   rT   matmul	transposemathsqrtr{   r   
functionalsoftmaxr   r   
contiguousr   r|   r   )
rJ   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapes
             r'   rO   zMobileViTSelfAttention.forward   s&    JJ}5--dhh}.EF	//

=0IJ//0AB !<<Y5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDr)   )rP   rQ   rR   r   r%   r@   rT   rU   r   rO   rV   rW   s   @r'   rw   rw      sW    G GS GT G&%ell %u|| %
U\\ ell r)   rw   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfOutputr,   rx   r!   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rN   r?   r@   r   r}   denser   hidden_dropout_probr   r   s      r'   r@   zMobileViTSelfOutput.__init__  s6    YY{K8
zz&"<"<=r)   r   c                 J    | j                  |      }| j                  |      }|S rN   r   r   rJ   r   s     r'   rO   zMobileViTSelfOutput.forward  s$    

=1]3r)   ru   rW   s   @r'   r   r     s8    > >S >T >
U\\ ell r)   r   c                   z     e Zd Zdededdf fdZdee   ddfdZdej                  dej                  fd	Z
 xZS )
MobileViTAttentionr,   rx   r!   Nc                     t         |           t        ||      | _        t	        ||      | _        t               | _        y rN   )r?   r@   rw   	attentionr   outputsetpruned_headsr   s      r'   r@   zMobileViTAttention.__init__  s4    /D)&+>Er)   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   r   rz   r{   r   r   r   r   r   r   r   r|   union)rJ   r   indexs      r'   prune_headszMobileViTAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r)   r   c                 J    | j                  |      }| j                  |      }|S rN   )r   r   )rJ   r   self_outputsattention_outputs       r'   rO   zMobileViTAttention.forward,  s%    ~~m4;;|4r)   )rP   rQ   rR   r   r%   r@   r   r   rT   rU   rO   rV   rW   s   @r'   r   r     sO    " "S "T ";S ;d ;$ U\\  ell  r)   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTIntermediater,   rx   intermediate_sizer!   Nc                     t         |           t        j                  ||      | _        t        |j                  t              rt        |j                     | _	        y |j                  | _	        y rN   )
r?   r@   r   r}   r   rF   rI   rG   r   intermediate_act_fnrJ   r,   rx   r   rK   s       r'   r@   zMobileViTIntermediate.__init__3  sR    YY{,=>
f''-'-f.?.?'@D$'-'8'8D$r)   r   c                 J    | j                  |      }| j                  |      }|S rN   )r   r   r   s     r'   rO   zMobileViTIntermediate.forward;  s&    

=100?r)   ru   rW   s   @r'   r   r   2  sA    9 9S 9UX 9]a 9U\\ ell r)   r   c                        e Zd Zdedededdf fdZdej                  dej                  dej                  fd	Z xZ	S )
MobileViTOutputr,   rx   r   r!   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rN   r   r   s       r'   r@   zMobileViTOutput.__init__B  s7    YY0+>
zz&"<"<=r)   r   input_tensorc                 T    | j                  |      }| j                  |      }||z   }|S rN   r   )rJ   r   r   s      r'   rO   zMobileViTOutput.forwardG  s.    

=1]3%4r)   ru   rW   s   @r'   r   r   A  sO    > >S >UX >]a >
U\\  RWR^R^ r)   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerLayerr,   rx   r   r!   Nc                 $   t         |           t        ||      | _        t	        |||      | _        t        |||      | _        t        j                  ||j                        | _        t        j                  ||j                        | _        y )Nr;   )r?   r@   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r'   r@   z"MobileViTTransformerLayer.__init__O  sq    +FK@1&+GXY%fk;LM "[f>S>S T!||KV=R=RSr)   r   c                     | j                  | j                  |            }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S rN   )r   r   r   r   r   )rJ   r   r   layer_outputs       r'   rO   z!MobileViTTransformerLayer.forwardW  s\    >>$*?*?*NO(=8++M:((6{{<?r)   ru   rW   s   @r'   r   r   N  sF    T TS TUX T]a TU\\ ell r)   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerr,   rx   rk   r!   Nc           	          t         |           t        j                         | _        t        |      D ]A  }t        ||t        ||j                  z              }| j                  j                  |       C y )N)rx   r   )
r?   r@   r   rm   rn   ro   r   r%   	mlp_ratiorp   )rJ   r,   rx   rk   _transformer_layerrK   s         r'   r@   zMobileViTTransformer.__init__b  sh    ]]_
z" 	1A 9'"%kF4D4D&D"E!
 JJ/0	1r)   r   c                 8    | j                   D ]
  } ||      } |S rN   rs   )rJ   r   rt   s      r'   rO   zMobileViTTransformer.forwardn  s%     JJ 	8L(7M	8r)   ru   rW   s   @r'   r   r   a  s@    
1 
1S 
1c 
1VZ 
1U\\ ell r)   r   c                        e Zd ZdZ	 ddededededededed	d
f fdZdej                  d	e	ej                  e
f   fdZdej                  de
d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTLayerz;
    MobileViT block: https://arxiv.org/abs/2110.02178
    r,   r-   r.   r0   rx   rk   r3   r!   Nc                    t         |           |j                  | _        |j                  | _        |dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                        | _	        t        |||ddd      | _
        t        |||      | _        t        j                  ||j                        | _        t        |||d      | _        t        |d|z  ||j                        | _        y )	Nr#   r   )r-   r.   r0   r3   r\   F)r-   r.   r/   r4   r5   )rx   rk   r   )r?   r@   
patch_sizepatch_widthpatch_heightrY   downsampling_layerr+   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rJ   r,   r-   r.   r0   rx   rk   r3   rK   s	           r'   r@   zMobileViTLayer.__init__y  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 kv7L7LM1+ST 
 )KkW]WnWn
r)   rL   c                 |   | j                   | j                  }}t        ||z        }|j                  \  }}}}t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }	t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }
d}|
|k7  s|	|k7  r't        j                  j                  ||	|
fdd      }d}|
|z  }|	|z  }||z  }|j                  ||z  |z  |||      }|j                  dd      }|j                  ||||      }|j                  dd      }|j                  ||z  |d      }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r#   r   r   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r%   shaperT   jit
is_tracingr   ceilr   r   r   r   reshaper   )rJ   rL   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r'   	unfoldingzMobileViTLayer.unfolding  s   $($4$4d6G6G\|34
8@5
Hk: yy##% ejj|!;<|KLTYY{\9:\IJ 	 yy##% ejjk!9:[HITYYzK78;FG 	 
"jK&?}}00
I6ZW\ 1 H K ${2%5&8 ""!$44lOU`
 ##Aq)//*hZP##Aq)//*z"9;K &z2$ &&!0"2
	 	!!r)   r   r   c                    | j                   | j                  }}t        ||z        }|d   }|d   }|d   }|d   }	|d   }
|j                         j	                  |||d      }|j                  dd      }|j                  ||z  |	z  |
||      }|j                  dd	      }|j                  |||	|z  |
|z        }|d
   r&t        j                  j                  ||d   dd      }|S )Nr   r   r   r   r   r   r   r   r#   r   r   r   Fr   )
r   r   r%   r   r   r   r   r   r   r   )rJ   r   r   r   r   r   r   r   r   r   r   rL   s               r'   foldingzMobileViTLayer.folding  s&   $($4$4d6G6G\|34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44o|U`
 %%a+##"2\"A?U`C`
 ]#}}00y5JV[ 1 H r)   c                    | j                   r| j                  |      }|}| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }| j                  t        j                  ||fd            }|S Nr   r   )r   r   r   r   r   r   r  r   r   rT   cat)rJ   rL   rf   r   r   s        r'   rO   zMobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy(H)=1EFr)   rg   )rP   rQ   rR   rh   r   r%   r@   rT   rU   r   r   r   r  rO   rV   rW   s   @r'   r   r   t  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
t1"%,, 1"5t9K3L 1"fu||   :  r)   r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTEncoderr,   r!   Nc           	         t         
|           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        ||j                  d   |j                  d   dd      }| j
                  j                  |       t        ||j                  d   |j                  d   dd	      }| j
                  j                  |       t        ||j                  d   |j                  d	   d|j                  d   d
      }| j
                  j                  |       |r|dz  }t        ||j                  d	   |j                  d   d|j                  d   d|      }| j
                  j                  |       |r|dz  }t        ||j                  d   |j                  d   d|j                  d   d	|      }	| j
                  j                  |	       y )NFr   T   r   r   )r-   r.   r0   rk   r#   r   )r-   r.   r0   rx   rk      )r-   r.   r0   rx   rk   r3      )r?   r@   r,   r   rm   rn   gradient_checkpointingoutput_striderj   neck_hidden_sizesrp   r   hidden_sizes)rJ   r,   dilate_layer_4dilate_layer_5r3   layer_1layer_2layer_3layer_4layer_5rK   s             r'   r@   zMobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r)   r   output_hidden_statesreturn_dictc                    |rdnd }t        | j                        D ]K  \  }}| j                  r)| j                  r| j	                  |j
                  |      }n ||      }|sF||fz   }M |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wrN   r  ).0vs     r'   	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>}  s     Xq!-Xs   )last_hidden_stater   )	enumeratern   r  training_gradient_checkpointing_func__call__tupler   )rJ   r   r  r  all_hidden_statesrq   rt   s          r'   rO   zMobileViTEncoder.forwardh  s     #7BD(4 
	IOA|**t}} $ A A ))!!
 !-] ;#$58H$H!
	I X]4E$FXXX-]noor)   )FT)rP   rQ   rR   r   r@   rT   rU   rS   r   r$  r   rO   rV   rW   s   @r'   r  r    sa    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5pr)   r  c                   ~    e Zd ZdZeZdZdZdZdgZ	de
ej                  ej                  ej                  f   ddfd	Zy)
MobileViTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    	mobilevitpixel_valuesTr   moduler!   Nc                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)rF   r   r}   rB   weightdatanormal_r,   initializer_ranger2   zero_r   fill_)rJ   r*  s     r'   _init_weightsz&MobileViTPreTrainedModel._init_weights  s    fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r)   )rP   rQ   rR   rh   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r}   rB   r   r4  r  r)   r'   r'  r'    sT    
 #L#$O&*#)*
*E"))RYY*L$M 
*RV 
*r)   r'  aK  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zWThe bare MobileViT model outputting raw hidden-states without any specific head on top.c                        e Zd Zddedef fdZd Z ee       e	e
eede      	 	 	 ddeej                      dee   d	ee   d
eeef   fd              Z xZS )MobileViTModelr,   expand_outputc                 L   t         |   |       || _        || _        t	        ||j
                  |j                  d   dd      | _        t        |      | _	        | j                  r.t	        ||j                  d   |j                  d   d      | _
        | j                          y )	Nr   r   r#   )r-   r.   r/   r0   r     r   r\   )r?   r@   r,   r<  r+   num_channelsr  	conv_stemr  encoderconv_1x1_exp	post_init)rJ   r,   r<  rK   s      r'   r@   zMobileViTModel.__init__  s     *+++11!4
 (/ 2"44Q7#55a8	!D 	r)   c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsrA  rn   rF   r   r   r   r   )rJ   heads_to_prunelayer_indexr   mobilevit_layerr   s         r'   _prune_headszMobileViTModel._prune_heads  ss     #1"6"6"8 	CK"ll00=O/>:)8)D)D)J)J C%%//;;EBC	Cr)   vision)
checkpointoutput_typer5  modalityexpected_outputr)  r  r  r!   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r/| j                  |d         }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr  r  r   r   r   F)r   keepdimr   )r  pooler_outputr   )r,   r  use_return_dictrA   r@  rA  r<  rB  rT   r,  r   r   )	rJ   r)  r  r  embedding_outputencoder_outputsr  pooled_outputr   s	            r'   rO   zMobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r)   )T)NNN)rP   rQ   rR   r   rS   r@   rI  r   MOBILEVIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rT   rU   r   r$  rO   rV   rW   s   @r'   r;  r;    s    
 t 4C ++EF&<$. 04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 G'
r)   r;  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                        e Zd Zdeddf fdZ ee       eee	e
e      	 	 	 	 ddeej                     dee   deej                     d	ee   deee	f   f
d
              Z xZS )MobileViTForImageClassificationr,   r!   Nc                 |   t         |   |       |j                  | _        t        |      | _        t        j                  |j                  d      | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _        | j                          y )NT)inplacer   r   )r?   r@   
num_labelsr;  r(  r   r   classifier_dropout_probr   r}   r  Identity
classifierrC  rJ   r,   rK   s     r'   r@   z(MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r)   )rK  rL  r5  rN  r)  r  labelsr  c                 6   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  | j                  |            }d}|| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }	| j                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j                  dk(  r=t               }	 |	|j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt!               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t#        |||j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrP  r   
regressionsingle_label_classificationmulti_label_classificationr   r#   )losslogitsr   )r,   rS  r(  rR  rb  r   problem_typer_  dtyperT   longr%   r   squeezer
   r   r	   r   r   )rJ   r)  r  rd  r  outputsrV  rj  ri  loss_fctr   s              r'   rO   z'MobileViTForImageClassification.forward%  s   ( &1%<k$++B]B]..DXfq.r1<--'!*m!<={{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r)   NNNN)rP   rQ   rR   r   r@   r   rW  r   _IMAGE_CLASS_CHECKPOINTr   rY  _IMAGE_CLASS_EXPECTED_OUTPUTr   rT   rU   rS   r   r$  rO   rV   rW   s   @r'   r\  r\    s     4  ++EF*8$4	 04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 G4
r)   r\  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTASPPPoolingr,   r-   r.   r!   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )output_sizeTrelu)r-   r.   r/   r0   r4   r5   )r?   r@   r   AdaptiveAvgPool2dglobal_poolr+   r   )rJ   r,   r-   r.   rK   s       r'   r@   zMobileViTASPPPooling.__init__d  sB    //A>*#%"!
r)   rL   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr   r   Fr   )r   rz  r   r   r   r   )rJ   rL   spatial_sizes      r'   rO   zMobileViTASPPPooling.forwards  sS    ~~bc*##H-==*==,,XLzin,or)   ru   rW   s   @r'   ru  ru  c  sA    
 
S 
PS 
X\ 
  r)   ru  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r,   r!   Nc                 ~   t         |           |j                  d   }|j                  }t	        |j
                        dk7  rt        d      t        j                         | _	        t        |||dd      }| j                  j                  |       | j                  j                  |j
                  D cg c]  }t        |||d|d       c}       t        |||      }| j                  j                  |       t        |d|z  |dd      | _        t        j                  |j                   	      | _        y c c}w )
Nr   r   z"Expected 3 values for atrous_ratesr   rx  r]   )r-   r.   r/   r3   r5   r  )p)r?   r@   r  aspp_out_channelsr   atrous_ratesrA   r   rm   convsr+   rp   extendru  projectr   aspp_dropout_probr   )rJ   r,   r-   r.   in_projectionrate
pool_layerrK   s          r'   r@   zMobileViTASPP.__init__  s(   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
  # +!- !!#)
	
 *&+|L


*%)L 0|YZkq
 zzF$<$<=)
s   5D:rL   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S r  )r  rp   rT   r  r  r   )rJ   rL   pyramidconvpooled_featuress        r'   rO   zMobileViTASPP.forward  s\    JJ 	+DNN4>*	+))G+,,w/,,7r)   
rP   rQ   rR   rh   r   r@   rT   rU   rO   rV   rW   s   @r'   r~  r~  {  s7    )> )>4 )>V  r)   r~  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTDeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r,   r!   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r-   r.   r/   r4   r5   r2   )r?   r@   r~  asppr   	Dropout2dr`  r   r+   r  r_  rb  rc  s     r'   r@   zMobileViTDeepLabV3.__init__  s]    !&)	||F$B$BC,00**# 
r)   r   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )r  r   rb  )rJ   r   rL   s      r'   rO   zMobileViTDeepLabV3.forward  s6    99]2./<<)??8,r)   r  rW   s   @r'   r  r    s6    
 
4 
 U\\ ell r)   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZ ee       eee	      	 	 	 	 dde
ej                     de
ej                     de
e   d	e
e   deeef   f
d
              Z xZS ) MobileViTForSemanticSegmentationr,   r!   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r<  )r?   r@   r_  r;  r(  r  segmentation_headrC  rc  s     r'   r@   z)MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r)   )rL  r5  r)  rd  r  r  c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrP  r   r   Fr   )ignore_indexr#   )ri  rj  r   
attentions)r,   r  rS  r_  rA   r(  r   r  r   r   r   r   r
   semantic_loss_ignore_indexr   )rJ   r)  rd  r  r  ro  encoder_hidden_statesrj  ri  upsampled_logitsrp  r   s               r'   rO   z(MobileViTForSemanticSegmentation.forward  sq   N %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r)   rq  )rP   rQ   rR   r   r@   r   rW  r   r   rY  r   rT   rU   rS   r   r$  rO   rV   rW   s   @r'   r  r    s     4  ++EF+BQ`a 04)-/3&*K
u||,K
 &K
 'tn	K

 d^K
 
u--	.K
 b GK
r)   r  )r   N)Drh   r   typingr   r   r   r   r   rT   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   configuration_mobilevitr   
get_loggerrP   loggerrY  rX  rZ  rr  rs  r%   r(   Moduler+   rY   rj   rw   r   r   r   r   r   r   r   r  r'  MOBILEVIT_START_DOCSTRINGrW  r;  r\  ru  r~  r  r  r  r)   r'   <module>r     sE  "   4 4    A A !  . Q  5 
		H	% $ . '  2 1 #  HSM UX = =@-F		 -F`bii .0RYY 0f	")) 	   >BII 
bii 
		 &299 &fRYY fRbpryy bpJ* *2	 
  ]T
- T
	T
n  K
&> K
K
\299 08BII 8v 8  	X
'? X
X
r)   