
    sg=              	          d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ  ej:                  e      ZdZ dZ!g dZ"dZ#dZ$d@de%de%dee%   de%fdZ& e'd       e'd      fde'de'de'de'fdZ( G d dejR                        Z* G d dejR                        Z+ G d d ejR                        Z, G d! d"ejR                        Z- G d# d$ejR                        Z. G d% d&ejR                        Z/ G d' d(ejR                        Z0 G d) d*ejR                        Z1 G d+ d,ejR                        Z2 G d- d.e      Z3d/Z4d0Z5 ed1e4       G d2 d3e3             Z6 ed4e4       G d5 d6e3             Z7 G d7 d8ejR                        Z8 G d9 d:ejR                        Z9 G d; d<ejR                        Z: ed=e4       G d> d?e3             Z;y)AzPyTorch MobileViTV2 model.    )OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )MobileViTV2Configr   z$apple/mobilevitv2-1.0-imagenet1k-256)r         r   ztabby, tabby catvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r   	new_values       g/var/www/html/venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler$   <   sS     	Is57Q;#677BWLMI3;W	y>    z-infinfmin_valmax_valc                 .    t        |t        ||             S N)r    minr   r'   r(   s      r#   clipr-   K   s    wGU+,,r%   c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTV2ConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r1   r2   r3   r4   paddingr7   r5   r6   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r!   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r<   	__class__s               r#   rD   zMobileViTV2ConvLayer.__init__Q   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr%   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S r*   )rG   rI   rL   )rN   rP   s     r#   forwardzMobileViTV2ConvLayer.forward   sK    ##H-)))(3H??&x0Hr%   )r   r   Fr   TT)__name__
__module____qualname__r   r!   boolr   rK   rD   torchTensorrR   __classcell__rO   s   @r#   r/   r/   P   s     "&+/4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r%   r/   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTV2InvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r0   r1   r2   r4   r7   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )Nr   )r   r   zInvalid stride .r   )r1   r2   r3   r
   )r1   r2   r3   r4   r5   r7   Fr1   r2   r3   r9   )rC   rD   r$   r!   roundexpand_ratiorE   use_residualr/   
expand_1x1conv_3x3
reduce_1x1)rN   r0   r1   r2   r4   r7   expanded_channelsrO   s          r#   rD   z$MobileViTV2InvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J.:KYZ
 -)*$
 /)% 
r%   rP   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S r*   )rc   rd   re   rb   )rN   rP   residuals      r#   rR   z#MobileViTV2InvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr%   )r   rS   rT   rU   __doc__r   r!   rD   rW   rX   rR   rY   rZ   s   @r#   r\   r\      sc    
 lm
'
69
IL
VY
eh
	
BF F Fr%   r\   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTV2MobileNetLayerr0   r1   r2   r4   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r1   r2   r4   )rC   rD   r   
ModuleListlayerranger\   append)	rN   r0   r1   r2   r4   rm   irp   rO   s	           r#   rD   z"MobileViTV2MobileNetLayer.__init__   sh     	]]_
z" 	'A/')!"avQ	E JJe$&K	'r%   rP   c                 8    | j                   D ]
  } ||      } |S r*   rp   )rN   rP   layer_modules      r#   rR   z!MobileViTV2MobileNetLayer.forward   s$     JJ 	.L#H-H	.r%   )r   r   
rS   rT   rU   r   r!   rD   rW   rX   rR   rY   rZ   s   @r#   rl   rl      sV    qr'''69'IL'VY'kn'	'   r%   rl   c                   h     e Zd ZdZdededdf fdZdej                  dej                  fdZ	 xZ
S )	MobileViTV2LinearSelfAttentionaq  
    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
    https://arxiv.org/abs/2206.02680

    Args:
        config (`MobileVitv2Config`):
             Model configuration object
        embed_dim (`int`):
            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
    r0   	embed_dimr   Nc           	          t         |           t        ||dd|z  z   dddd      | _        t	        j
                  |j                        | _        t        |||dddd      | _        || _        y )Nr   r   TF)r0   r1   r2   r6   r3   r8   r9   p)	rC   rD   r/   qkv_projr   Dropoutattn_dropoutout_projrz   )rN   r0   rz   rO   s      r#   rD   z'MobileViTV2LinearSelfAttention.__init__   s{    ,!a)m,# 
 JJ)<)<=,!"# 
 #r%   hidden_statesc                    | j                  |      }t        j                  |d| j                  | j                  gd      \  }}}t        j                  j
                  j                  |d      }| j                  |      }||z  }t        j                  |dd      }t        j                  j
                  j                  |      |j                  |      z  }| j                  |      }|S )Nr   )split_size_or_sectionsdimr   Tr   keepdim)r~   rW   splitrz   r   
functionalsoftmaxr   sumrelu	expand_asr   )	rN   r   qkvquerykeyr   context_scorescontext_vectorouts	            r#   rR   z&MobileViTV2LinearSelfAttention.forward   s    mmM*
 "KKQX\XfXfDgmnosE ,,44U4C**>: ~->r4H hh!!&&u-0H0H0OOmmC 
r%   ri   rZ   s   @r#   ry   ry      s>    	#0 #S #T #2U\\ ell r%   ry   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2FFNr0   rz   ffn_latent_dimffn_dropoutr   Nc           
          t         |           t        |||ddddd      | _        t	        j
                  |      | _        t        |||ddddd      | _        t	        j
                  |      | _        y )Nr   TF)r0   r1   r2   r3   r4   r6   r8   r9   )	rC   rD   r/   conv1r   r   dropout1conv2dropout2)rN   r0   rz   r   r   rO   s        r#   rD   zMobileViTV2FFN.__init__  s|     	)!'#	

 

;/)&"# 	

 

;/r%   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r*   )r   r   r   r   )rN   r   s     r#   rR   zMobileViTV2FFN.forward9  s@    

=1m4

=1m4r%           rS   rT   rU   r   r!   floatrD   rW   rX   rR   rY   rZ   s   @r#   r   r     sY     !0!0 0 	0
 0 
0@U\\ ell r%   r   c                   p     e Zd Z	 d
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )MobileViTV2TransformerLayerr0   rz   r   dropoutr   Nc                 P   t         |           t        j                  d||j                        | _        t        ||      | _        t        j                  |      | _	        t        j                  d||j                        | _
        t        ||||j                        | _        y )Nr   
num_groupsnum_channelsr?   r|   )rC   rD   r   	GroupNormlayer_norm_epslayernorm_beforery   	attentionr   r   layernorm_afterr   r   ffn)rN   r0   rz   r   r   rO   s        r#   rD   z$MobileViTV2TransformerLayer.__init__B  s~     	 "	W]WlWl m7	J

W-!||qyV\VkVkl!&)^VEWEWXr%   r   c                     | j                  |      }| j                  |      }||z   }| j                  |      }| j                  |      }||z   }|S r*   )r   r   r   r   )rN   r   layernorm_1_outattention_outputlayer_outputs        r#   rR   z#MobileViTV2TransformerLayer.forwardP  sY    //>>>/:(=8++M:xx-#m3r%   r   r   rZ   s   @r#   r   r   A  s^     Y!Y Y 	Y
 Y 
Y	U\\ 	ell 	r%   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2Transformerr0   n_layersd_modelr   Nc                 <   t         	|           |j                  }||z  g|z  }|D cg c]  }t        |dz  dz         }}t	        j
                         | _        t        |      D ].  }t        ||||         }| j                  j                  |       0 y c c}w )N   )rz   r   )
rC   rD   ffn_multiplierr!   r   ro   rp   rq   r   rr   )
rN   r0   r   r   r   ffn_dimsd	block_idxtransformer_layerrO   s
            r#   rD   zMobileViTV2Transformer.__init__]  s    .."W,-8 2::ACbB'::]]_
x 	1I ;'(9:M! JJ/0		1 ;s   Br   c                 8    | j                   D ]
  } ||      } |S r*   ru   )rN   r   rv   s      r#   rR   zMobileViTV2Transformer.forwardn  s%     JJ 	8L(7M	8r%   rw   rZ   s   @r#   r   r   \  sA    10 1C 1# 1RV 1"U\\ ell r%   r   c                       e Zd ZdZ	 	 	 ddededededededed	d
f fdZdej                  d	e	ej                  e	eef   f   fdZ
dej                  de	eef   d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTV2Layerz=
    MobileViTV2 layer: https://arxiv.org/abs/2206.02680
    r0   r1   r2   attn_unit_dimn_attn_blocksr7   r4   r   Nc                    t         	|           |j                  | _        |j                  | _        |}|dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                  |      | _	        t        |||ddd      | _
        t        |||      | _        t        j                  d||j                        | _        t        |||dd	d      | _        y )
Nr   r   )r1   r2   r4   r7   )r1   r2   r3   r5   F)r1   r2   r3   r8   r9   )r   r   r   T)rC   rD   
patch_sizepatch_widthpatch_heightr\   downsampling_layerr/   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rN   r0   r1   r2   r   r   r7   r4   cnn_out_dimrO   s
            r#   rD   zMobileViTV2Layer.__init__y  s    	!,,"--#Q;&A')!)QvA*2Q,QA'D# 'K&*D# -#$//
 -#$# 
 2&-Zgh TZTiTij  4#$"  
r%   feature_mapc                 "   |j                   \  }}}}t        j                  j                  || j                  | j
                  f| j                  | j
                  f      }|j                  ||| j                  | j
                  z  d      }|||ffS )N)r3   r4   r   )shaper   r   unfoldr   r   reshape)rN   r   
batch_sizer1   
img_height	img_widthpatchess          r#   	unfoldingzMobileViTV2Layer.unfolding  s    9D9J9J6
KY--&&**D,<,<=%%t'7'78 ' 

 //*k4;L;LtO_O_;_acdY///r%   r   output_sizec                     |j                   \  }}}}|j                  |||z  |      }t        j                  j	                  ||| j
                  | j                  f| j
                  | j                  f      }|S )N)r   r3   r4   )r   r   r   r   foldr   r   )rN   r   r   r   in_dimr   	n_patchesr   s           r#   foldingzMobileViTV2Layer.folding  sz    4;MM1
FJ	//*fz.A9Mmm((#**D,<,<=%%t'7'78	 ) 
 r%   rP   c                 6   | j                   r| j                  |      }| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }|S r*   )r   r   r   r   r   r   r   r   )rN   rP   r   r   s       r#   rR   zMobileViTV2Layer.forward  s    ""..x8H ==*==*  $~~h7 ""7+..) <<5''1r%   )r   r   r   )rS   rT   rU   rj   r   r!   rD   rW   rX   r   r   r   rR   rY   rZ   s   @r#   r   r   t  s     ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
z	0U\\ 	0eELL%PSUXPX/<Y6Z 	0u|| %S/ ell   r%   r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTV2Encoderr0   r   Nc           	         t         |           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        t        d|j                  z  dd      dd	      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }t        d|j                  z  d
      }	t        d|j                  z  d
      }
t        |||dd      }| j
                  j                  |       t        |||dd      }| j
                  j                  |       t        |||t        |j                  d   |j                  z  d
      |j                  d         }| j
                  j                  |       |r|dz  }t        |||	t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       |r|dz  }t        ||	|
t        |j                  d   |j                  z  d
      |j                  d   |      }| j
                  j                  |       y )NFr   Tr   r       @   r,   r   r   r         i  r   )r1   r2   r4   rm   r   r   )r1   r2   r   r   )r1   r2   r   r   r7   )rC   rD   r0   r   ro   rp   gradient_checkpointingoutput_strider$   r-   width_multiplierrl   rr   r   base_attn_unit_dimsr   )rN   r0   dilate_layer_4dilate_layer_5r7   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rO   s                   r#   rD   zMobileViTV2Encoder.__init__  s|   ]]_
&+# +0/1$!N!N!!R'!N$rF333RLVWce
 %R&*A*A%A2N$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN+#$
 	

'"+#$
 	

'""#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"r%   r   output_hidden_statesreturn_dictc                    |rdnd }t        | j                        D ]K  \  }}| j                  r)| j                  r| j	                  |j
                  |      }n ||      }|sF||fz   }M |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wr*   r   ).0vs     r#   	<genexpr>z-MobileViTV2Encoder.forward.<locals>.<genexpr>M  s     Xq!-Xs   )last_hidden_stater   )	enumeraterp   r   training_gradient_checkpointing_func__call__tupler   )rN   r   r   r   all_hidden_statesrs   rv   s          r#   rR   zMobileViTV2Encoder.forward8  s     #7BD(4 
	IOA|**t}} $ A A ))!!
 !-] ;#$58H$H!
	I X]4E$FXXX-]noor%   )FT)rS   rT   rU   r   rD   rW   rX   rV   r   r	  r   rR   rY   rZ   s   @r#   r   r     sb    O#0 O#T O#h &+ 	p||p #p 	p
 
u44	5pr%   r   c                   ~    e Zd ZdZeZdZdZdZdgZ	de
ej                  ej                  ej                  f   ddfd	Zy)
MobileViTV2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    mobilevitv2pixel_valuesTr   moduler   Nc                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsr   )meanstdNg      ?)rJ   r   LinearrF   weightdatanormal_r0   initializer_ranger6   zero_	LayerNormfill_)rN   r  s     r#   _init_weightsz(MobileViTV2PreTrainedModel._init_weights_  s    fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r%   )rS   rT   rU   rj   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r  rF   r  r  r   r%   r#   r  r  S  sT    
 %L%$O&*#+,
*E"))RYY*L$M 
*RV 
*r%   r  aM  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MobileViTV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aF  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`MobileViTImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zYThe bare MobileViTV2 model outputting raw hidden-states without any specific head on top.c                        e Zd Zddedef fdZd Z ee       e	e
eede      	 	 	 ddeej                      dee   d	ee   d
eeef   fd              Z xZS )MobileViTV2Modelr0   expand_outputc           	         t         |   |       || _        || _        t	        t        d|j                  z  dd      dd      }t        ||j                  |ddd	d	
      | _	        t        |      | _        | j                          y )Nr   r   r   r,   r   r   r
   r   Tr1   r2   r3   r4   r8   r9   )rC   rD   r0   r#  r$   r-   r   r/   r   	conv_stemr   encoder	post_init)rN   r0   r#  r   rO   s       r#   rD   zMobileViTV2Model.__init__  s     *$rF333RLVWce
 .++$"
 *&1 	r%   c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr'  rp   rJ   r   r   r   prune_heads)rN   heads_to_prunelayer_indexheadsmobilevitv2_layerr   s         r#   _prune_headszMobileViTV2Model._prune_heads  sv     #1"6"6"8 	CK $ 2 2; ?+-=>):)F)F)L)L C%%//;;EBC	Cr%   vision)
checkpointoutput_typer  modalityexpected_outputr  r   r   r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r |d   }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr   r   r   r   Fr   r   )r  pooler_outputr   )r0   r   use_return_dictrE   r&  r'  r#  rW   r  r   r   )	rN   r  r   r   embedding_outputencoder_outputsr  pooled_outputoutputs	            r#   rR   zMobileViTV2Model.forward  s     %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  / 2 "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r%   )T)NNN)rS   rT   rU   r   rV   rD   r0  r   MOBILEVITV2_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rW   rX   r   r	  rR   rY   rZ   s   @r#   r"  r"    s    
0  .C ++GH&<$. 04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 I'
r%   r"  z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    c                        e Zd Zdeddf fdZ ee       eee	e
e      	 	 	 	 ddeej                     dee   deej                     d	ee   deee	f   f
d
              Z xZS )!MobileViTV2ForImageClassificationr0   r   Nc                 L   t         |   |       |j                  | _        t        |      | _        t        d|j                  z  d      }|j                  dkD  r!t        j                  ||j                        nt        j                         | _
        | j                          y )Nr   r   r   r   )in_featuresout_features)rC   rD   
num_labelsr"  r  r$   r   r   r  Identity
classifierr(  )rN   r0   r2   rO   s      r#   rD   z*MobileViTV2ForImageClassification.__init__  s      +++F3%cF,C,C&CQO   1$ II,V=N=NO 	 	r%   )r2  r3  r  r5  r  r   labelsr   c                    ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }	| j                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j
                  dk(  r=t               }	 |	|j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t!        |||j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr7  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r0   r:  r  r9  rJ  problem_typerH  dtyperW   longr!   r	   squeezer   viewr   r   r   )rN   r  r   rK  r   outputsr=  rQ  rP  loss_fctr>  s              r#   rR   z)MobileViTV2ForImageClassification.forward  s   ( &1%<k$++B]B]""<FZhs"t1<--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r%   NNNN)rS   rT   rU   r   rD   r   r?  r   _IMAGE_CLASS_CHECKPOINTr   rA  _IMAGE_CLASS_EXPECTED_OUTPUTr   rW   rX   rV   r   r	  rR   rY   rZ   s   @r#   rD  rD    s    0 T " ++GH*8$4	 04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 I4
r%   rD  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTV2ASPPPoolingr0   r1   r2   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )r   Tr   r%  )rC   rD   r   AdaptiveAvgPool2dglobal_poolr/   r   )rN   r0   r1   r2   rO   s       r#   rD   zMobileViTV2ASPPPooling.__init__5  sB    //A>,#%"!
r%   rP   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr8  bilinearFsizemodealign_corners)r   r`  r   r   r   interpolate)rN   rP   spatial_sizes      r#   rR   zMobileViTV2ASPPPooling.forwardD  sS    ~~bc*##H-==*==,,XLzin,or%   rw   rZ   s   @r#   r]  r]  4  sB    
0 
s 
RU 
Z^ 
  r%   r]  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2ASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r0   r   Nc                    t         |           t        d|j                  z  d      }|}|j                  }t        |j                        dk7  rt        d      t        j                         | _
        t        |||dd      }| j                  j                  |       | j                  j                  |j                  D cg c]  }t        |||d|d	       c}       t        |||      }| j                  j                  |       t        |d
|z  |dd      | _        t        j                   |j"                        | _        y c c}w )Nr   r   r   r
   z"Expected 3 values for atrous_ratesr   r   r_   )r1   r2   r3   r7   r9      r|   )rC   rD   r$   r   aspp_out_channelslenatrous_ratesrE   r   ro   convsr/   rr   extendr]  projectr   aspp_dropout_probr   )	rN   r0   encoder_out_channelsr1   r2   in_projectionrate
pool_layerrO   s	           r#   rD   zMobileViTV2ASPP.__init__Q  s6   -cF4K4K.KUVW*//v""#q(ABB]]_
,#%!
 	

-(

 #//
  % +!- !!#)
	
 ,FKN


*%+L 0|YZkq
 zzF$<$<=)
s   ErP   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S )Nr   r   )rp  rr   rW   catrr  r   )rN   rP   pyramidconvpooled_featuress        r#   rR   zMobileViTV2ASPP.forward}  s\    JJ 	+DNN4>*	+))G+,,w/,,7r%   
rS   rT   rU   rj   r   rD   rW   rX   rR   rY   rZ   s   @r#   rj  rj  L  s8    *>0 *>T *>X  r%   rj  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTV2DeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r0   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r1   r2   r3   r8   r9   r6   )rC   rD   rj  asppr   	Dropout2dclassifier_dropout_probr   r/   rm  rH  rJ  rN   r0   rO   s     r#   rD   zMobileViTV2DeepLabV3.__init__  s]    #F+	||F$B$BC.00**# 
r%   r   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )r  r   rJ  )rN   r   rP   s      r#   rR   zMobileViTV2DeepLabV3.forward  s6    99]2./<<)??8,r%   r}  rZ   s   @r#   r  r    s7    
0 
T 
 U\\ ell r%   r  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZ ee       eee	      	 	 	 	 dde
ej                     de
ej                     de
e   d	e
e   deeef   f
d
              Z xZS )"MobileViTV2ForSemanticSegmentationr0   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r#  )rC   rD   rH  r"  r  r  segmentation_headr(  r  s     r#   rD   z+MobileViTV2ForSemanticSegmentation.__init__  sE      +++F%H!5f!= 	r%   )r3  r  r  rK  r   r   c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr7  r8  rb  Frc  )ignore_indexr   )rP  rQ  r   
attentions)r0   r   r:  rH  rE   r  r   r  r   r   rg  r   r   semantic_loss_ignore_indexr   )rN   r  rK  r   r   rW  encoder_hidden_statesrQ  rP  upsampled_logitsrX  r>  s               r#   rR   z*MobileViTV2ForSemanticSegmentation.forward  ss   N %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO""!%# # 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r%   rY  )rS   rT   rU   r   rD   r   r?  r   r   rA  r   rW   rX   rV   r   r	  rR   rY   rZ   s   @r#   r  r    s    0 T  ++GH+BQ`a 04)-/3&*K
u||,K
 &K
 'tn	K

 d^K
 
u--	.K
 b IK
r%   r  )r   N)<rj   typingr   r   r   rW   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_mobilevitv2r   
get_loggerrS   loggerrA  r@  rB  rZ  r[  r!   r$   r   r-   Moduler/   r\   rl   ry   r   r   r   r   r   r  MOBILEVITV2_START_DOCSTRINGr?  r"  rD  r]  rj  r  r  r   r%   r#   <module>r     sB  " ! ) )    A A !  .  9 
		H	% & = '  A 1 #  HSM UX  ).fe - - - -Y^ -
=299 =B-F")) -Fb		 .<RYY <~&RYY &R")) 6RYY 0oryy odip ipZ* *2	 
   _Q
1 Q
	Q
h   M
(B M
M
bRYY 09bii 9z299 8   	X
)C X
X
r%   