
"""PyTorch ConvNextV2 model."""

from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BackboneOutput,
    BaseModelOutputWithNoAttention,
    BaseModelOutputWithPoolingAndNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_convnextv2 import ConvNextV2Config


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "ConvNextV2Config"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/convnextv2-tiny-1k-224"
_EXPECTED_OUTPUT_SHAPE = [1, 768, 7, 7]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/convnextv2-tiny-1k-224"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of any dimensionality, not just 4D ConvNet feature maps
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class ConvNextV2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class ConvNextV2GRN(nn.Module):
    """GRN (Global Response Normalization) layer"""

    def __init__(self, dim: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        # Compute and normalize global spatial feature maps
        global_features = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True)
        norm_features = global_features / (global_features.mean(dim=-1, keepdim=True) + 1e-6)
        hidden_states = self.weight * (hidden_states * norm_features) + self.bias + hidden_states

        return hidden_states


class ConvNextV2LayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            input_dtype = x.dtype
            x = x.float()
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = x.to(dtype=input_dtype)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class ConvNextV2Embeddings(nn.Module):
    """This class is comparable to (and inspired by) the SwinEmbeddings class
    found in src/transformers/models/swin/modeling_swin.py.
    """

    def __init__(self, config):
        super().__init__()
        self.patch_embeddings = nn.Conv2d(
            config.num_channels, config.hidden_sizes[0], kernel_size=config.patch_size, stride=config.patch_size
        )
        self.layernorm = ConvNextV2LayerNorm(config.hidden_sizes[0], eps=1e-6, data_format="channels_first")
        self.num_channels = config.num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.patch_embeddings(pixel_values)
        embeddings = self.layernorm(embeddings)
        return embeddings


class ConvNextV2Layer(nn.Module):
    """This corresponds to the `Block` class in the original implementation.

    There are two equivalent implementations: (1) [DwConv, LayerNorm (channels_first), Conv, GELU, 1x1 Conv]; all in
    (N, C, H, W); (2) [DwConv, Permute to (N, H, W, C), LayerNorm (channels_last), Linear, GELU, Linear]; Permute back

    The authors used (2) as they find it slightly faster in PyTorch.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        dim (`int`): Number of input channels.
        drop_path (`float`): Stochastic depth rate. Default: 0.0.
    """

    def __init__(self, config, dim, drop_path=0.0):
        super().__init__()
        # depthwise conv
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.layernorm = ConvNextV2LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = ACT2FN[config.hidden_act]
        self.grn = ConvNextV2GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        self.drop_path = ConvNextV2DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        input = hidden_states
        x = self.dwconv(hidden_states)
        # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
        x = x.permute(0, 2, 3, 1)
        x = self.layernorm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
        x = x.permute(0, 3, 1, 2)

        x = input + self.drop_path(x)
        return x


class ConvNextV2Stage(nn.Module):
    """ConvNeXTV2 stage, consisting of an optional downsampling layer + multiple residual blocks.

    Args:
        config ([`ConvNextV2Config`]): Model configuration class.
        in_channels (`int`): Number of input channels.
        out_channels (`int`): Number of output channels.
        depth (`int`): Number of residual blocks.
        drop_path_rates (`List[float]`): Stochastic depth rates for each layer.
    """

    def __init__(self, config, in_channels, out_channels, kernel_size=2, stride=2, depth=2, drop_path_rates=None):
        super().__init__()

        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = nn.Sequential(
                ConvNextV2LayerNorm(in_channels, eps=1e-6, data_format="channels_first"),
                nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride),
            )
        else:
            self.downsampling_layer = nn.Identity()
        drop_path_rates = drop_path_rates or [0.0] * depth
        self.layers = nn.Sequential(
            *[ConvNextV2Layer(config, dim=out_channels, drop_path=drop_path_rates[j]) for j in range(depth)]
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        hidden_states = self.downsampling_layer(hidden_states)
        hidden_states = self.layers(hidden_states)
        return hidden_states


class ConvNextV2Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.stages = nn.ModuleList()
        drop_path_rates = [
            x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths)
        ]
        prev_chs = config.hidden_sizes[0]
        for i in range(config.num_stages):
            out_chs = config.hidden_sizes[i]
            stage = ConvNextV2Stage(
                config,
                in_channels=prev_chs,
                out_channels=out_chs,
                stride=2 if i > 0 else 1,
                depth=config.depths[i],
                drop_path_rates=drop_path_rates[i],
            )
            self.stages.append(stage)
            prev_chs = out_chs

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        all_hidden_states = () if output_hidden_states else None

        for i, layer_module in enumerate(self.stages):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            hidden_states = layer_module(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )


class ConvNextV2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ConvNextV2Config
    base_model_prefix = "convnextv2"
    main_input_name = "pixel_values"
    _no_split_modules = ["ConvNextV2Layer"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CONVNEXTV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ConvNextV2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CONVNEXTV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`ConvNextImageProcessor`]. See
            [`ConvNextImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare ConvNextV2 model outputting raw features without any specific head on top.",
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Model(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)

        # final layernorm layer
        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndNoAttention,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        # global average pooling, (N, C, H, W) -> (N, C)
        pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )


@add_start_docstrings(
    """
    ConvNextV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2ForImageClassification(ConvNextV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.convnextv2 = ConvNextV2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.convnextv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )


@add_start_docstrings(
    """
    ConvNeXT V2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    CONVNEXTV2_START_DOCSTRING,
)
class ConvNextV2Backbone(ConvNextV2PreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = ConvNextV2Embeddings(config)
        self.encoder = ConvNextV2Encoder(config)
        self.num_features = [config.hidden_sizes[0]] + config.hidden_sizes

        # Add layer norms to hidden states of out_features
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = ConvNextV2LayerNorm(num_channels, data_format="channels_first")
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CONVNEXTV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
        >>> model = AutoBackbone.from_pretrained("facebook/convnextv2-tiny-1k-224")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=hidden_states if output_hidden_states else None,
            attentions=None,
        )
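
# Minimal usage sketch for the classification head, assuming the `AutoImageProcessor`
# auto class from `transformers` and access to the Hugging Face Hub. The checkpoint name,
# test image URL, and expected label all mirror the doc constants above
# (`_IMAGE_CLASS_CHECKPOINT`, `_IMAGE_CLASS_EXPECTED_OUTPUT`) that the
# `add_code_sample_docstrings` decorator injects into the `forward` docstring:
#
#     import requests
#     import torch
#     from PIL import Image
#     from transformers import AutoImageProcessor, ConvNextV2ForImageClassification
#
#     processor = AutoImageProcessor.from_pretrained("facebook/convnextv2-tiny-1k-224")
#     model = ConvNextV2ForImageClassification.from_pretrained("facebook/convnextv2-tiny-1k-224")
#
#     url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#     image = Image.open(requests.get(url, stream=True).raw)
#     inputs = processor(image, return_tensors="pt")
#
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     predicted_label = logits.argmax(-1).item()
#     print(model.config.id2label[predicted_label])  # expected: "tabby, tabby cat"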