
"""PyTorch ViTDet backbone."""

import collections.abc
import math
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "VitDetConfig"

class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        if config.use_absolute_position_embeddings:
            # Initialize absolute positional embedding with pretrain image size.
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]
        num_position = abs_pos_embeddings.shape[1]
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        if torch.jit.is_tracing() or (size != height or size != width):
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )
            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values)

        if self.position_embeddings is not None:
            # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            # add position embeddings
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        return embeddings
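
# A rough usage sketch for the embeddings module (illustrative only, assuming the
# default `VitDetConfig` values; nothing in the library executes this):
#
#     config = VitDetConfig()                    # patch_size=16, hidden_size=768
#     embeddings = VitDetEmbeddings(config)
#     pixel_values = torch.randn(1, 3, 224, 224)
#     out = embeddings(pixel_values)             # -> (1, 768, 14, 14), since 224 // 16 = 14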

@torch.jit.script_if_tracing
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    if rel_pos.shape[0] != max_rel_dist:
        # Interpolate rel position embeddings.
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]
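
# Shape sketch for `get_rel_pos` (assumed toy sizes; illustrative only):
#
#     rel_pos = torch.zeros(2 * 14 - 1, 64)      # (2 * q_size - 1, head_dim)
#     out = get_rel_pos(14, 14, rel_pos)         # -> (14, 14, 64)
#
# Entry (i, j) holds the embedding for relative offset i - j; no interpolation is
# needed when rel_pos already has 2 * max(q_size, k_size) - 1 rows.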

def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`Tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`Tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    queries_height, queries_width = q_size
    keys_height, keys_width = k_size
    relative_height = get_rel_pos(queries_height, keys_height, rel_pos_h)
    relative_width = get_rel_pos(queries_width, keys_width, rel_pos_w)

    batch_size, _, dim = queries.shape
    r_q = queries.reshape(batch_size, queries_height, queries_width, dim)
    relative_height = torch.einsum("bhwc,hkc->bhwk", r_q, relative_height)
    relative_weight = torch.einsum("bhwc,wkc->bhwk", r_q, relative_width)

    attn = (
        attn.view(batch_size, queries_height, queries_width, keys_height, keys_width)
        + relative_height[:, :, :, :, None]
        + relative_weight[:, :, :, None, :]
    ).view(batch_size, queries_height * queries_width, keys_height * keys_width)

    return attn
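
# Shape sketch for the decomposed relative positions (assumed toy sizes; illustrative
# only): with a 14x14 query/key grid and head_dim 64,
#
#     attn = torch.zeros(2, 14 * 14, 14 * 14)     # (batch * heads, q_len, k_len)
#     queries = torch.randn(2, 14 * 14, 64)
#     rel_h = torch.zeros(2 * 14 - 1, 64)
#     rel_w = torch.zeros(2 * 14 - 1, 64)
#     out = add_decomposed_relative_positions(attn, queries, rel_h, rel_w, (14, 14), (14, 14))
#     # out keeps the (2, 196, 196) shape; the height and width terms are computed
#     # separately and broadcast over the opposite spatial axis before being added.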

class VitDetAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, input_size=None):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`Tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # qkv with shape (3, batch_size, num_heads, height * width, num_channels)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels)
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values
        hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)
        hidden_state = self.proj(hidden_state)

        if output_attentions:
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)
        else:
            outputs = (hidden_state,)

        return outputs
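
# Hedged usage sketch for the attention block (assumed toy config; illustrative
# only). Note that inputs are channels-last inside the transformer blocks:
#
#     config = VitDetConfig(hidden_size=768, num_attention_heads=12)
#     attention = VitDetAttention(config, input_size=(14, 14))
#     x = torch.randn(1, 14, 14, 768)
#     (out,) = attention(x)                        # -> (1, 14, 14, 768)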

def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)
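
# A small behavioral sketch (illustrative, not library code): in eval mode the
# module is the identity; in train mode each sample is zeroed with probability p
# and the survivors are rescaled by 1 / (1 - p), so the expectation is unchanged.
#
#     dp = VitDetDropPath(drop_prob=0.5)
#     x = torch.ones(8, 4)
#     dp.eval();  torch.equal(dp(x), x)            # True
#     dp.train(); dp(x)                            # each row is either all 0.0 or all 2.0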

class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x
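
# Unlike `nn.LayerNorm`, which normalizes over trailing dimensions, this variant
# normalizes over dim 1 of a (batch_size, channels, height, width) tensor. A hedged
# equivalence sketch (illustrative only):
#
#     x = torch.randn(2, 768, 14, 14)
#     ours = VitDetLayerNorm(768)(x)
#     ref = nn.LayerNorm(768, eps=1e-6)(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
#     torch.allclose(ours, ref, atol=1e-5)         # True, up to numerical noise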

class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x
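
# Shape sketch (illustrative only): the bottleneck block preserves its input shape
# as long as in_channels == out_channels, which the residual addition requires.
#
#     config = VitDetConfig()
#     block = VitDetResBottleneckBlock(config, in_channels=768, out_channels=768, bottleneck_channels=384)
#     block(torch.randn(1, 768, 14, 14)).shape     # -> (1, 768, 14, 14)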

def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    """
    batch_size, height, width, num_channels = hidden_state.shape

    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size
    hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))
    padded_height, padded_width = height + pad_height, width + pad_width

    hidden_state = hidden_state.view(
        batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels
    )
    windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows, (padded_height, padded_width)


def window_unpartition(windows, window_size, pad_height_width, height_width):
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`Tuple[int]`):
            Padded height and width (padded_height, padded_width).
        height_width (`Tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    """
    padded_height, padded_width = pad_height_width
    height, width = height_width
    batch_size = windows.shape[0] // (padded_height * padded_width // window_size // window_size)
    hidden_state = windows.view(
        batch_size, padded_height // window_size, padded_width // window_size, window_size, window_size, -1
    )
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous()
    hidden_state = hidden_state.view(batch_size, padded_height, padded_width, -1)

    hidden_state = hidden_state[:, :height, :width, :].contiguous()
    return hidden_state
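
# Round-trip sketch for the two window helpers above (assumed sizes; illustrative only):
#
#     x = torch.randn(1, 30, 30, 768)              # 30 is not a multiple of 14
#     windows, pad_hw = window_partition(x, 14)    # -> (9, 14, 14, 768), pad_hw == (42, 42)
#     y = window_unpartition(windows, 14, pad_hw, (30, 30))
#     torch.equal(x, y)                            # True: the padding is stripped again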

class VitDetLayer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(
        self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False
    ) -> None:
        super().__init__()

        dim = config.hidden_size
        input_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = VitDetAttention(
            config, input_size=input_size if window_size == 0 else (window_size, window_size)
        )

        self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if self.use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = VitDetResBottleneckBlock(
                config=config,
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        hidden_states = hidden_states.permute(0, 2, 3, 1)

        shortcut = hidden_states

        hidden_states = self.norm1(hidden_states)

        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, pad_height_width = window_partition(hidden_states, self.window_size)

        self_attention_outputs = self.attention(
            hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # Reverse window partition
        if self.window_size > 0:
            hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width))

        # first residual connection
        hidden_states = shortcut + self.drop_path(hidden_states)

        hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))

        hidden_states = hidden_states.permute(0, 3, 1, 2)

        if self.use_residual_block:
            hidden_states = self.residual(hidden_states)

        outputs = (hidden_states,) + outputs

        return outputs


class VitDetEncoder(nn.Module):
    def __init__(self, config: VitDetConfig) -> None:
        super().__init__()
        self.config = config
        depth = config.num_hidden_layers

        # stochastic depth decay rule
        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth)]

        layers = []
        for i in range(depth):
            layers.append(
                VitDetLayer(
                    config,
                    drop_path_rate=drop_path_rate[i],
                    window_size=config.window_size if i in config.window_block_indices else 0,
                    use_residual_block=i in config.residual_block_indices,
                )
            )

        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

def caffe2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    """
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    if module.bias is not None:
        nn.init.constant_(module.bias, 0)

class VitDetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = VitDetConfig
    base_model_prefix = "vitdet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to float32 for the truncated-normal init and cast back afterwards.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, VitDetEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
        elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings:
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
        elif isinstance(module, VitDetResBottleneckBlock):
            for layer in [module.conv1, module.conv2, module.conv3]:
                caffe2_msra_fill(layer)
            for layer in [module.norm1, module.norm2]:
                layer.weight.data.fill_(1.0)
                layer.bias.data.zero_()
            # zero init last norm layer.
            module.norm3.weight.data.zero_()
            module.norm3.bias.data.zero_()

VITDET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`VitDetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITDET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.",
    VITDET_START_DOCSTRING,
)
class VitDetModel(VitDetPreTrainedModel):
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

@add_start_docstrings(
    """
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    """,
    VITDET_START_DOCSTRING,
)
class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)
        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]

        # initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )