
    sg             	       ^   d Z ddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZmZ ddlmZmZmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) ddl*m+Z+  e%jX                  e-      Z.dZ/dZ0g dZ1dZ2dZ3e G d de             Z4dQdej                  de5de6dej                  fdZ7 G d dejp                        Z9 G d dejp                        Z: G d  d!ejp                        Z; G d" d#ejp                        Z< G d$ d%ejp                        Z= G d& d'ejp                        Z> G d( d)ejp                        Z? G d* d+ejp                        Z@ G d, d-ejp                        ZA G d. d/ejp                        ZB G d0 d1ejp                        ZC G d2 d3e      ZDd4ZEd5ZF e#d6eE       G d7 d8eD             ZG G d9 d:ejp                        ZH e#d;eE       G d< d=eD             ZI e#d>eE       G d? d@eD             ZJ G dA dBejp                        ZK G dC dDejp                        ZL G dE dFejp                        ZM G dG dHejp                        ZN G dI dJejp                        ZO e#dKeE       G dL dMeD             ZP e#dNeE       G dO dPeDe)             ZQy)RzPyTorch BEiT model.    N)	dataclass)ListOptionalTupleUnion)Tensornn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedLMOutputSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int)BackboneMixin   )
BeitConfigr    z%microsoft/beit-base-patch16-224-pt22k)r      i   zmicrosoft/beit-base-patch16-224ztabby, tabby catc                       e Zd ZdZy)BeitModelOutputWithPoolinga  
    Class for outputs of [`BeitModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    N)__name__
__module____qualname____doc__     Y/var/www/html/venv/lib/python3.12/site-packages/transformers/models/beit/modeling_beit.pyr#   r#   @   s    r)   r#   input	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
            r   r   )r   )dtypedevice)shapendimtorchrandr1   r2   floor_div)r+   r,   r-   	keep_probr3   random_tensoroutputs          r*   	drop_pathr<   Z   s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr)   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
BeitDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr,   r.   c                 0    t         |           || _        y N)super__init__r,   )selfr,   	__class__s     r*   rB   zBeitDropPath.__init__q   s    "r)   hidden_statesc                 D    t        || j                  | j                        S r@   )r<   r,   r-   rC   rE   s     r*   forwardzBeitDropPath.forwardu   s    FFr)   c                 8    dj                  | j                        S )Nzp={})formatr,   rC   s    r*   
extra_reprzBeitDropPath.extra_reprx   s    }}T^^,,r)   r@   )r$   r%   r&   r'   r   floatrB   r5   r   rH   strrL   __classcell__rD   s   @r*   r>   r>   n   sG    b#(5/ #T #GU\\ Gell G-C -r)   r>   c            	            e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 	 dd
ej                  de
ej                     dedej                  fdZ xZS )BeitEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    configr.   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )rA   rB   r	   	Parameterr5   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenBeitPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)rC   rS   rc   rD   s      r*   rB   zBeitEmbeddings.__init__   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO 3F ; ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r)   
embeddingsheightwidthc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicFsizemodealign_cornersdim)r3   re   r5   jit
is_tracingr]   r   reshapepermuter	   
functionalinterpolateviewcat)rC   ri   rj   rk   rc   num_positionsclass_pos_embedpatch_pos_embedru   
new_height	new_widthsqrt_num_positionss               r*   interpolate_pos_encodingz'BeitEmbeddings.interpolate_pos_encoding   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr)   pixel_valuesbool_masked_posr   c                 x   |j                   \  }}}}| j                  || j                  | j                  d d dd d d f   nd       \  }\  }}	|j                         \  }
}}|K| j                  j                  |
|d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j                  |
dd      }| j                  4|r|| j                  |||      z   }n|| j                  d d d dd d f   z   }t        j                  ||fd      }| j                  |      }|||	ffS )Nr   rm   rt   )r3   r\   re   rq   rZ   expand	unsqueezetype_asrX   r   r5   r}   rh   )rC   r   r   r   _rj   rk   ri   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                  r*   rH   zBeitEmbeddings.forward   sQ    +001fe262G2G@X@X@d$221ab!8<jn3
/
/\; ",!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
##/''$*G*G
TZ\a*bb
'$*B*B1bqb!8*LL
YY
J7Q?
\\*-
L+666r)   )NF)r$   r%   r&   r'   r    rB   r5   r   intr   r   
BoolTensorboolrH   rO   rP   s   @r*   rR   rR   ~   s    
>z >d >.&D5<< &D &DUX &D]b]i]i &DV 7;).	7ll7 "%"2"237 #'	7
 
7r)   rR   c                   |     e Zd ZdZ fdZ	 ddej                  deej                     dej                  fdZ xZ	S )r[   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _
        || _        t        j                  ||||      | _        y )Nr   r   kernel_sizestride)rA   rB   r_   r]   num_channelsrW   r^   r`   ra   rb   rc   patch_shaper	   Conv2d
projection)	rC   rS   r_   r]   r   rW   rc   r   rD   s	           r*   rB   zBeitPatchEmbeddings.__init__   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir)   r   position_embeddingr.   c                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |      }|j                   d   |j                   d   }	}|i|j	                  d| j
                  d   | j
                  d   d      j                  dddd      }t        j                  j                  |||	fd      }||z   }|j                  d      j                  dd      }|||	ffS )	NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rn   r   r   r   rm   ro   rq   rr   )r3   r   
ValueErrorr   r|   r   ry   r	   rz   r{   flatten	transpose)
rC   r   r   r   r   rj   rk   ri   r   r   s
             r*   rH   zBeitPatchEmbeddings.forward   s   
 3?2D2D/
L&%4,,,w  __\2
$.$4$4Q$79I9I!9Lk)!3!8!8D<L<LQ<OQUQaQabcQdfh!i!q!q1a" "$!:!:",)D9 "; " $&88J''*44Q:
L+666r)   r@   )
r$   r%   r&   r'   rB   r5   r   r   rH   rO   rP   s   @r*   r[   r[      sE    j( 6:7ll7 %U\\27 
	7r)   r[   c                        e Zd Zddedee   ddf fdZd Z	 	 	 	 	 ddej                  deej                     d	e
d
ed   de
deee      deeej                     eej                  ej                  f   f   fdZ xZS )BeitSelfAttentionNrS   window_sizer.   c                    t         |           || _        |j                  |j                  z  dk7  r3t        |d      s't        d|j                  f d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                  d      | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |rt%        ||      | _        y d | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )rA   rB   rS   rW   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer	   Linearquerykeyvaluerf   attention_probs_dropout_probrh   BeitRelativePositionBiasrelative_position_biasrC   rS   r   rD   s      r*   rB   zBeitSelfAttention.__init__  sC    : ::a?PVXhHi"6#5#5#6"7 8334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1C%PYYv1143E3EF
zz&"E"EF*B6Wb*cD'*.D'r)   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nrm   r   rn   r   r   )rq   r   r   r|   ry   )rC   xnew_x_shapes      r*   transpose_for_scoresz&BeitSelfAttention.transpose_for_scores1  sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$r)   rE   	head_maskoutput_attentionsr   r   r   
resolutionc                    | j                  |      }| j                  | j                  |            }| j                  | j                  |            }	| j                  |      }
t	        j
                  |
|j                  dd            }|t        j                  | j                        z  }| j                  [|\  }}|| j                  j                  z  || j                  j                  z  f}|| j                  |||j                  d         z   }|||z   }t        j                  j!                  |d      }| j#                  |      }|||z  }t	        j
                  ||	      }|j%                  dddd      j'                         }|j)                         d d | j*                  fz   } |j,                  | }|r||f}|S |f}|S )	Nrm   r   )dim_sizert   r   rn   r   )r   r   r   r   r5   matmulr   mathsqrtr   r   rS   r]   r3   r	   rz   softmaxrh   ry   
contiguousrq   r   r|   )rC   rE   r   r   r   r   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresrj   rk   r   attention_probscontext_layernew_context_layer_shapeoutputss                      r*   rH   zBeitSelfAttention.forward6  s    !JJ}5--dhh}.EF	//

=0IJ//0AB !<<Y5H5HR5PQ+dii8P8P.QQ &&2&MFE!T[[%;%;;UdkkF\F\=\]K/$2M2M5@S@STU@V 3N 3  
 "-/2HH --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r)   r@   NFNFN)r$   r%   r&   r    r   tuplerB   r   r5   r   r   r   r   r   rH   rO   rP   s   @r*   r   r     s    /z / /SW /0% -1"'GK).+/3||3 ELL)3  	3
 !))C D3 #'3 U3Z(3 
uU\\"E%,,*D$EE	F3r)   r   c                   ~     e Zd ZdZdeddf fdZd	dej                  dej                  dej                  fdZ xZ	S )
BeitSelfOutputz
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rS   r.   Nc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r@   )	rA   rB   r	   r   rW   denserf   rg   rh   rC   rS   rD   s     r*   rB   zBeitSelfOutput.__init__r  sB    YYv1163E3EF
zz&"<"<=r)   rE   input_tensorc                 J    | j                  |      }| j                  |      }|S r@   r   rh   )rC   rE   r   gammas       r*   rH   zBeitSelfOutput.forwardw  $    

=1]3r)   r@   )
r$   r%   r&   r'   r    rB   r5   r   rH   rO   rP   s   @r*   r   r   l  sD    
>z >d >
U\\  ^c^j^j r)   r   c                        e Zd Zddedee   ddf fdZd Z	 	 	 	 	 ddej                  deej                     d	e
d
ed   de
deee      deeej                     eej                  ej                  f   f   fdZ xZS )BeitAttentionNrS   r   r.   c                     t         |           t        ||      | _        t	        |      | _        t               | _        y )Nr   )rA   rB   r   	attentionr   r;   setpruned_headsr   s      r*   rB   zBeitAttention.__init__  s2    *6{K$V,Er)   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   rt   )lenr   r   r   r   r   r   r   r   r   r;   r   r   union)rC   headsindexs      r*   prune_headszBeitAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r)   rE   r   r   r   r   r   r   c                 n    | j                  ||||||      }| j                  |d   |      }|f|dd  z   }	|	S )Nr   r   )r   r;   )
rC   rE   r   r   r   r   r   self_outputsattention_outputr   s
             r*   rH   zBeitAttention.forward  sS     ~~9&79OQiku
  ;;|AF#%QR(88r)   r@   r   )r$   r%   r&   r    r   r   rB   r   r5   r   r   r   r   r   rH   rO   rP   s   @r*   r   r   ~  s    "z " "SW ";* -1"'GK).+/|| ELL)  	
 !))C D #' U3Z( 
uU\\"E%,,*D$EE	Fr)   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )BeitIntermediaterS   r.   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r@   )rA   rB   r	   r   rW   intermediate_sizer   r^   
hidden_actrN   r   intermediate_act_fnr   s     r*   rB   zBeitIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r)   rE   c                 J    | j                  |      }| j                  |      }|S r@   )r   r   rG   s     r*   rH   zBeitIntermediate.forward  s&    

=100?r)   	r$   r%   r&   r    rB   r5   r   rH   rO   rP   s   @r*   r   r     s1    9z 9d 9U\\ ell r)   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )
BeitOutputrS   r.   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r@   )
rA   rB   r	   r   r   rW   r   rf   rg   rh   r   s     r*   rB   zBeitOutput.__init__  sB    YYv779K9KL
zz&"<"<=r)   rE   c                 J    | j                  |      }| j                  |      }|S r@   r   rG   s     r*   rH   zBeitOutput.forward  r   r)   r   rP   s   @r*   r   r     s1    >z >d >
U\\ ell r)   r   c                       e Zd ZdZddedee   deddf fdZ	 	 	 	 	 dde	j                  d	ee	j                     d
eded   dedeee      deee	j                     ee	j                  e	j                  f   f   fdZ xZS )	BeitLayerz?This corresponds to the Block class in the timm implementation.NrS   r   drop_path_rater.   c                    t         |           |j                  | _        d| _        t	        ||      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        |dkD  rt        |      nt        j                          | _        t        j                  |j                  |j                        | _        |j&                  }|dkD  ryt        j(                  |t+        j,                  |j                        z  d      | _        t        j(                  |t+        j,                  |j                        z  d      | _        y d\  | _        | _        y )	Nr   r   epsr0   r   T)requires_grad)NN)rA   rB   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r;   r	   	LayerNormrW   layer_norm_epslayernorm_beforer>   Identityr<   layernorm_afterlayer_scale_init_valuerU   r5   oneslambda_1lambda_2)rC   rS   r   r   init_valuesrD   s        r*   rB   zBeitLayer.__init__  s   '-'E'E$&v;G,V4 ( "V-?-?VEZEZ [9G#9Mn5SUS^S^S`!||F,>,>FDYDYZ33?LLuzz6CUCU7W)WgklDMLLuzz6CUCU7W)WgklDM+5(DM4=r)   rE   r   r   r   r   r   r   c                    | j                  | j                  |      |||||      }|d   }|dd  }	| j                  | j                  |z  }| j                  |      |z   }| j	                  |      }
| j                  |
      }
| j                  |
      }
| j                  | j                  |
z  }
| j                  |
      |z   }
|
f|	z   }	|	S )N)r   r   r   r   r   r   )r   r  r	  r<   r  r  r;   r
  )rC   rE   r   r   r   r   r   self_attention_outputsr   r   layer_outputs              r*   rH   zBeitLayer.forward  s     "&!!-0/#9%=! "0 "
 2!4(, ==$#}}/?? '78=H ++M:((6{{<0==$==<7L ~~l3mC/G+r)   )Nr0   r   )r$   r%   r&   r'   r    r   r   rM   rB   r5   r   r   r   r   r   rH   rO   rP   s   @r*   r   r     s    I6z 6 6`e 6pt 6* -1"'GK).+/)||) ELL))  	)
 !))C D) #') U3Z() 
uU\\"E%,,*D$EE	F)r)   r   c                        e Zd Zdededdf fdZdeeef   dej                  fdZ
d	dedej                  fdZ xZS )
r   rS   r   r.   Nc                     t         |           || _        d|d   z  dz
  d|d   z  dz
  z  dz   | _        t	        j
                  t        j                  | j                  |j                              | _	        i | _
        y )Nrn   r   r   r   )rA   rB   r   num_relative_distancer	   rU   r5   rV   r   relative_position_bias_tablerelative_position_indicesr   s      r*   rB   z!BeitRelativePositionBias.__init__	  s|    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LLKK22F4N4NO-
)
 *,&r)   c                    d|d   z  dz
  d|d   z  dz
  z  dz   }|d   |d   z  }t        j                  t        j                  |d         t        j                  |d         d      }t        j                  |      }t        j                  |d      }|dddddf   |dddddf   z
  }|j                  ddd      j                         }|dddddfxx   |d   dz
  z  cc<   |dddddfxx   |d   dz
  z  cc<   |dddddfxx   d|d   z  dz
  z  cc<   t        j                  |dz   fdz  |j                        }|j                  d	      |ddddf<   |dz
  |dddf<   |dz
  |dddf<   |dz
  |d
<   |S )z
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460).
        rn   r   r   r   ij)indexingN)rq   r1   rm   )r   r   )
r5   meshgridarangestackr   ry   r   rV   r1   sum)	rC   r   r  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r*    generate_relative_position_indexz9BeitRelativePositionBias.generate_relative_position_index  s   
 "#[^!3a!7AA<NQR<R SVW W "!n{1~5~~ell;q>:ELLUV<XcghT"vq1(At4~aqj7QQ)11!Q:EEG1a KNQ$66 1a KNQ$66 1a AA$6$:: "'++K!O3E3IQ`QfQf"g*9*=*=b*AAB')>)B12&)>)BA&(=(A%&&r)   r   c                    d| j                   d   z  dz
  }d| j                   d   z  dz
  }d|d   z  dz
  }d|d   z  dz
  }| j                  }| j                  }	||z  dz   }
|d|	dz
   }|j                  d||d      j	                  dddd      }t
        j                  j                  |t        |      t        |      fd      }|j	                  dddd      j                  |
dz
  d      }t        j                  |||	dz
  d g      }|}|| j                  j                         vr| j                  |      | j                  |<   || j                  |   j                  d         }|j                  |d   |d   z  dz   |d   |d   z  dz   d      }|j	                  ddd      j                         }|rCt
        j                  j                  |j!                  d      ||fdd	
      j#                  d      }|j!                  d      S )zu
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        rn   r   r   r   Nrm   bilinearr   Frp   )r   r  r  rx   ry   r	   rz   r{   r   r5   r}   r  keysr!  r|   r   r   squeeze)rC   r   r   r   
old_height	old_widthr   r    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler   r   s                   r*   rH   z BeitRelativePositionBias.forward,  s]    ))!,,q0
((++a/	Q'!+
A&*	+/+L+L($($>$>!$.$:Q$>!89X;TWX;XY%--aJKSSTUWXZ[]^_11:!6	)8L MT^ 2 
 &--aAq9AAB[^_B_acd+099<=VYZ=Z=\]^,
( d4499;;262W2WXc2dD**3/!A$B`B`adBeBjBjkmBn!o!7!<!<N[^+a/Q+a.1PST1TVX"
 "8!?!?1a!H!S!S!U#%']]%>%>&003)#	 &? &
 gaj # &//22r)   )FN)r$   r%   r&   r    r   rB   r   r   r5   r   r!  r   rH   rO   rP   s   @r*   r   r     sY    	,z 	, 	,$ 	,'E#s(O 'PUP\P\ '0/3T /3]b]i]i /3r)   r   c                        e Zd Zddedee   ddf fdZ	 	 	 	 	 	 ddej                  deej                     de	d	e	d
e	dee
e      de	deeef   fdZ xZS )BeitEncoderNrS   r   r.   c                    t         |           || _        |j                  rt	        ||      | _        nd | _        t        j                  d|j                  |j                        D cg c]  }|j                          }}t        j                  t        |j                        D cg c]!  }t        ||j                  r|nd ||         # c}      | _        d| _        y c c}w c c}w )Nr   r   )r   r   F)rA   rB   rS   !use_shared_relative_position_biasr   r   r5   linspacer   num_hidden_layersitemr	   
ModuleListranger   use_relative_position_biaslayergradient_checkpointing)rC   rS   r   r   dprirD   s         r*   rB   zBeitEncoder.__init___  s    33*B6Wb*cD'*.D' "'63H3H&JbJb!cdAqvvxdd]] v778  /5/P/PVZ#&q6	

 ',# es   *C#)&C(rE   r   r   output_hidden_statesr   r   return_dictc           	      F   |rdnd }|rdnd }	t        | j                        D ]  \  }
}|r||fz   }|||
   nd }| j                  r+| j                  r| j	                  |j
                  |||      }ns|\  }}|| j                  j                  z  || j                  j                  z  f}| j                  !| j                  |||j                  d         nd } |||||||      }|d   }|s|	|d   fz   }	 |r||fz   }|st        d |||	fD              S t        |||	      S )Nr(   r   )r   r   r   c              3   &   K   | ]	  }||  y wr@   r(   ).0vs     r*   	<genexpr>z&BeitEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_staterE   
attentions)	enumerater8  r9  r-   _gradient_checkpointing_func__call__rS   r]   r   r3   r   r   )rC   rE   r   r   r<  r   r   r=  all_hidden_statesall_self_attentionsr;  layer_modulelayer_head_masklayer_outputsrj   rk   r   r   s                     r*   rH   zBeitEncoder.forwardu  s    #7BD$5b4(4 #	POA|#$58H$H!.7.CilO**t}} $ A A ))!#%	! !+%)?)??$++J`J`A`a
 22> //#>Vanatatuvaw 0   ' !-!#%*,! *!,M &9]1=M<O&O#G#	PJ   1]4D Dm]4EGZ$[mmm++*
 	
r)   r@   )NFFFNT)r$   r%   r&   r    r   r   rB   r5   r   r   r   r   r   r   rH   rO   rP   s   @r*   r/  r/  ^  s    ,z , ,SW ,2 -1"'%*).+/ ;
||;
 ELL);
  	;

 #;
 #';
 U3Z(;
 ;
 
uo%	&;
r)   r/  c                   2    e Zd ZdZeZdZdZdZdgZ	dgZ
d Zy)	BeitPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    beitr   Tr   z.*relative_position_index.*c                 @   t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j
                  j                  j                  d| j                  j                         |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsr0   )meanstdNg      ?)r^   r	   r   r   ConvTranspose2dweightdatanormal_rS   initializer_ranger   zero_	Embeddingpadding_idxr  fill_)rC   modules     r*   _init_weightsz!BeitPreTrainedModel._init_weights  s   fryy"))R5G5GHI MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r)   N)r$   r%   r&   r'   r    config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpectedr]  r(   r)   r*   rN  rN    s5    
 L$O&*#$*H)I&*r)   rN  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BeitConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare Beit Model transformer outputting raw hidden-states without any specific head on top.c                   
    e Zd Zddededdf fdZd Zd Z ee	       e
eeede	      	 	 	 	 	 	 dd
ej                   deej$                     deej                      dee   dee   dedee   deeef   fd              Z xZS )	BeitModelrS   add_pooling_layerr.   Nc                    t         |   |       || _        t        |      | _        t        || j                  j                  j                        | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        |rt!        |      nd | _        | j%                          y )Nr   r   )rA   rB   rS   rR   ri   r/  r\   r   encoderuse_mean_poolingr	   r  r  rW   r  	layernorm
BeitPoolerpooler	post_init)rC   rS   rf  rD   s      r*   rB   zBeitModel.__init__  s     (0"6t7W7W7c7cd $44BKKM",,vGYGY_e_t_t:u 	 ->j(4 	r)   c                 .    | j                   j                  S r@   ri   r\   rK   s    r*   get_input_embeddingszBeitModel.get_input_embeddings	      ///r)   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrh  r8  r   r   )rC   heads_to_pruner8  r   s       r*   _prune_headszBeitModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr)   vision)
checkpointoutput_typer^  modalityexpected_outputr   r   r   r   r<  r   r=  c           	      <   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  || j                   j
                        }| j                  |||      \  }}	|j                  dd }
| j                  |||||
||      }|d   }| j                  |      }| j                  | j                  |      nd}|s|||fn|f}||dd z   S t        |||j                  |j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        N)r   r   rn   )r   r   r<  r   r=  r   r   r   )rC  pooler_outputrE   rD  )rS   r   r<  use_return_dictget_head_maskr3  ri   r3   rh  rj  rl  r#   rE   rD  )rC   r   r   r   r   r<  r   r=  embedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r*   rH   zBeitModel.forward  sP   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] &&y$++2O2OP	"oo/Tl . 
! "''+
,,/!5!#%= ' 
 *!,..98<8OO4UY?L?XO];_n^pL/!""555)-')77&11	
 	
r)   )T)NNNNFN)r$   r%   r&   r    r   rB   rp  ru  r   BEIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr#   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr5   r   r   r   r   r   rH   rO   rP   s   @r*   re  re    s    
z d d 0C ++@A&.$. 7;,0,0/3).&*6
ll6
 "%"2"236
 ELL)	6

 $D>6
 'tn6
 #'6
 d^6
 
u00	16
 B6
r)   re  c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )rk  rS   r.   Nc                     t         |           |j                  r1t        j                  |j
                  |j                        | _        y d | _        y )Nr   )rA   rB   ri  r	   r  rW   r  rj  r   s     r*   rB   zBeitPooler.__init__V  sA    KQKbKbBLL++1F1FG 	hl 	r)   rE   c                     | j                   0|d d dd d d f   }| j                  |j                  d            }|S |d d df   }|S )Nr   r   )rj  rQ  )rC   rE   patch_tokensr  s       r*   rH   zBeitPooler.forward\  sU    >>%(AB2L NN<+<+<Q+?@M
  *!Q$/Mr)   r   rP   s   @r*   rk  rk  U  s1    
z 
d 
	U\\ 	ell 	r)   rk  a  Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.c                       e Zd Zdeddf fdZ ee       eee	      	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     d	e
ej                     d
e
e   de
e   dede
e   deeef   fd              Z xZS )BeitForMaskedImageModelingrS   r.   Nc                 H   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _        | j                          y )NFrf  r   )rA   rB   
num_labelsre  rO  r	   r  rW   r  rj  r   
vocab_sizelm_headrm  r   s     r*   rB   z#BeitForMaskedImageModeling.__init__p  su      ++f>	 f&8&8f>S>STyy!3!3V5F5FG 	r)   rx  r^  r   r   r   labelsr   r<  r   r=  c	           	      j   ||n| j                   j                  }| j                  |||||||      }	|	d   }
| j                  |
      }
| j	                  |
ddddf         }d}|t               } |||   |      }|s|f|	dd z   }||f|z   S |S t        |||	j                  |	j                        S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```N)r   r   r   r<  r   r=  r   r   losslogitsrE   rD  )	rS   r}  rO  rj  r  r   r   rE   rD  )rC   r   r   r   r  r   r<  r   r=  r   r  prediction_scoresmasked_lm_lossloss_fctr;   s                  r*   rH   z"BeitForMaskedImageModeling.forward}  s    ` &1%<k$++B]B]))+/!5%=#  
 "!*..9 LLAB)?@')H%&7&H&QN')GABK7F3A3M^%.YSYY$!//))	
 	
r)   )NNNNNNFN)r$   r%   r&   r    rB   r   r  r   r   r  r   r5   r   r   r   r   r   rH   rO   rP   s   @r*   r  r  h  s    z d  ++@A>X 046:,0)-,0/3).&*L
u||,L
 "%"2"23L
 ELL)	L

 &L
 $D>L
 'tnL
 #'L
 d^L
 
un$	%L
 Y BL
r)   r  z
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    c                        e Zd Zdeddf fdZ ee       eee	e
e      	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	ee   d
ee   dedee   deee	f   fd              Z xZS )BeitForImageClassificationrS   r.   Nc                 .   t         |   |       |j                  | _        t        |d      | _        |j                  dkD  r*t        j                  |j                  |j                        nt        j                         | _	        | j                          y )NTr  r   )rA   rB   r  re  rO  r	   r   rW   r  
classifierrm  r   s     r*   rB   z#BeitForImageClassification.__init__  ss      ++f=	 OUN_N_bcNc"))F$6$68I8IJikititiv 	r)   )rw  rx  r^  rz  r   r   r  r   r<  r   r=  c                 4   ||n| j                   j                  }| j                  ||||||      }|r|j                  n|d   }	| j	                  |	      }
d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                   j
                  dk(  r=t               } ||
j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } ||
|      }|s|
f|dd z   }||f|z   S |S t!        ||
|j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r<  r   r=  r   
regressionsingle_label_classificationmulti_label_classificationrm   rn   r  )rS   r}  rO  r|  r  problem_typer  r1   r5   longr   r   r%  r   r|   r
   r   rE   rD  )rC   r   r   r  r   r<  r   r=  r   r  r  r  r  r;   s                 r*   rH   z"BeitForImageClassification.forward  s   . &1%<k$++B]B]))/!5%=#  
 2=--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE$!//))	
 	
r)   NNNNNFN)r$   r%   r&   r    rB   r   r  r   _IMAGE_CLASS_CHECKPOINTr   r  _IMAGE_CLASS_EXPECTED_OUTPUTr   r5   r   r   r   r   rH   rO   rP   s   @r*   r  r    s    
z 
d 
 ++@A*)$4	 04,0)-,0/3).&*=
u||,=
 ELL)=
 &	=

 $D>=
 'tn=
 #'=
 d^=
 
u++	,=
 B=
r)   r  c                        e Zd ZdZ	 	 	 ddededeeeeef   f   deeeeef   ef   dedeeeeef   f   dd	f fd
Z	de
j                  de
j                  fdZ xZS )BeitConvModuleaD  
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    in_channelsout_channelsr   paddingr   dilationr.   Nc                     t         |           t        j                  ||||||      | _        t        j
                  |      | _        t        j                         | _        y )N)r  r  r   r  r   r  )	rA   rB   r	   r   convBatchNorm2dbnReLU
activation)rC   r  r  r   r  r   r  rD   s          r*   rB   zBeitConvModule.__init__1  sQ     	II#%#
	 ...'')r)   r+   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r@   )r  r  r  )rC   r+   r;   s      r*   rH   zBeitConvModule.forwardF  s0    5!(r)   )r   Fr   )r$   r%   r&   r'   r   r   r   rN   r   rB   r5   r   rH   rO   rP   s   @r*   r  r  )  s     5601$$ $ 3c3h/0	$
 sE#s(OS01$ $ U38_,-$ 
$*U\\ ell r)   r  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZS )	BeitPyramidPoolingBlock
pool_scaler  channelsr.   Nc                     t         |           t        j                  |      t	        ||d      g| _        t        | j
                        D ]   \  }}| j                  t        |      |       " y )Nr   r   )	rA   rB   r	   AdaptiveAvgPool2dr  layersrE  
add_modulerN   )rC   r  r  r  r;  r8  rD   s         r*   rB   z BeitPyramidPoolingBlock.__init__O  sa      ,;a@
 "$++. 	+HAuOOCFE*	+r)   r+   c                 <    |}| j                   D ]
  } ||      } |S r@   )r  )rC   r+   hidden_stater8  s       r*   rH   zBeitPyramidPoolingBlock.forwardX  s*    [[ 	/E .L	/r)   )	r$   r%   r&   r   rB   r5   r   rH   rO   rP   s   @r*   r  r  N  s?    +3 +S +C +D +U\\ ell r)   r  c            
            e Zd ZdZdeedf   dedededdf
 fd	Zd
ej                  de
ej                     fdZ xZS )BeitPyramidPoolingModulea  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    pool_scales.r  r  rs   r.   Nc                    t         |           || _        || _        || _        || _        g | _        t        |      D ]I  \  }}t        |||      }| j                  j                  |       | j                  t        |      |       K y )N)r  r  r  )rA   rB   r  rs   r  r  blocksrE  r  appendr  rN   )	rC   r  r  r  rs   r;  r  blockrD   s	           r*   rB   z!BeitPyramidPoolingModule.__init__m  s    &*& &{3 	+MAz+z{emnEKKu%OOCFE*	+r)   r   c                     g }| j                   D ]Y  } ||      }t        j                  j                  ||j	                         dd  d| j
                        }|j                  |       [ |S )Nrn   r#  rp   )r  r	   rz   r{   rq   rs   r  )rC   r   ppm_outsppmppm_outupsampled_ppm_outs         r*   rH   z BeitPyramidPoolingModule.forwardy  sn    ;; 	/C!fG " 9 9affhqrl4K]K] !: ! OO-.	/ r)   )r$   r%   r&   r'   r   r   r   rB   r5   r   r   rH   rO   rP   s   @r*   r  r  _  s[    
+E#s(O 
+# 
+QT 
+ei 
+nr 
+ $u||*< r)   r  c                   j     e Zd ZdZdeddf fdZd Zdej                  dej                  fdZ	 xZ
S )	BeitUperHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rS   r.   Nc                    t         |           |j                  | _        |j                  gdz  | _        |j                  | _        d| _        t        j                  | j
                  |j                  d      | _
        t        | j                  | j                  d   | j
                  | j                        | _        t        | j                  d   t        | j                        | j
                  z  z   | j
                  dd      | _        t        j                          | _        t        j                          | _        | j                  d d D ]s  }t        || j
                  d      }t        | j
                  | j
                  dd      }| j"                  j'                  |       | j$                  j'                  |       u t        t        | j                        | j
                  z  | j
                  dd      | _        y )	N   Fr   r  rm   )rs   r   r   r  )rA   rB   r  rW   r  r  rs   r	   r   r  r  r  psp_modulesr  r   
bottleneckr5  lateral_convs	fpn_convsr  fpn_bottleneck)rC   rS   r  l_convfpn_convrD   s        r*   rB   zBeitUperHead.__init__  s   !--"../!3**"))DMM63D3DRST 4R MM,,	
 )R 3t'7'7#84==#HHMM	
  ]]_++CR0 	,K#KANF%dmmT]]PQ[\]H%%f-NN!!(+		, -  !DMM1MM	
r)   c                     |d   }|g}|j                  | j                  |             t        j                  |d      }| j	                  |      }|S )Nrm   r   rt   )extendr  r5   r}   r  )rC   inputsr   psp_outsr;   s        r*   psp_forwardzBeitUperHead.psp_forward  sL    2J3((+,99X1-*r)   encoder_hidden_statesc                 P   t        | j                        D cg c]  \  }} |||          }}}|j                  | j                  |             t	        |      }t        |dz
  dd      D ]V  }||dz
     j                  dd  }||dz
     t        j                  j                  ||   |d| j                        z   ||dz
  <   X t        |dz
        D cg c]  } | j                  |   ||          }}|j                  |d          t        |dz
  dd      D ]E  }t        j                  j                  ||   |d   j                  dd  d| j                        ||<   G t        j                  |d      }| j                  |      }| j                  |      }|S c c}}w c c}w )Nr   r   rm   rn   r#  rp   rt   )rE  r  r  r  r   r6  r3   r	   rz   r{   rs   r  r5   r}   r  r  )	rC   r  r;  lateral_convlateralsused_backbone_levels
prev_shapefpn_outsr;   s	            r*   rH   zBeitUperHead.forward  s   R[\`\n\nRopq,L!6q!9:pp(()>?@  #8}+a/B7 	A!!a%..qr2J&q1uo0I0I*:TM_M_ 1J 1 HQUO	 =BBVYZBZ<[\q%DNN1%hqk2\\%+a/B7 	A--33(1+"3"3AB"7jX\XjXj 4 HQK	 99X1-$$X.(3 q ]s   FF#)r$   r%   r&   r'   r    rB   r  r5   r   rH   rO   rP   s   @r*   r  r    s<    $
z $
d $
LU\\ ell r)   r  c                        e Zd ZdZ	 ddedededeeeeef   f   ddf
 fdZd	e	j                  de	j                  fd
Z xZS )BeitFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://arxiv.org/abs/1411.4038>).

    Args:
        config (BeitConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rS   in_indexr   r  r.   Nc           
      <   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        || _
        |dz  |z  }g }|j                  t        | j                  | j
                  |||             t        | j                  dz
        D ]5  }|j                  t        | j
                  | j
                  |||             7 | j                  dk(  rt        j                         | _        nt        j"                  | | _        | j                  r8t        | j                  | j
                  z   | j
                  ||dz        | _        t        j&                  | j
                  |j(                  d      | _        y )Nrn   )r   r  r  r   r   r  r  )rA   rB   rW   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr  r  r  r6  r	   r  convs
Sequentialconv_catr   r  r  )	rC   rS   r  r   r  conv_paddingr  r;  rD   s	           r*   rB   zBeitFCNHead.__init__  sX    	!--1133"99 #q(H4  $--[R^iq	

 t~~)* 	ALLMM4==kS_jr	 >>QDJ.DJ*  4==0$--[bmqrbrDM ))DMM63D3DRSTr)   r  c                     || j                      }| j                  |      }| j                  r(| j                  t	        j
                  ||gd            }| j                  |      }|S )Nr   rt   )r  r  r  r  r5   r}   r  )rC   r  rE   r;   s       r*   rH   zBeitFCNHead.forward
  sX    -dmm<M*]]599mV-D!#LMF(r)   )rn   r   r   )r$   r%   r&   r'   r    r   r   r   rB   r5   r   rH   rO   rP   s   @r*   r  r    sv     tu U  U,/ UBE UUZ[^`efiknfn`o[oUp U	 UDU\\ ell r)   r  zf
    Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    c                        e Zd Zdeddf fdZd Z ee       ee	e
      	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
ee   dee   dedee   deee	f   fd              Z xZS )BeitForSemanticSegmentationrS   r.   Nc                 x   t         |   |       |j                  | _        t        |d      | _        t        | j                  j                        dk7  rt        d      t        j                  t        j                  |j                  |j                  dd      t        j                  |j                        t        j                         t        j                  |j                  |j                  dd            | _        t        j                  t        j                  |j                  |j                  dd            | _        t        j"                         | _        t        j&                  dd      | _        t+        |      | _        |j.                  rt1        |      nd | _        | j5                          y )NFr  r  zBeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.rn   r   )rA   rB   r  re  rO  r   rS   out_indicesr   r	   r  rS  rW   r  GELUfpn1fpn2r  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headrm  r   s     r*   rB   z$BeitForSemanticSegmentation.__init__  sO     ++f>	 t{{&&'1,- 
 MMv1163E3EST]^_NN6--.GGIv1163E3EST]^_	
	 MMv1163E3EST]^_
	 KKM	LLQq9	 (/5;5N5Nk&1TX 	r)   c                 n   t         j                  j                  ||j                  dd  dd      }|0t         j                  j                  ||j                  dd  dd      }t	        | j
                  j                        } |||      }|}|% ||      }	|| j
                  j                  |	z  z  }|S )Nr   r#  Frp   )ignore_index)r	   rz   r{   r3   r   rS   semantic_loss_ignore_indexauxiliary_loss_weight)
rC   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsr  	main_lossr  auxiliary_losss
             r*   compute_lossz(BeitForSemanticSegmentation.compute_loss;  s    ==44bc*5 5 
 ')+)B)B v||BC'8zY^ *C *& $1W1WX-v6	'%&@&INDKK55FFDr)   r  r   r   r  r   r<  r   r=  c           	      T   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |||d||      }|r|j                  n|d   }	t        |	      D 
cg c]#  \  }
}|
dz   | j                   j                  v s"|% }}
}|j                  d   }| j                   j                  | j                   j                  z  }|D cg c]3  }|ddddddf   j                  ddd      j                  |d||      5 }}| j                  | j                  | j                   | j"                  g}t%        t'        |            D ]  } ||   ||         ||<    | j)                  |      }d}| j*                  | j+                  |      }d}|| j-                  |||      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t/        |||r|j                  nd|j0                  	      S c c}}
w c c}w )
aV  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r   rn   rm   r  )rS   r}  r<  r  r   rO  rE   rE  r  r3   r_   r]   ry   rx   r  r  r  r  r6  r   r  r   r
  r   rD  )rC   r   r   r  r   r<  r   r=  r   r  idxfeaturefeaturesr   patch_resolutionr   opsr;  r  r  r  r;   s                         r*   rH   z#BeitForSemanticSegmentation.forwardN  sb   J &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO))/!%%=#  
 :E 5 5'RS* 1::O0PwWTWZ[T[_c_j_j_v_vTvGww!''*
;;11T[[5K5KKnv
ijAaQhK1a(00RAQScd
 

 yy$))TYY		:s8}% 	.A #a&!-HQK	. !!(+*#228<$$V-=vFD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
; x
s   #H7H>8H%r  )r$   r%   r&   r    rB   r
  r   r  r   r   r  r   r5   r   r   r   r   rH   rO   rP   s   @r*   r  r    s    z d @& ++@A+BQ`a 04,0)-,0/3).&*Z
u||,Z
 ELL)Z
 &	Z

 $D>Z
 'tnZ
 #'Z
 d^Z
 
u--	.Z
 b BZ
r)   r  zM
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                        e Zd Z fdZd Z ee       eee	      	 	 	 d
de
dee   dee   dee   def
d	              Z xZS )BeitBackbonec                    t         |   |       t         | 	  |       t        |j                  dz         D cg c]  }|j
                   c}| _        t        |      | _        t        || j                  j                  j                        | _        |j                  rt        | j                  j                         dk7  rt#        d      |j
                  }t%        j&                  t%        j(                  ||dd      t%        j*                  ||j,                        t%        j.                         t%        j(                  ||dd            | _        t%        j&                  t%        j(                  ||dd            | _        t%        j4                         | _        t%        j8                  dd      | _        | j=                          y c c}w )Nr   r   r  zBeitBackbone requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.rn   r   r   )rA   rB   _init_backboner6  r3  rW   num_featuresrR   ri   r/  r\   r   rh  add_fpnr   rS   r  r   r	   r  rS  r  batch_norm_epsr  r  r  r  r  r  r  rm  )rC   rS   r   rW   rD   s       r*   rB   zBeitBackbone.__init__  s[    v&9>v?W?WZ[?[9\]AV//](0"6t7W7W7c7cd>>4;;**+q0 1 
 !,,K"";STU{0E0EF	"";STU	DI b&8&8k_`ij&klDIDI1=DI 	1 ^s   Gc                 .    | j                   j                  S r@   ro  rK   s    r*   rp  z!BeitBackbone.get_input_embeddings  rq  r)   r  r   r<  r   r=  r.   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  d   }| j                  |      \  }\  }}|j                  dd }	| j                  |d||	|      }
|r|
j                  n|
d   }d}t        | j                  |      D ]e  \  }}|| j                  v s| j                   j                  r5|ddddddf   }|j                  ddd      }|j                  |d||      }||fz  }g | j                   j                  rY| j                  |d         | j!                  |d         | j#                  |d         | j%                  |d	         g}t'        |      }|s|r|f|
dd z   }|S |f|
dd z   }|S t)        ||r|
j                  nd|
j*                  
      S )aL  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```Nr   rn   T)r<  r   r   r=  r   r(   rm   r   )feature_mapsrE   rD  )rS   r}  r<  r   r3   ri   rh  rE   zipstage_namesout_featuresreshape_hidden_statesry   rx   r  r  r  r  r  r   r   rD  )rC   r   r<  r   r=  r   r  r   r   r   r   rE   r  stager  r;   s                   r*   rH   zBeitBackbone.forward  s   F &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq!''*
8<8U55<!''+
,,!%/!#  
 2=--'!*#&t'7'7#G 	0E<)));;44#/12q#9L#/#7#71a#@L#/#7#7
BVa#bL/	0 ;;		,q/*		,q/*		,q/*		,q/*	L !.L#&712;6 M '712;6M%3G'//T))
 	
r)   )NNN)r$   r%   r&   rB   rp  r   r  r   r   r  r   r   r   rH   rO   rP   s   @r*   r  r    s    <0 ++@A>X 04,0&*S
S
 'tnS
 $D>	S

 d^S
 
S
 Y BS
r)   r  )r0   F)Rr'   collections.abcr`   r   dataclassesr   typingr   r   r   r   r5   torch.utils.checkpointr   r	   torch.nnr
   r   r   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   utils.backbone_utilsr   configuration_beitr    
get_loggerr$   loggerr  r  r  r  r  r#   rM   r   r<   Moduler>   rR   r[   r   r   r   r   r   r   r   r/  rN  BEIT_START_DOCSTRINGr  re  rk  r  r  r  r  r  r  r  r  r  r(   r)   r*   <module>r0     s      ! / /    A A !  . Q  2 * 
		H	%  > &  < 1  !;  2U\\ e T V[VbVb (-299 - b7RYY b7J27")) 27jQ		 QhRYY $)BII )Xryy  
 
>		 >BS3ryy S3lR
")) R
j*/ *<	  2 dY
# Y
	Y
x & s \
!4 \
\
~  Q
!4 Q
Q
h"RYY "Jbii ""ryy "JR299 Rj8")) 8v  	P
"5 P
P
f  	w
& w
w
r)   