
"""PyTorch Data2VecVision model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from .configuration_data2vec_vision import Data2VecVisionConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "Data2VecVisionConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "facebook/data2vec-vision-base"
_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "facebook/data2vec-vision-base-ft1k"
_IMAGE_CLASS_EXPECTED_OUTPUT = "remote control, remote"


@dataclass
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    """
    Class for outputs of [`Data2VecVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output
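
# Illustrative only: each sample in the batch either keeps its residual branch (scaled by
# 1 / keep_prob) or drops it entirely, so the expected value of the output matches the input.
#
#     x = torch.ones(4, 3)                                  # 4 samples
#     y = drop_path(x, drop_prob=0.5, training=True)
#     # each row of y is either all zeros or all 2.0; E[y] == x
#     y_eval = drop_path(x, drop_prob=0.5, training=False)  # identity at inference time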


class Data2VecVisionDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class Data2VecVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = Data2VecVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(
            pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None
        )
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        if self.position_embeddings is not None:
            if interpolate_pos_encoding:
                # only the [CLS] position embedding is added here; the patch positions were already
                # added (and resized to the actual patch grid) inside the patch embedding module
                cls_tokens = cls_tokens + self.interpolate_pos_encoding(embeddings, height, width)[:, :1, :]
            else:
                cls_tokens = cls_tokens + self.position_embeddings[:, :1, :]

        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)
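
# Rough sketch of running the embeddings on a larger-than-pretraining image (hypothetical sizes;
# `embeddings_module` stands for an instantiated Data2VecVisionEmbeddings with a 16x16 patch size,
# so a 384x384 input yields a 24x24 patch grid):
#
#     pixel_values = torch.randn(1, 3, 384, 384)
#     embeddings, (patch_height, patch_width) = embeddings_module(
#         pixel_values, interpolate_pos_encoding=True
#     )
#     # patch_height == patch_width == 24, embeddings shape (1, 1 + 24 * 24, hidden_size)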


class Data2VecVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(
        self, pixel_values: torch.Tensor, position_embedding: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(
                0, 3, 1, 2
            )
            position_embedding = nn.functional.interpolate(
                position_embedding, size=(patch_height, patch_width), mode="bicubic"
            )
            embeddings = embeddings + position_embedding

        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class Data2VecVisionSelfAttention(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        if window_size:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[Tuple[int]] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add relative position bias if present.
        if self.relative_position_bias is not None:
            height, width = resolution
            window_size = (height // self.config.patch_size, width // self.config.patch_size)
            attention_scores = attention_scores + self.relative_position_bias(
                window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
            )

        # Add shared relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class Data2VecVisionSelfOutput(nn.Module):
    """
    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class Data2VecVisionAttention(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = Data2VecVisionSelfAttention(config, window_size=window_size)
        self.output = Data2VecVisionSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[Tuple[int]] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        self_outputs = self.attention(
            hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class Data2VecVisionIntermediate(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class Data2VecVisionOutput(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class Data2VecVisionLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(
        self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0
    ) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = Data2VecVisionAttention(config, window_size=window_size)
        self.intermediate = Data2VecVisionIntermediate(config)
        self.output = Data2VecVisionOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = Data2VecVisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[Tuple[int]] = None,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in Data2VecVision, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            interpolate_pos_encoding=interpolate_pos_encoding,
            resolution=resolution,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in Data2VecVision, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class Data2VecVisionRelativePositionBias(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        # +3 for cls_token_pos_len: cls to token, token to cls, cls to cls
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        self.relative_position_indices = {}

    def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor:
        """
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460).
        """
        num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        # cls to token & token to cls & cls to cls
        # get pair-wise relative position index for each token inside the window
        window_area = window_size[0] * window_size[1]
        grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij")
        coords = torch.stack(grid)  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = num_relative_distance - 3
        relative_position_index[0:, 0] = num_relative_distance - 2
        relative_position_index[0, 0] = num_relative_distance - 1
        return relative_position_index

    def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor:
        """
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        """
        old_height = 2 * self.window_size[0] - 1
        old_width = 2 * self.window_size[1] - 1

        new_height = 2 * window_size[0] - 1
        new_width = 2 * window_size[1] - 1

        old_relative_position_bias_table = self.relative_position_bias_table

        old_num_relative_distance = self.num_relative_distance
        new_num_relative_distance = new_height * new_width + 3

        old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3]

        old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
        new_sub_table = nn.functional.interpolate(
            old_sub_table, size=(torch_int(new_height), torch_int(new_width)), mode="bilinear"
        )
        new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)

        new_relative_position_bias_table = torch.cat(
            [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]]
        )

        key = window_size
        if key not in self.relative_position_indices.keys():
            self.relative_position_indices[key] = self.generate_relative_position_index(window_size)

        relative_position_bias = new_relative_position_bias_table[self.relative_position_indices[key].view(-1)]
        # patch_height*num_patches_height, patch_width*num_patches_width, num_attention_heads
        relative_position_bias = relative_position_bias.view(
            window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1
        )
        # num_attention_heads, patch_height*num_patches_height, patch_width*num_patches_width
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()

        if interpolate_pos_encoding:
            relative_position_bias = nn.functional.interpolate(
                relative_position_bias.unsqueeze(1),
                size=(dim_size, dim_size),
                mode="bilinear",
                align_corners=False,
            ).squeeze(1)

        return relative_position_bias.unsqueeze(0)


class Data2VecVisionEncoder(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        if config.use_shared_relative_position_bias:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
        else:
            self.relative_position_bias = None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
        self.layer = nn.ModuleList(
            [
                Data2VecVisionLayer(
                    config,
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[Tuple[int]] = None,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                height, width = resolution
                window_size = (height // self.config.patch_size, width // self.config.patch_size)
                relative_position_bias = (
                    self.relative_position_bias(
                        window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1]
                    )
                    if self.relative_position_bias is not None
                    else None
                )
                layer_outputs = layer_module(
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                    relative_position_bias,
                    interpolate_pos_encoding,
                    resolution,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class Data2VecVisionPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Data2VecVisionConfig
    base_model_prefix = "data2vec_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Data2VecVisionLayer"]
    _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


DATA2VEC_VISION_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Data2VecVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

DATA2VEC_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BeitImageProcessor.__call__`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Data2VecVision Model transformer outputting raw hidden-states without any specific head on top.",
    DATA2VEC_VISION_START_DOCSTRING,
)
class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = Data2VecVisionEmbeddings(config)
        self.encoder = Data2VecVisionEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )
        self.pooler = Data2VecVisionPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Data2VecVisionModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Data2VecVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, _ = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )
        resolution = pixel_values.shape[2:]

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            resolution=resolution,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return Data2VecVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Data2VecVisionPooler(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output


@add_start_docstrings(
    """
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    """,
    DATA2VEC_VISION_START_DOCSTRING,
)
class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DATA2VEC_VISION_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Data2VecVisionConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class Data2VecVisionPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            Data2VecVisionConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class Data2VecVisionPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = Data2VecVisionPyramidPoolingBlock(
                pool_scale=pool_scale, in_channels=in_channels, channels=channels
            )
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class Data2VecVisionUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = Data2VecVisionPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = Data2VecVisionConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = Data2VecVisionConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = Data2VecVisionConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = Data2VecVisionConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class Data2VecVisionFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://arxiv.org/abs/1411.4038>).

    Args:
        config (Data2VecVisionConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        config: Data2VecVisionConfig,
        in_index: int = 2,
        kernel_size: int = 3,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            Data2VecVisionConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                Data2VecVisionConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = Data2VecVisionConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output
    c                        e Zd Zdeddf fdZd Z ee       ee	e
      	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
ee   dee   dedee   deee	f   fd              Z xZS )%Data2VecVisionForSemanticSegmentationrP   r*   Nc                 x   t         |   |       |j                  | _        t        |d      | _        t        | j                  j                        dk7  rt        d      t        j                  t        j                  |j                  |j                  dd      t        j                  |j                        t        j                         t        j                  |j                  |j                  dd            | _        t        j                  t        j                  |j                  |j                  dd            | _        t        j"                         | _        t        j&                  dd      | _        t+        |      | _        |j.                  rt1        |      nd | _        | j5                          y )NFr  r  zData2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.rk   r   )r=   r>   r  rb  rL  r   rP   out_indicesr   r   r  rP  rT   r  GELUfpn1fpn2r  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headrj  r   s     r&   r>   z.Data2VecVisionForSemanticSegmentation.__init__  sQ     ++26US t{{&&'1,- 
 MMv1163E3EST]^_NN6--.GGIv1163E3EST]^_	
	 MMv1163E3EST]^_
	 KKM	LLQq9	 2&9?E?X?X3F;^b 	r%   c                 n   t         j                  j                  ||j                  dd  dd      }|0t         j                  j                  ||j                  dd  dd      }t	        | j
                  j                        } |||      }|}|% ||      }	|| j
                  j                  |	z  z  }|S )Nr   r   Frm   )ignore_index)r   rw   rx   r/   r
   rP   semantic_loss_ignore_indexauxiliary_loss_weight)
r?   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsr  	main_lossr  auxiliary_losss
             r&   compute_lossz2Data2VecVisionForSemanticSegmentation.compute_loss  s    ==44bc*5 5 
 ')+)B)B v||BC'8zY^ *C *& $1W1WX-v6	'%&@&INDKK55FFDr%   )rs  r[  r   r   r  r   r9  r   r:  c           	      T   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |||d||      }|r|j                  n|d   }	t        |	      D 
cg c]#  \  }
}|
dz   | j                   j                  v s"|% }}
}|j                  d   }| j                   j                  | j                   j                  z  }|D cg c]3  }|ddddddf   j                  ddd      j                  |d||      5 }}| j                  | j                  | j                   | j"                  g}t%        t'        |            D ]  } ||   ||         ||<    | j)                  |      }d}| j*                  | j+                  |      }d}|| j-                  |||      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t/        |||r|j                  nd|j0                  	      S c c}}
w c c}w )
aR  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
        >>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r   rk   rj   r  )rP   rx  r9  r  r   rL  rA   rB  r  r/   r\   rZ   rv   ru   r  r  r  r  r3  r   r  r  r  r   rA  )r?   r   r   r  r   r9  r   r:  r   r  idxfeaturefeaturesr   patch_resolutionr   opsr8  r  r  r  r7   s                         r&   rD   z-Data2VecVisionForSemanticSegmentation.forward  sd   J &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO&&/!%%=# ' 
 :E 5 5'RS* 1::O0PwWTWZ[T[_c_j_j_v_vTvGww!''*
;;11T[[5K5KKnv
ijAaQhK1a(00RAQScd
 

 yy$))TYY		:s8}% 	.A #a&!-HQK	. !!(+*#228<$$V-=vFD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
; x
s   #H7H>8H%r  )r    r!   r"   r   r>   r  r   r  r   r   r  r   r1   rJ   r   r   r   rD   rL   rM   s   @r&   r  r    s    3  @& ++KL+BQ`a 04,0)-,0/3).&*Z
u||,Z
 ELL)Z
 &	Z

 $D>Z
 'tnZ
 #'Z
 d^Z
 
u--	.Z
 b MZ
r%   r  )r,   F)Lr#   collections.abcr]   r   dataclassesr   typingr   r   r   r   r1   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   r   configuration_data2vec_visionr   
get_loggerr    loggerr  r  r  r  r  r   rJ   rI   r   r8   Moduler:   rO   rX   r   r   r   r   r   r   r   r,  rK  DATA2VEC_VISION_START_DOCSTRINGr  rb  rh  r  r  r  r  r  r  r  r$   r%   r&   <module>r     s   $   ! / /    A A !  . Q  @ 
		H	% ) 6 &  ? 7  +E  4U\\ e T V[VbVb *-RYY -b7ryy b7L27BII 27lQ")) Qjryy &)bii )Z "
299 
@")) @HS3 S3nR
BII R
l*O *<	# $  2 n#
Y
7 Y

Y
z299 &  $Q
+H Q
Q
j"ryy "L		 $$ $PRRYY Rl<BII <~  $	P
,I P
P
r%   