
"""PyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

"""
import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, logging, torch_int
from ...utils.backbone_utils import load_backbone
from .configuration_dpt import DPTConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "DPTConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "Intel/dpt-large"
_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024]


@dataclass
class BaseModelOutputWithIntermediateActivations(ModelOutput):
    """
    Base class for model's outputs that also contains intermediate activations that can be used at later stages.
    Useful in the context of Vision models.

    Args:
        last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    r   r        W/var/www/html/venv/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   9   s:    	 -1))0HLhuU->->-C'DELr)   r   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	e
eej                  df      ed<   dZe
eej                  df      ed<   dZe
eej                  df      ed<   y)	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
            the classification token after processing through a linear layer and a tanh activation function. The linear
            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
            Intermediate activations that can be used to compute hidden states of the model at various layers.
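
    Example (an illustrative sketch added for exposition, not part of the original docstring; the shapes below
    follow the documented `(batch_size, sequence_length, hidden_size)` convention):

    ```python
    >>> import torch
    >>> outputs = BaseModelOutputWithPoolingAndIntermediateActivations(
    ...     last_hidden_state=torch.zeros(1, 577, 1024), pooler_output=torch.zeros(1, 1024)
    ... )
    >>> list(outputs.last_hidden_state.shape)
    [1, 577, 1024]
    ```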
    Nlast_hidden_statepooler_output.hidden_states
attentionsr    )r!   r"   r#   r$   r-   r%   r&   r'   r.   r/   r   r   r0   r    r(   r)   r*   r,   r,   J   s    6 ,0u((/'+M5$$+=AM8E%"3"3S"89:A:>Ju00#567>HLhuU->->-C'DELr)   r,   c            	       p     e Zd ZdZd	 fd	Zd
dZ	 ddej                  dededej                  fdZ	 xZ
S )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 b   t         
|           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }t        |      | _        | j                  j                  d   }t        | j                  j                        dk7  r+t        dt        | j                  j                               ddg| _        ||j                   }	|	dd  }|	d   }nCt        |t        j                  j                        r|n||f}| j                  j                  d   }|| _        |d   | _        || _        t#        j$                  ||d      | _        t#        j(                  t+        j,                  dd|j
                              | _        t#        j(                  t+        j,                  d|dz   |j
                              | _        y )Nr   r   r   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper	   Conv2d
projection	Parameterr%   zeros	cls_tokenposition_embeddings)selfconfigfeature_sizer:   r;   r<   r=   num_patchesfeature_dimfeat_map_shape	__class__s             r*   r9   zDPTViTHybridEmbeddings.__init__u   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q RYegsXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r)   c                 r   |d d d |f   }|d|d f   }t        t        |      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S 
Nr         ?r   r4   r      bilinear)sizemodedim)	r   rD   reshapepermuter	   
functionalinterpolater%   catrN   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r*   _resize_pos_embedz(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r)   pixel_valuesinterpolate_pos_encodingreturn_dictreturnc                    |j                   \  }}}}|| j                  k7  rt        d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  | j
                  || j                  z  || j                  z        }| j                  |      }	|	j                  d   }
| j                  D cg c]  }|	j                  |    }}| j                  |
      j                  d	      j                  dd	      }| j                  j                  |dd      }t        j                   ||fd
      }||z   }|s||fS t#        ||      S c c}w )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r4   rX   r\   )r   r    )shaper<   rE   r:   rk   rM   r;   rB   feature_mapsrF   rI   flatten	transposerL   expandr%   rb   r   )rN   rl   rm   rn   
batch_sizer<   heightwidthrM   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                  r*   forwardzDPTViTHybridEmbeddings.forward   s    3?2D2D/
L&%4,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?$//AY
 --5"//3 RVQpQpq < <U Cqq__X.66q9CCAqI
^^**:r2>
YY
J7Q?
  "55
 455 :)%9
 	
  rs   )E?Nr   )FF)r!   r"   r#   r$   r9   rk   r%   Tensorboolr   __classcell__rT   s   @r*   r2   r2   n   sH     eD gl)
!LL)
DH)
_c)
	)
r)   r2   c                   2     e Zd ZdZ fdZddZddZ xZS )DPTViTEmbeddingszB
class DPTViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings.
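
    Example (an illustrative sketch added for exposition; `_resize_pos_embed` is an internal helper):

    ```python
    >>> import torch
    >>> from transformers.models.dpt.modeling_dpt import DPTViTEmbeddings

    >>> # a [CLS] position embedding plus a 2x2 grid of patch position embeddings, hidden size 8
    >>> posemb = torch.zeros(1, 5, 8)
    >>> # the helper only reads its arguments, so it can be exercised without building a full model
    >>> resized = DPTViTEmbeddings._resize_pos_embed(None, posemb, 3, 3)
    >>> list(resized.shape)  # 1 CLS token + 3 * 3 grid tokens
    [1, 10, 8]
    ```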
    """

    def __init__(self, config):
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = DPTViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1):
        posemb_tok = posemb[:, :start_index]
        posemb_grid = posemb[0, start_index:]

        old_grid_size = torch_int(posemb_grid.size(0) ** 0.5)

        posemb_grid = posemb_grid.reshape(1, old_grid_size, old_grid_size, -1).permute(0, 3, 1, 2)
        posemb_grid = nn.functional.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear")
        posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, grid_size_height * grid_size_width, -1)

        posemb = torch.cat([posemb_tok, posemb_grid], dim=1)

        return posemb

    def forward(self, pixel_values, return_dict=False):
        batch_size, num_channels, height, width = pixel_values.shape

        # possibly interpolate position encodings to handle varying image sizes
        patch_size = self.config.patch_size
        position_embeddings = self._resize_pos_embed(
            self.position_embeddings, height // patch_size, width // patch_size
        )

        embeddings = self.patch_embeddings(pixel_values)

        batch_size, seq_len, _ = embeddings.size()

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        embeddings = embeddings + position_embeddings

        embeddings = self.dropout(embeddings)

        if not return_dict:
            return (embeddings,)

        return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings)
class DPTViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.

    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class DPTViTSelfAttention(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # normalize the attention scores to probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # this is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper
        attention_probs = self.dropout(attention_probs)

        # mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
class DPTViTSelfOutput(nn.Module):
    """
    The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class DPTViTAttention(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.attention = DPTViTSelfAttention(config)
        self.output = DPTViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class DPTViTIntermediate(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class DPTViTOutput(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class DPTViTLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = DPTViTAttention(config)
        self.intermediate = DPTViTIntermediate(config)
        self.output = DPTViTOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class DPTViTEncoder(nn.Module):
    def __init__(self, config: DPTConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
class DPTReassembleStage(nn.Module):
    """
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width); the example below traces the resulting shapes.
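
    Example (an illustrative sketch added for exposition, not part of the original docstring; the tiny config
    values are arbitrary and serve only to keep the tensors small):

    ```python
    >>> import torch
    >>> from transformers import DPTConfig
    >>> from transformers.models.dpt.modeling_dpt import DPTReassembleStage

    >>> config = DPTConfig(image_size=32, patch_size=8, hidden_size=16, neck_hidden_sizes=[8, 16, 32, 64])
    >>> stage = DPTReassembleStage(config)
    >>> # four hidden states of shape (batch_size, 16 patch tokens + 1 CLS token, hidden_size)
    >>> hidden_states = [torch.randn(1, 17, 16) for _ in range(4)]
    >>> [list(x.shape) for x in stage(hidden_states)]  # default reassemble factors are [4, 2, 1, 0.5]
    [[1, 8, 16, 16], [1, 16, 8, 8], [1, 32, 4, 4], [1, 64, 2, 2]]
    ```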

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        self.layers = nn.ModuleList()
        if config.is_hybrid:
            self._init_reassemble_dpt_hybrid(config)
        else:
            self._init_reassemble_dpt(config)

        self.neck_ignore_stages = config.neck_ignore_stages

    def _init_reassemble_dpt_hybrid(self, config):
        r"""
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        """
        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
            if i <= 1:
                self.layers.append(nn.Identity())
            elif i > 1:
                self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))

        if config.readout_type != "project":
            raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.")

        # when using DPT-Hybrid the readout type is set to "project"
        self.readout_projects = nn.ModuleList()
        hidden_size = _get_backbone_hidden_size(config)
        for i in range(len(config.neck_hidden_sizes)):
            if i <= 1:
                self.readout_projects.append(nn.Sequential(nn.Identity()))
            elif i > 1:
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def _init_reassemble_dpt(self, config):
        for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors):
            self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor))

        if config.readout_type == "project":
            self.readout_projects = nn.ModuleList()
            hidden_size = _get_backbone_hidden_size(config)
            for _ in range(len(config.neck_hidden_sizes)):
                self.readout_projects.append(
                    nn.Sequential(nn.Linear(2 * hidden_size, hidden_size), ACT2FN[config.hidden_act])
                )

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        """
        out = []

        for i, hidden_state in enumerate(hidden_states):
            if i not in self.neck_ignore_stages:
                # reshape to (batch_size, num_channels, height, width)
                cls_token, hidden_state = hidden_state[:, 0], hidden_state[:, 1:]
                batch_size, sequence_length, num_channels = hidden_state.shape
                if patch_height is not None and patch_width is not None:
                    hidden_state = hidden_state.reshape(batch_size, patch_height, patch_width, num_channels)
                else:
                    size = torch_int(sequence_length**0.5)
                    hidden_state = hidden_state.reshape(batch_size, size, size, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()

                feature_shape = hidden_state.shape
                if self.config.readout_type == "project":
                    # reshape to (batch_size, height*width, num_channels)
                    hidden_state = hidden_state.flatten(2).permute((0, 2, 1))
                    readout = cls_token.unsqueeze(1).expand_as(hidden_state)
                    # concatenate the readout token to the hidden states and project
                    hidden_state = self.readout_projects[i](torch.cat((hidden_state, readout), -1))
                    # reshape back to (batch_size, num_channels, height, width)
                    hidden_state = hidden_state.permute(0, 2, 1).reshape(feature_shape)
                elif self.config.readout_type == "add":
                    hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1)
                    hidden_state = hidden_state.reshape(feature_shape)
                hidden_state = self.layers[i](hidden_state)
            out.append(hidden_state)

        return out


def _get_backbone_hidden_size(config):
    if config.backbone_config is not None and config.is_hybrid is False:
        return config.backbone_config.hidden_size
    else:
        return config.hidden_size


class DPTReassembleLayer(nn.Module):
    def __init__(self, config, channels, factor):
        super().__init__()
        # projection
        hidden_size = _get_backbone_hidden_size(config)
        self.projection = nn.Conv2d(in_channels=hidden_size, out_channels=channels, kernel_size=1)

        # up/down sampling depending on factor
        if factor > 1:
            self.resize = nn.ConvTranspose2d(channels, channels, kernel_size=factor, stride=factor, padding=0)
        elif factor == 1:
            self.resize = nn.Identity()
        elif factor < 1:
            # so should downsample
            self.resize = nn.Conv2d(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1)

    def forward(self, hidden_state):
        hidden_state = self.projection(hidden_state)
        hidden_state = self.resize(hidden_state)
        return hidden_state


class DPTFeatureFusionStage(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(len(config.neck_hidden_sizes)):
            self.layers.append(DPTFeatureFusionLayer(config))

    def forward(self, hidden_states):
        # reversing the hidden_states, we start from the last
        hidden_states = hidden_states[::-1]

        fused_hidden_states = []
        fused_hidden_state = None
        for hidden_state, layer in zip(hidden_states, self.layers):
            if fused_hidden_state is None:
                # first layer only uses the last hidden_state
                fused_hidden_state = layer(hidden_state)
            else:
                fused_hidden_state = layer(fused_hidden_state, hidden_state)
            fused_hidden_states.append(fused_hidden_state)

        return fused_hidden_states

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                 l   t         |           |j                  | _        |j                  |j                  n| j                   }t        j                         | _        t        j                  |j                  |j                  ddd|      | _
        t        j                         | _        t        j                  |j                  |j                  ddd|      | _        | j                  rIt        j                  |j                        | _        t        j                  |j                        | _        y y )Nr   r   )r7   r   r7  r   )r8   r9   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr	   ReLUactivation1rH   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rN   rO   rH  rT   s      r*   r9   zDPTPreActResidualLayer.__init__  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r)   r+  ro   c                    |}| j                  |      }| j                  |      }| j                  r| j                  |      }| j	                  |      }| j                  |      }| j                  r| j                  |      }||z   S r   )rJ  rL  rG  rP  rM  rN  rQ  rN   r+  residuals      r*   r   zDPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9Lh&&r)   )	r!   r"   r#   r$   r9   r%   r   r   r   r   s   @r*   rD  rD    s*     ID'ELL 'U\\ 'r)   rD  c                   ,     e Zd ZdZd fd	ZddZ xZS )r?  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    c                     t         |           || _        t        j                  |j
                  |j
                  dd      | _        t        |      | _        t        |      | _	        y )Nr   T)r7   r   )
r8   r9   align_cornersr	   rH   rK  rI   rD  residual_layer1residual_layer2)rN   rO   rW  rT   s      r*   r9   zDPTFeatureFusionLayer.__init__  sT    *))F$=$=v?X?Xfgnrs5f=5f=r)   c                    |l|j                   |j                   k7  r?t        j                  j                  ||j                   d   |j                   d   fdd      }|| j	                  |      z   }| j                  |      }t        j                  j                  |dd| j                        }| j                  |      }|S )NrX   r   rY   FrZ   r[   rW  scale_factorr[   rW  )rs   r	   r`   ra   rX  rY  rW  rI   rS  s      r*   r   zDPTFeatureFusionLayer.forward  s    !!X^^3==44L$6$6q$9<;M;Ma;P#QXbrw 5  ($*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r)   Tr   r   r   s   @r*   r?  r?    s    >r)   r?  c                   &    e Zd ZdZeZdZdZdZd Z	y)DPTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    dptrl   Tc                    t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)r>   r	   r   rH   r8  weightdatanormal_rO   initializer_ranger   zero_r   fill_)rN   modules     r*   _init_weightsz DPTPreTrainedModel._init_weights-  s    fryy"))R5G5GHI MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r)   N)
r!   r"   r#   r$   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingrl  r(   r)   r*   r`  r`  "  s$    
 L$O&*#
*r)   r`  aE  
DPT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`DPTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
DPT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`]
            for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare DPT Model transformer outputting raw hidden-states without any specific head on top.",
    DPT_START_DOCSTRING,
)
class DPTModel(DPTPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # vit encoder
        if config.is_hybrid:
            self.embeddings = DPTViTHybridEmbeddings(config)
        else:
            self.embeddings = DPTViTEmbeddings(config)
        self.encoder = DPTViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = DPTViTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        if self.config.is_hybrid:
            return self.embeddings
        else:
            return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndIntermediateActivations,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndIntermediateActivations]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values, return_dict=return_dict)

        embedding_last_hidden_states = embedding_output[0] if not return_dict else embedding_output.last_hidden_states

        encoder_outputs = self.encoder(
            embedding_last_hidden_states,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:] + embedding_output[1:]

        return BaseModelOutputWithPoolingAndIntermediateActivations(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            intermediate_activations=embedding_output.intermediate_activations,
        )


class DPTViTPooler(nn.Module):
    def __init__(self, config: DPTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # we "pool" the model by simply taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class DPTNeck(nn.Module):
    """
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # postprocessing: only required in case of a non-hierarchical backbone (e.g. ViT, BEiT)
        if config.backbone_config is not None and config.backbone_config.model_type in ["swinv2"]:
            self.reassemble_stage = None
        else:
            self.reassemble_stage = DPTReassembleStage(config)

        self.convs = nn.ModuleList()
        for channel in config.neck_hidden_sizes:
            self.convs.append(nn.Conv2d(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias=False))

        # fusion
        self.fusion_stage = DPTFeatureFusionStage(config)

    def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_width=None) -> List[torch.Tensor]:
        """
        Args:
            hidden_states (`List[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        """
        if not isinstance(hidden_states, (tuple, list)):
            raise TypeError("hidden_states should be a tuple or list of tensors")

        if len(hidden_states) != len(self.config.neck_hidden_sizes):
            raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.")

        # postprocess hidden states
        if self.reassemble_stage is not None:
            hidden_states = self.reassemble_stage(hidden_states, patch_height, patch_width)

        features = [self.convs[i](feature) for i, feature in enumerate(hidden_states)]

        # fusion blocks
        output = self.fusion_stage(features)

        return output
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    c                    t         |           || _        d | _        |j                  rt        j                  ddddd      | _        |j                  }t        j                  t        j                  ||dz  ddd      t        j                  ddd	
      t        j                  |dz  dddd      t        j                         t        j                  ddddd      t        j                               | _        y )N   )r   r   )r   r   r6  rX   r   r   rY   Tr\      r   )r8   r9   rO   rI   add_projectionr	   rH   rK  r"  UpsamplerI  headrN   rO   r|   rT   s      r*   r9   zDPTDepthEstimationHead.__init__  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r)   r/   ro   c                     || j                   j                     }| j                  +| j                  |      } t        j                         |      }| j                  |      }|j                  d      }|S )Nr   r\   )rO   head_in_indexrI   r	   rI  r  squeeze)rN   r/   predicted_depths      r*   r   zDPTDepthEstimationHead.forward  sg    %dkk&?&?@??& OOM:M%BGGIm4M))M2)11a18r)   r  r   s   @r*   r  r    s-    
&T%,,%7 ELL r)   r  zu
@add_start_docstrings(
    """
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    """,
    DPT_START_DOCSTRING,
)
class DPTForDepthEstimation(DPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.backbone = None
        if config.backbone_config is not None and config.is_hybrid is False:
            self.backbone = load_backbone(config)
        else:
            self.dpt = DPTModel(config, add_pooling_layer=False)

        # Neck
        self.neck = DPTNeck(config)

        # Depth estimation head
        self.head = DPTDepthEstimationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], DepthEstimatorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```"""
        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not implemented yet")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        if self.backbone is not None:
            outputs = self.backbone.forward_with_filtered_kwargs(
                pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
            )
            hidden_states = outputs.feature_maps
        else:
            outputs = self.dpt(
                pixel_values,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=True,  # we need the intermediate hidden states
                return_dict=return_dict,
            )
            hidden_states = outputs.hidden_states if return_dict else outputs[1]
            # only keep certain features based on config.backbone_out_indices
            # note that the hidden_states also include the initial embeddings
            if not self.config.is_hybrid:
                hidden_states = [
                    feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
                ]
            else:
                backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1])
                backbone_hidden_states.extend(
                    feature
                    for idx, feature in enumerate(hidden_states[2:])
                    if idx in self.config.backbone_out_indices[2:]
                )

                hidden_states = backbone_hidden_states

        patch_height, patch_width = None, None
        if self.config.backbone_config is not None and self.config.is_hybrid is False:
            _, _, height, width = pixel_values.shape
            patch_size = self.config.backbone_config.patch_size
            patch_height = height // patch_size
            patch_width = width // patch_size

        hidden_states = self.neck(hidden_states, patch_height, patch_width)

        predicted_depth = self.head(hidden_states)

        if not return_dict:
            if output_hidden_states:
                output = (predicted_depth,) + outputs[1:]
            else:
                output = (predicted_depth,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return DepthEstimatorOutput(
            loss=loss,
            predicted_depth=predicted_depth,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


class DPTSemanticSegmentationHead(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.config = config

        features = config.fusion_hidden_size
        self.head = nn.Sequential(
            nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(features),
            nn.ReLU(),
            nn.Dropout(config.semantic_classifier_dropout),
            nn.Conv2d(features, config.num_labels, kernel_size=1),
            nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True),
        )

    def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor:
        # use last features
        hidden_states = hidden_states[self.config.head_in_index]

        logits = self.head(hidden_states)

        return logits


class DPTAuxiliaryHead(nn.Module):
    def __init__(self, config):
        super().__init__()

        features = config.fusion_hidden_size
        self.head = nn.Sequential(
            nn.Conv2d(features, features, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(features),
            nn.ReLU(),
            nn.Dropout(0.1, False),
            nn.Conv2d(features, config.num_labels, kernel_size=1),
        )

    def forward(self, hidden_states):
        logits = self.head(hidden_states)

        return logits
@add_start_docstrings(
    """
    DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes.
    """,
    DPT_START_DOCSTRING,
)
class DPTForSemanticSegmentation(DPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.dpt = DPTModel(config, add_pooling_layer=False)

        # Neck
        self.neck = DPTNeck(config)

        # Segmentation head(s)
        self.head = DPTSemanticSegmentationHead(config)
        self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        outputs = self.dpt(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # only keep certain features based on config.backbone_out_indices
        # note that the hidden_states also include the initial embeddings
        if not self.config.is_hybrid:
            hidden_states = [
                feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices
            ]
        else:
            backbone_hidden_states = outputs.intermediate_activations if return_dict else list(outputs[-1])
            backbone_hidden_states.extend(
                feature for idx, feature in enumerate(hidden_states[2:]) if idx in self.config.backbone_out_indices[2:]
            )

            hidden_states = backbone_hidden_states

        hidden_states = self.neck(hidden_states=hidden_states)

        logits = self.head(hidden_states)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(hidden_states[-1])

        loss = None
        if labels is not None:
            # upsample logits to the images' original size
            upsampled_logits = nn.functional.interpolate(
                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
            if auxiliary_logits is not None:
                upsampled_auxiliary_logits = nn.functional.interpolate(
                    auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
                )
            # compute weighted loss
            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
            main_loss = loss_fct(upsampled_logits, labels)
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )