
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation."""

from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...modeling_outputs import SemanticSegmenterOutput
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...utils.backbone_utils import load_backbone
from .configuration_upernet import UperNetConfig


_CONFIG_FOR_DOC = "UperNetConfig"


class UperNetConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int]],
        padding: Union[int, Tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, Tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.batch_norm(output)
        output = self.activation(output)

        return output
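

# A minimal shape sketch of the block above (illustrative only, not part of the
# library; the channel sizes are made up). Conv -> BatchNorm -> ReLU keeps the
# spatial size when the padding matches the kernel:
#
#     block = UperNetConvModule(in_channels=3, out_channels=8, kernel_size=3, padding=1)
#     out = block(torch.randn(2, 3, 32, 32))  # -> torch.Size([2, 8, 32, 32])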


class UperNetPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            UperNetConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state
            e Zd ZdZdeedf   dedededdf
 fd	Zd
ej                  de
ej                     fdZ xZS )UperNetPyramidPoolingModulea}  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (`Tuple[int]`):
            Pooling scales used in Pooling Pyramid Module.
        in_channels (`int`):
            Input channels.
        channels (`int`):
            Channels after modules, before conv_seg.
        align_corners (`bool`):
            align_corners argument of F.interpolate.
    pool_scales.r   r;   align_cornersr   Nc                    t         |           || _        || _        || _        || _        g | _        t        |      D ]I  \  }}t        |||      }| j                  j                  |       | j                  t        |      |       K y )N)r:   r   r;   )r   r   rH   rI   r   r;   blocksr@   r9   appendrA   r2   )	r$   rH   r   r;   rI   rB   r:   blockr%   s	           r&   r   z$UperNetPyramidPoolingModule.__init__e   s    &*& &{3 	+MAz.*R]hpqEKKu%OOCFE*	+r'   xc                     g }| j                   D ]Y  } ||      }t        j                  j                  ||j	                         dd  d| j
                        }|j                  |       [ |S )N   bilinearsizemoderI   )rK   r   
functionalinterpolaterS   rI   rL   )r$   rN   ppm_outsppmppm_outupsampled_ppm_outs         r&   r,   z#UperNetPyramidPoolingModule.forwardq   sn    ;; 	/C!fG " 9 9affhqrl4K]K] !: ! OO-.	/ r'   )r-   r.   r/   r0   r   r1   r3   r   r4   r5   r   r,   r6   r7   s   @r&   rG   rG   V   s[    
+E#s(O 
+# 
+QT 
+ei 
+nr 
+ $u||*< r'   rG   c                   l     e Zd ZdZ fdZd Zd Zd Zdej                  dej                  fdZ
 xZS )	UperNetHeadz


class UperNetHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://arxiv.org/abs/1807.10221).
    """

    def __init__(self, config, in_channels):
        super().__init__()

        self.config = config
        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = in_channels
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = UperNetPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = UperNetConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = UperNetConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = UperNetConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = UperNetConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def init_weights(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class UperNetFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://arxiv.org/abs/1411.4038).

    Args:
        config:
            Configuration.
        in_index (int):
            Index of the backbone feature map to use as input. Default: 2.
        kernel_size (int):
            The kernel size for convs in the head. Default: 3.
        dilation (int):
            The dilation rate for convs in the head. Default: 1.
    """

    def __init__(
        self, config, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1
    ) -> None:
        super().__init__()

        self.config = config
        self.in_channels = config.auxiliary_in_channels
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            UperNetConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                UperNetConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = UperNetConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def init_weights(self):
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


class UperNetPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = UperNetConfig
    main_input_name = "pixel_values"
    _no_split_modules = []

    def _init_weights(self, module):
        if isinstance(module, UperNetPreTrainedModel):
            module.backbone.init_weights()
            module.decode_head.init_weights()
            if module.auxiliary_head is not None:
                module.auxiliary_head.init_weights()

    def init_weights(self):
        """Initialize the weights"""
        self.backbone.init_weights()
        self.decode_head.init_weights()
        if self.auxiliary_head is not None:
            self.auxiliary_head.init_weights()


UPERNET_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UPERNET_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`SegformerImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
            `attentions` under returned tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
            returned tensors for more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
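

# Hedged usage sketch (not part of the library): `load_backbone(config)` below
# resolves the vision backbone from the config, so a model can also be built
# from scratch with any supported backbone, e.g. (assuming a ConvNeXt config):
#
#     from transformers import ConvNextConfig, UperNetConfig
#
#     backbone_config = ConvNextConfig(out_features=["stage1", "stage2", "stage3", "stage4"])
#     config = UperNetConfig(backbone_config=backbone_config, num_labels=150)
#     model = UperNetForSemanticSegmentation(config)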


@add_start_docstrings(
    """UperNet framework leveraging any vision backbone e.g. for ADE20k, CityScapes.""",
    UPERNET_START_DOCSTRING,
)
class UperNetForSemanticSegmentation(UperNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.backbone = load_backbone(config)

        # Semantic segmentation head(s)
        self.decode_head = UperNetHead(config, in_channels=self.backbone.channels)
        self.auxiliary_head = UperNetFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(UPERNET_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")
        >>> model = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-tiny")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/fixtures_ade20k", filename="ADE_val_00000001.jpg", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits  # shape (batch_size, num_labels, height, width)
        >>> list(logits.shape)
        [1, 150, 512, 512]
        ```"""
        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )
        features = outputs.feature_maps

        logits = self.decode_head(features)
        logits = nn.functional.interpolate(logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)
            auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=pixel_values.shape[2:], mode="bilinear", align_corners=False
            )

        loss = None
        if labels is not None:
            # compute weighted loss
            loss_fct = CrossEntropyLoss(ignore_index=self.config.loss_ignore_index)
            loss = loss_fct(logits, labels)
            if auxiliary_logits is not None:
                auxiliary_loss = loss_fct(auxiliary_logits, labels)
                loss += self.config.auxiliary_loss_weight * auxiliary_loss

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
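

# Post-processing sketch (illustrative, continuing the docstring example in
# `forward` above): the logits are per-pixel class scores, so a segmentation
# map is one argmax away, or the image processor's helper can also resize it
# back to the original image size:
#
#     seg_map = logits.argmax(dim=1)[0]  # (height, width), one class id per pixel
#     # or: image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])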