"""PyTorch SegGpt model."""

import collections.abc
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F

from ...activations import ACT2FN
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from .configuration_seggpt import SegGptConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SegGptConfig"

# Base docstring
_CHECKPOINT_FOR_DOC = "BAAI/seggpt-vit-large"
_EXPECTED_OUTPUT_SHAPE = [3, 896, 448]


@dataclass
class SegGptEncoderOutput(ModelOutput):
    """
    Output type of [`SegGptEncoderOutput`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape `(batch_size, patch_height, patch_width, hidden_size)`.
        attentions (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape
            `(batch_size, num_heads, seq_len, seq_len)`.
        intermediate_hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
            Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
            Each element in the tuple corresponds to the output of the layer specified in
            `config.intermediate_hidden_state_indices`. Additionally, each feature passes through a LayerNorm.
    """

    last_hidden_state: torch.FloatTensor
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    intermediate_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
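

# --- Editor's illustrative sketch (not part of the original modeling code) ---
# How downstream code consumes `intermediate_hidden_states`: the segmentation head
# concatenates the collected features along the channel axis before decoding them
# into a mask (see `SegGptForImageSegmentation.forward` below). A minimal
# stand-alone version of that merge step:
def _sketch_merge_intermediates(intermediate_hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor:
    # each element has shape (batch_size, patch_height, patch_width, hidden_size);
    # the result has hidden_size * len(intermediate_hidden_states) channels
    return torch.cat(intermediate_hidden_states, dim=-1)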


@dataclass
class SegGptImageSegmentationOutput(ModelOutput):
    """
    Output type of [`SegGptImageSegmentationOutput`].

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
            The loss value.
        pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            The predicted masks.
        hidden_states (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape `(batch_size, patch_height, patch_width, hidden_size)`.
        attentions (`Tuple[torch.FloatTensor]`, *optional*, returned when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape
            `(batch_size, num_heads, seq_len, seq_len)`.
    """

    loss: Optional[torch.FloatTensor] = None
    pred_masks: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class SegGptPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
        return embeddings
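

# --- Editor's illustrative sketch (not part of the original modeling code) ---
# The conv projection above turns every non-overlapping patch into one hidden vector,
# so the output grid is (height // patch_size, width // patch_size). Assuming the
# released "BAAI/seggpt-vit-large" defaults (image_size=(896, 448), patch_size=16,
# hidden_size=1024 -- an assumption, check SegGptConfig), one image maps to a
# 56 x 28 grid of 1024-dim vectors, which matches the `[1, 56, 28, 1024]` shape
# quoted in the usage example further below.
def _sketch_patch_grid(image_size=(896, 448), patch_size=16):
    return image_size[0] // patch_size, image_size[1] // patch_size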


class SegGptEmbeddings(nn.Module):
    """
    Construct the embeddings from patch, position embeddings for input and prompt.
    """

    def __init__(self, config: SegGptConfig) -> None:
        super().__init__()

        self.mask_token = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
        self.segment_token_input = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
        self.segment_token_prompt = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
        # tokens for the segmentation types
        self.type_token_semantic = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))
        self.type_token_instance = nn.Parameter(torch.zeros(1, 1, 1, config.hidden_size))

        self.patch_embeddings = SegGptPatchEmbeddings(config)

        num_positions = (config.pretrain_image_size // config.patch_size) ** 2 + 1
        self.position_embeddings = nn.Parameter(torch.randn(1, num_positions, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, height: int, width: int) -> torch.Tensor:
        patch_pos_embed = self.position_embeddings[:, 1:]
        num_patches = patch_pos_embed.shape[1]
        pretrain_patch_size = torch_int(num_patches**0.5)

        if torch.jit.is_tracing() or pretrain_patch_size != height or pretrain_patch_size != width:
            patch_pos_embed = F.interpolate(
                patch_pos_embed.reshape(1, pretrain_patch_size, pretrain_patch_size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )

            return patch_pos_embed.permute(0, 2, 3, 1)
        else:
            return patch_pos_embed.reshape(1, height, width, -1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        prompt_pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        embedding_type: Optional[str] = None,
    ) -> torch.Tensor:
        input_embeddings = self.patch_embeddings(pixel_values)
        prompt_embeddings = self.patch_embeddings(prompt_pixel_values)

        batch_size, patch_height, patch_width, _ = input_embeddings.shape

        mask_token = self.mask_token.expand(batch_size, patch_height, patch_width, -1)
        # replace the masked visual tokens by mask_token
        w = bool_masked_pos.unsqueeze(-1).type_as(mask_token).reshape(-1, patch_height, patch_width, 1)
        prompt_embeddings = prompt_embeddings * (1 - w) + mask_token * w

        embedding_type = embedding_type if embedding_type is not None else "instance"

        # add positional encoding to each token
        pos_embed = self.interpolate_pos_encoding(patch_height, patch_width)

        # add segment token
        input_embeddings = input_embeddings + self.segment_token_input
        prompt_embeddings = prompt_embeddings + self.segment_token_prompt

        # add position embedding (skipping the CLS position)
        input_embeddings = input_embeddings + pos_embed
        prompt_embeddings = prompt_embeddings + pos_embed

        # add type token for prompt
        if embedding_type == "semantic":
            type_embedding = self.type_token_semantic
        elif embedding_type == "instance":
            type_embedding = self.type_token_instance
        else:
            raise ValueError(f"Embedding type should be either 'semantic' or 'instance', but got {embedding_type}")

        prompt_embeddings = prompt_embeddings + type_embedding

        embeddings = torch.cat([input_embeddings, prompt_embeddings], dim=0)

        return embeddings


class SegGptAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)

        input_size = (image_size[0] // config.patch_size, image_size[1] // config.patch_size)
        head_dim = config.hidden_size // config.num_attention_heads

        self.num_attention_heads = config.num_attention_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            if input_size is None:
                raise ValueError("Input size must be provided if using relative positional encoding.")

            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
        """
        Get relative positional embeddings according to the relative positions of query and key sizes.

        Args:
            q_size (int):
                size of the query.
            k_size (int):
                size of key k.
            rel_pos (`torch.Tensor`):
                relative position embeddings (L, channel).

        Returns:
            Extracted positional embeddings according to relative positions.
        """
        max_rel_dist = int(2 * max(q_size, k_size) - 1)
        # Interpolate rel pos.
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)

        # Scale the coords with short length if shapes for q and k are different.
        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

        return rel_pos_resized[relative_coords.long()]

    def add_decomposed_rel_pos(
        self,
        attn: torch.Tensor,
        query: torch.Tensor,
        rel_pos_h: torch.Tensor,
        rel_pos_w: torch.Tensor,
        q_size: Tuple[int, int],
        k_size: Tuple[int, int],
    ) -> torch.Tensor:
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

        Args:
            attn (`torch.Tensor`):
                attention map.
            query (`torch.Tensor`):
                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
            rel_pos_h (`torch.Tensor`):
                relative position embeddings (Lh, channel) for height axis.
            rel_pos_w (`torch.Tensor`):
                relative position embeddings (Lw, channel) for width axis.
            q_size (tuple):
                spatial sequence size of query q with (query_height, query_width).
            k_size (tuple):
                spatial sequence size of key k with (key_height, key_width).

        Returns:
            attn (`torch.Tensor`):
                attention map with added relative positional embeddings.
        """
        query_height, query_width = q_size
        key_height, key_width = k_size
        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)

        batch_size, _, dim = query.shape
        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
        attn = attn.reshape(batch_size, query_height, query_width, key_height, key_width)
        attn = attn + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
        attn = attn.reshape(batch_size, query_height * query_width, key_height * key_width)
        return attn

    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> torch.Tensor:
        batch_size, height, width, _ = hidden_states.shape
        # qkv with shape (3, batch_size, num_heads, height * width, channel)
        qkv = (
            self.qkv(hidden_states)
            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
            .permute(2, 0, 3, 1, 4)
        )
        # query, key, value with shape (batch_size * num_heads, height * width, channel)
        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)

        attn_weights = (query * self.scale) @ key.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attn_weights = self.add_decomposed_rel_pos(
                attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)

        if output_attentions:
            # reshape twice so that attn_weights keeps its gradient while a
            # (batch_size, num_heads, seq_len, seq_len) view can be returned
            attn_weights_reshaped = attn_weights.reshape(batch_size, self.num_attention_heads, height * width, -1)
            attn_weights = attn_weights_reshaped.reshape(batch_size * self.num_attention_heads, height * width, -1)
        else:
            attn_weights_reshaped = None

        attn_output = (attn_weights @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)

        attn_output = self.proj(attn_output)

        return (attn_output, attn_weights_reshaped)


class SegGptMlp(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
        self.act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.lin1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.lin2(hidden_states)
        return hidden_states


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class SegGptDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class SegGptLayer(nn.Module):
    def __init__(self, config: SegGptConfig, drop_path_rate: float) -> None:
        super().__init__()
        self.attention = SegGptAttention(config)
        self.mlp = SegGptMlp(config)
        self.drop_path = SegGptDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        ensemble_cond: int,
        feature_ensemble: bool = False,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in SegGpt, layernorm is applied before self-attention
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if feature_ensemble and attention_output.shape[0] // 2 >= ensemble_cond:
            prompt, inputs = attention_output.split(attention_output.shape[0] // 2)
            if ensemble_cond == 2:
                num_prompts = attention_output.shape[0] // 4
                inputs = inputs.reshape(2, num_prompts, -1)
                inputs = inputs.mean(dim=1, keepdim=True).expand_as(inputs)
                inputs = inputs.reshape(*prompt.shape)
            else:
                inputs = inputs.mean(dim=0, keepdim=True).expand_as(inputs)
            attention_output = torch.cat([prompt, inputs], dim=0)

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states
        residual = hidden_states

        hidden_states = self.layernorm_after(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + self.drop_path(hidden_states)

        outputs = (hidden_states,) + outputs

        return outputs


class SegGptEncoder(nn.Module):
    def __init__(self, config: SegGptConfig) -> None:
        super().__init__()
        self.config = config
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
        self.layers = nn.ModuleList([SegGptLayer(config, dpr[i]) for i in range(config.num_hidden_layers)])
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        feature_ensemble: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[Tuple, SegGptEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        intermediate_hidden_states = []

        for i, layer_module in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Condition to check if we have the appropriate number of prompts to ensemble
            ensemble_cond = 2 if self.config.merge_index > i else 1

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    ensemble_cond,
                    feature_ensemble,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, ensemble_cond, feature_ensemble, output_attentions)

            hidden_states = layer_outputs[0]

            if i == self.config.merge_index:
                hidden_states = (
                    hidden_states[: hidden_states.shape[0] // 2] + hidden_states[hidden_states.shape[0] // 2 :]
                ) * 0.5

            if i in self.config.intermediate_hidden_state_indices:
                intermediate_hidden_states.append(self.layernorm(hidden_states))

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions, intermediate_hidden_states]
                if v is not None
            )
        return SegGptEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            intermediate_hidden_states=intermediate_hidden_states,
        )
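

# --- Editor's illustrative sketch (not part of the original modeling code) ---
# SegGptEncoder draws one drop-path rate per layer from a linear ramp, so early
# layers are (almost) never dropped while the last layer is dropped with probability
# `config.drop_path_rate`. Because `drop_path` rescales surviving samples by
# 1 / keep_prob, the expected value of each residual branch is unchanged in training.
def _sketch_drop_path_schedule(drop_path_rate: float = 0.1, num_hidden_layers: int = 4):
    # toy numbers: 0.1 over 4 layers -> [0.0, 0.0333..., 0.0666..., 0.1]
    return [x.item() for x in torch.linspace(0, drop_path_rate, num_hidden_layers)]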


class SegGptLayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            x = torch.nn.functional.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
        elif self.data_format == "channels_first":
            input_dtype = x.dtype
            x = x.float()
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = x.to(dtype=input_dtype)
            x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class SegGptDecoderHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv2d(
            config.decoder_hidden_size,
            config.decoder_hidden_size,
            kernel_size=3,
            padding=1,
        )
        self.layernorm = SegGptLayerNorm(
            normalized_shape=config.decoder_hidden_size, eps=config.layer_norm_eps, data_format="channels_first"
        )
        self.act_fct = ACT2FN[config.hidden_act]
        self.head = nn.Conv2d(config.decoder_hidden_size, 3, kernel_size=1, bias=True)

    def forward(self, hidden_states: torch.FloatTensor):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layernorm(hidden_states)
        hidden_states = self.act_fct(hidden_states)
        hidden_states = self.head(hidden_states)

        return hidden_states


class SegGptDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.decoder_embed = nn.Linear(
            config.hidden_size * len(config.intermediate_hidden_state_indices),
            config.patch_size**2 * config.decoder_hidden_size,
            bias=True,
        )
        self.decoder_pred = SegGptDecoderHead(config)
        self.patch_size = config.patch_size
        self.decoder_hidden_size = config.decoder_hidden_size
        self.config = config

    def _reshape_hidden_states(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        batch_size, patch_height, patch_width, _ = hidden_states.shape
        hidden_states = hidden_states.reshape(
            batch_size, patch_height, patch_width, self.patch_size, self.patch_size, self.decoder_hidden_size
        )
        hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
        hidden_states = hidden_states.reshape(
            shape=(batch_size, -1, patch_height * self.patch_size, patch_width * self.patch_size)
        )

        return hidden_states

    def forward(self, hidden_states: torch.FloatTensor):
        hidden_states = self.decoder_embed(hidden_states)
        hidden_states = self._reshape_hidden_states(hidden_states)
        hidden_states = self.decoder_pred(hidden_states)

        return hidden_states
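

# --- Editor's illustrative sketch (not part of the original modeling code) ---
# The channels_first branch of SegGptLayerNorm above is numerically equivalent to
# permuting to channels_last and applying the standard functional layer_norm over
# the trailing channel dimension. A small self-check (toy shapes assumed):
def _sketch_layernorm_equivalence() -> bool:
    layer = SegGptLayerNorm(normalized_shape=8, data_format="channels_first")
    x = torch.randn(2, 8, 4, 4)  # (batch, channels, height, width)
    reference = F.layer_norm(x.permute(0, 2, 3, 1), (8,), layer.weight, layer.bias, layer.eps)
    return torch.allclose(layer(x), reference.permute(0, 3, 1, 2), atol=1e-5)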


class SegGptPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SegGptConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SegGptEmbeddings", "SegGptLayer"]

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # trunc_normal_ is not implemented for half precision, so cast to float32 first
            module.weight.data = nn.init.trunc_normal_(module.weight.data.to(torch.float32), mean=0.0, std=std).to(
                module.weight.dtype
            )
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, SegGptAttention):
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32),
                mean=0.0,
                std=std,
            ).to(module.rel_pos_h.dtype)
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32),
                mean=0.0,
                std=std,
            ).to(module.rel_pos_w.dtype)
        elif isinstance(module, SegGptEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=std,
            ).to(module.position_embeddings.dtype)
            torch.nn.init.normal_(module.mask_token, std=std)
            torch.nn.init.normal_(module.segment_token_input, std=std)
            torch.nn.init.normal_(module.segment_token_prompt, std=std)
            torch.nn.init.normal_(module.type_token_semantic, std=std)
            torch.nn.init.normal_(module.type_token_instance, std=std)


SEGGPT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SegGptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

SEGGPT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`]
            for details.

        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.

        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.

        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare SegGpt Model transformer outputting raw hidden-states without any specific head on top.",
    SEGGPT_START_DOCSTRING,
)
class SegGptModel(SegGptPreTrainedModel):
    def __init__(self, config: SegGptConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = SegGptEmbeddings(config)
        self.encoder = SegGptEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> SegGptPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
        class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SegGptEncoderOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        prompt_pixel_values: torch.Tensor,
        prompt_masks: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        feature_ensemble: Optional[bool] = None,
        embedding_type: Optional[str] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SegGptEncoderOutput]:
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
            Ground truth mask for input images.

        Returns:

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptModel
        >>> from PIL import Image
        >>> import requests

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptModel.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> list(outputs.last_hidden_state.shape)
        [1, 56, 28, 1024]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        feature_ensemble = feature_ensemble if feature_ensemble is not None else False

        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        pixel_values = pixel_values.to(expected_dtype)
        prompt_pixel_values = prompt_pixel_values.to(expected_dtype)

        # Prepare inputs: prompt and input are stacked along the height axis
        pixel_values = torch.cat((prompt_pixel_values, pixel_values), dim=2)
        prompt_pixel_values = (
            torch.cat((prompt_masks, prompt_masks), dim=2)
            if labels is None
            else torch.cat((prompt_masks, labels), dim=2)
        )

        if bool_masked_pos is None and labels is not None:
            logger.warning_once(
                "Labels were provided, but bool_masked_pos were not. It will be set to default value. If you're"
                " training the model, make sure to provide a bool_masked_pos."
            )

        # We concat on the height axis so SegGPT can handle it as a single image; hence we need to mask the portion
        # of the mask prompt pixels that will be generated
        if bool_masked_pos is None:
            num_patches = self.embeddings.patch_embeddings.num_patches
            bool_masked_pos = torch.zeros(num_patches, dtype=torch.bool).to(pixel_values.device)
            bool_masked_pos[num_patches // 2 :] = 1
            bool_masked_pos = bool_masked_pos.unsqueeze(0)

        embedding_output = self.embeddings(
            pixel_values, prompt_pixel_values, embedding_type=embedding_type, bool_masked_pos=bool_masked_pos
        )

        encoder_outputs = self.encoder(
            embedding_output,
            feature_ensemble=feature_ensemble,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


def patchify(tensor: torch.Tensor, patch_size: int) -> torch.Tensor:
    batch_size, num_channels, height, width = tensor.shape
    patch_height = height // patch_size
    patch_width = width // patch_size

    tensor = tensor.reshape(shape=(batch_size, num_channels, patch_height, patch_size, patch_width, patch_size))
    tensor = tensor.permute(0, 2, 4, 3, 5, 1)
    tensor = tensor.reshape(shape=(batch_size, patch_height * patch_width, patch_size**2 * 3))

    return tensor


def unpatchify(tensor: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
    batch_size = tensor.shape[0]
    patch_size = int((tensor.shape[-1] / 3) ** 0.5)
    if patch_height * patch_width != tensor.shape[1]:
        raise ValueError(
            f"Number of patches {tensor.shape[1]} does not match patch height ({patch_height}) and width ({patch_width})"
        )

    tensor = tensor.reshape(shape=(batch_size, patch_height, patch_width, patch_size, patch_size, 3))
    tensor = tensor.permute(0, 5, 1, 3, 2, 4)
    tensor = tensor.reshape(shape=(batch_size, 3, patch_height * patch_size, patch_width * patch_size))

    return tensor


class SegGptLoss(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.beta = config.beta
        self.patch_size = config.patch_size

    def forward(
        self,
        prompt_masks: torch.FloatTensor,
        pred_masks: torch.FloatTensor,
        labels: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
    ):
        """Computes the L1 loss between the predicted masks and the ground truth masks.

        Args:
            prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values from mask prompt.

            pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
                Predicted masks.

            labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Ground truth mask for input images.

            bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
                Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:
            `torch.FloatTensor`: The mean L1 loss between the predicted masks and the ground truth masks.
        """
        ground_truth = torch.cat((prompt_masks, labels), dim=2)

        mask = bool_masked_pos[:, :, None].repeat(1, 1, self.patch_size**2 * 3)
        mask = unpatchify(mask, ground_truth.shape[2] // self.patch_size, ground_truth.shape[3] // self.patch_size)

        loss = F.smooth_l1_loss(pred_masks, ground_truth, reduction="none", beta=self.beta)
        loss = (loss * mask).sum() / mask.sum()  # mean loss on masked (generated) patches

        return loss
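

# --- Editor's illustrative sketch (not part of the original modeling code) ---
# How the default inference mask is built: the prompt mask and the (to-be-predicted)
# target mask are concatenated along the height axis, so the model hides the bottom
# half of that canvas and inpaints it. SegGptLoss then weights the smooth-L1 loss so
# that only this hidden half contributes. A stand-alone version of the default mask:
def _sketch_default_bool_masked_pos(num_patches: int) -> torch.BoolTensor:
    bool_masked_pos = torch.zeros(num_patches, dtype=torch.bool)
    bool_masked_pos[num_patches // 2 :] = True  # hide the second (generated) half
    return bool_masked_pos.unsqueeze(0)  # shape (1, num_patches)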


@add_start_docstrings(
    "SegGpt model with a decoder on top for one-shot image segmentation.",
    SEGGPT_START_DOCSTRING,
)
class SegGptForImageSegmentation(SegGptPreTrainedModel):
    def __init__(self, config: SegGptConfig):
        super().__init__(config)
        self.config = config

        self.model = SegGptModel(config)
        self.decoder = SegGptDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(SEGGPT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SegGptImageSegmentationOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        prompt_pixel_values: torch.Tensor,
        prompt_masks: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        feature_ensemble: Optional[bool] = None,
        embedding_type: Optional[str] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SegGptImageSegmentationOutput]:
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
            Ground truth mask for input images.

        Returns:

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
        >>> print(list(result.shape))
        [170, 297]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if bool_masked_pos is None:
            num_patches = self.model.embeddings.patch_embeddings.num_patches
            bool_masked_pos = torch.zeros(num_patches, dtype=torch.bool).to(pixel_values.device)
            bool_masked_pos[num_patches // 2 :] = 1
            bool_masked_pos = bool_masked_pos.unsqueeze(0)

        outputs = self.model(
            pixel_values=pixel_values,
            prompt_pixel_values=prompt_pixel_values,
            prompt_masks=prompt_masks,
            bool_masked_pos=bool_masked_pos,
            feature_ensemble=feature_ensemble,
            embedding_type=embedding_type,
            labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        intermediate_hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[-1]
        intermediate_hidden_states = torch.cat(intermediate_hidden_states, dim=-1)
        pred_masks = self.decoder(intermediate_hidden_states)

        loss = None
        if labels is not None:
            loss_fn = SegGptLoss(self.config)
            loss = loss_fn(prompt_masks, pred_masks, labels, bool_masked_pos)

        if not return_dict:
            output = (pred_masks,)
            if output_hidden_states:
                output = output + (outputs[1],)

            if output_attentions:
                idx = 2 if output_hidden_states else 1
                output = output + (outputs[idx],)

            if loss is not None:
                output = (loss,) + output
            return output

        return SegGptImageSegmentationOutput(
            loss=loss,
            pred_masks=pred_masks,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )