
    sgs                       d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZm Z m!Z!  ejD                  e#      Z$dZ%de
jL                  de
jL                  fdZ'de
jL                  de
jL                  fdZ(e G d de             Z)e G d de             Z*e G d de             Z+ G d dejX                        Z- G d dejX                        Z. G d dejX                        Z/ G d  d!ejX                        Z0 G d" d#ejX                        Z1 G d$ d%e      Z2d&Z3d'Z4d(Z5d)Z6 G d* d+ejX                        Z7 G d, d-ejX                        Z8 G d. d/e2      Z9 G d0 d1ejX                        Z: G d2 d3e2      Z; ee3       G d4 d5e2             Z< G d6 d7ejX                        Z= G d8 d9e2      Z> ed:e3       G d; d<e2             Z?y)=zPyTorch CLIPSeg model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfigzCIDAS/clipseg-rd64-refinedlogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    _/var/www/html/venv/lib/python3.12/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_lossr%   1   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r%   t)r'   caption_loss
image_losss      r$   clipseg_lossr,   6   s,    #J/L!*,,.1J:%,,r&   c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
ej                  ed<   dZej                  ed<   dZej                  ed<   dZeed<   dZeed	<   d
ee   fdZy)CLIPSegOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r4   r5   Ngetattrto_tuple.0kselfs     r$   	<genexpr>z)CLIPSegOutput.to_tuple.<locals>.<genexpr>\   s=      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysr>   s   `r$   r:   zCLIPSegOutput.to_tuple[   #     
YY[
 
 	
r&   )__name__
__module____qualname____doc__r/   r   r!   FloatTensor__annotations__r0   r1   r2   r3   r4   r   r5   r   r   r:    r&   r$   r.   r.   <   s    ( )-D(5$$
%,*.e''.)-OU&&-%)K"")&*L%##*48186:3:
%* 
r&   r.   c                       e Zd ZU dZdZej                  ed<   dZe	e
ej                        ed<   dZe	e
ej                        ed<   y)CLIPSegDecoderOutputa  
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr   hidden_states
attentions)rF   rG   rH   rI   r   r!   rJ   rK   rO   r   r   rP   rL   r&   r$   rN   rN   b   sM     !%FE$8<M8E%"3"345<59Ju00129r&   rN   c                       e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
ej                  ed<   dZej                  ed<   dZeed<   dZeed<   d	ee   fd
Zy)CLIPSegImageSegmentationOutputa,  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nr/   r   conditional_embeddingspooled_outputr5   decoder_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r5   rU   Nr8   r;   s     r$   r?   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   s<      
  IIDGwW[]^O_OhOhOjj
r@   rA   rD   s   `r$   r:   z'CLIPSegImageSegmentationOutput.to_tuple   rE   r&   )rF   rG   rH   rI   r/   r   r!   rJ   rK   r   rS   rT   r5   r   rU   rN   r   r   r:   rL   r&   r$   rR   rR   v   sv     )-D(5$$
%, $FE$04E--4'+M5$$+6:3:+/N(/
%* 
r&   rR   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPSegVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rZ   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr!   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr"   expandr>   rZ   	__class__s     r$   rh   z CLIPSegVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr&   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nrd   g      ?r	   ra   bicubicF)sizemodealign_cornersdim)shaperv   weight	unsqueezer!   jit
is_tracingrb   rl   r   reshapepermuter   r   interpolateviewcat)r>   r{   r|   r}   rs   rv   rt   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r$   interpolate_pos_encodingz0CLIPSegVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr&   pixel_valuesc                     |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  |      }|j	                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )	NzInput image size (*z) doesn't match model ().ra   r   rd   r   )r   rk   
ValueErrorrr   flatten	transposero   rx   r!   r   r   rv   rb   )
r>   r   r   
batch_size_r|   r}   patch_embedsclass_embedsr{   s
             r$   forwardzCLIPSegVisionEmbeddings.forward   s   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4OPTP_P_O``abfbqbqarrtu  ++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr&   )T)rF   rG   rH   r   rh   r!   Tensorintr   rJ   r   __classcell__rz   s   @r$   rY   rY      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Y^YeYe r&   rY   c            	            e Zd Zdef fdZ	 	 	 ddeej                     deej                     deej                     dej                  fdZ
 xZS )	CLIPSegTextEmbeddingsrZ   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nrb   rc   Fre   )rg   rh   ri   r   ru   
vocab_sizetoken_embeddingmax_position_embeddingsrv   rw   r!   r"   rx   r>   rZ   rj   rz   s      r$   rh   zCLIPSegTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r&   	input_idsrb   inputs_embedsr   c                     ||j                   d   n|j                   d   }|| j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nrd   )r   rb   r   rv   )r>   r   rb   r   
seq_lengthposition_embeddingsr{   s          r$   r   zCLIPSegTextEmbeddings.forward   s{     -6,AY__R(}GZGZ[]G^
,,Q^<L  00;M"55lC"%88
r&   )NNN)rF   rG   rH   r   rh   r   r!   
LongTensorrJ   r   r   r   r   s   @r$   r   r      sk    

0 

 153759	E,,- u//0   1 12	
 
r&   r   c                        e Zd ZdZ fdZdej                  dedefdZ	 	 	 ddej                  de	ej                     d	e	ej                     d
e	e
   deej                  e	ej                     f   f
dZ xZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 
   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         )rg   rh   rZ   ri   rj   num_attention_heads	num_headshead_dimr   scaleattention_dropoutdropoutr   Lineark_projv_projq_projout_projry   s     r$   rh   zCLIPSegAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar&   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   ra   )r   r   r   r   
contiguous)r>   r   r   r   s       r$   _shapezCLIPSegAttention._shape  s7    {{3GQQRSUVWbbddr&   rO   attention_maskcausal_attention_maskoutput_attentionsr   c                    |j                         \  }}}| j                  |      | j                  z  }| j                  | j	                  |      d|      }	| j                  | j                  |      d|      }
|| j                  z  d| j                  f} | j                  |||      j                  | } |	j                  | }	 |
j                  | }
|	j                  d      }t        j                  ||	j                  dd            }|j                         || j                  z  ||fk7  r/t        d|| j                  z  ||f d|j                                |{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }|{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }t        j                  j                  |d      }|r?|j                  || j                  ||      }|j                  || j                  z  ||      }nd}t        j                  j!                  || j                   | j"                  	      }t        j                  ||
      }|j                         || j                  z  || j                  fk7  r7t        d
|| j                  || j                  f d|j                                |j                  || j                  || j                        }|j                  dd      }|j%                  |||      }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelrd   r   ra   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   r   r!   bmmr   r   r   r   softmaxr   r   r   r   )r>   rO   r   r   r   r   tgt_lenrj   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                    r$   r   zCLIPSegAttention.forward  s    #0"4"4"6Wi {{=1DJJ>[[]!;RE
{{4;;}#=r3GDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(*  !,$))+Q/II 7a'8R7S T-22457  (,,S$..'7SVkkL',,S4>>-A7GTL%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1!))#w	BmmK0111r&   )NNF)rF   rG   rH   rI   rh   r!   r   r   r   r   boolr   r   r   r   s   @r$   r   r     s    GB&eU\\ eC ec e 268<,1L2||L2 !.L2  (5	L2
 $D>L2 
u||Xell33	4L2r&   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
CLIPSegMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)rg   rh   rZ   r
   
hidden_actactivation_fnr   r   ri   intermediate_sizefc1fc2ry   s     r$   rh   zCLIPSegMLP.__init__o  sd    #F$5$5699V//1I1IJ99V55v7I7IJr&   rO   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r>   rO   s     r$   r   zCLIPSegMLP.forwardv  s4    /**=9/r&   )rF   rG   rH   rh   r!   r   r   r   r   s   @r$   r   r   n  s$    KU\\ ell r&   r   c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
CLIPSegEncoderLayerrZ   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)epsrg   rh   ri   rj   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2ry   s     r$   rh   zCLIPSegEncoderLayer.__init__  m    ++)&1<<F<Q<QRf%<<F<Q<QRr&   rO   r   r   r   r   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rO   r   r   r   )r   r   r   r   r>   rO   r   r   r   residualr   outputss           r$   r   zCLIPSegEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr&   F)rF   rG   rH   r   rh   r!   r   r   r   r   rJ   r   r   r   s   @r$   r   r   ~  sf    S} S -2&||& &  %||	&
 $D>& 
u  	!&r&   r   c                   "    e Zd ZdZeZdZdZd Zy)CLIPSegPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    clipTc                 L
   | j                   j                  }t        |t              rj|j                  j
                  j                  j                  d|dz         |j                  j
                  j                  j                  d|dz         nt        |t              r| j                   j                  }t        j                  j                  |j                  d|j                  dz  |z         t        j                  j                  |j                  j
                  |j                   j                  |z         t        j                  j                  |j                  j
                  |j                   j                  |z         nt        |t               r-| j                   j                  }|j                  dz  d|j                   j"                  z  dz  z  |z  }|j                  dz  |z  }t        j                  j                  |j$                  j
                  |       t        j                  j                  |j&                  j
                  |       t        j                  j                  |j(                  j
                  |       t        j                  j                  |j*                  j
                  |       nt        |t,              r| j                   j                  }|j                   j.                  dz  d|j                   j"                  z  dz  z  |z  }d|j                   j.                  z  dz  |z  }t        j                  j                  |j0                  j
                  |       t        j                  j                  |j2                  j
                  |       nt        |t4              rt        j                  j                  |j6                  j
                  |j8                  dz  | j                   j                  z         t        j                  j                  |j:                  j
                  |j<                  dz  | j                   j                  z         t        |t        j>                        rI|j@                  j                  jC                          |j
                  j                  jE                  d       t        |t        jF                        r2|j@                  %|j@                  j                  jC                          yyy)	zInitialize the weightsg        g{Gz?)meanstdr   )r   ra   g      ?N)$rZ   initializer_factor
isinstancer   r   r   datanormal_rv   rY   r   initro   rj   rr   initializer_ranger   num_hidden_layersr   r   r   r   r   ri   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   r`   zero_fill_r   )r>   modulefactorin_proj_stdout_proj_stdfc_stds         r$   _init_weightsz$CLIPSegPreTrainedModel._init_weights  s   //f34""))..66CVd]6S%%,,1199sQU9V 78[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?-GGOO&&--))4/$++2P2PP   GGOO((//++T1DKK4R4RR  
 fbll+KK""$MM$$S)fbii(V[[-DKK""$ .E(r&   N)	rF   rG   rH   rI   r   config_classbase_model_prefixsupports_gradient_checkpointingr  rL   r&   r$   r   r     s    
 !L&*#'%r&   r   aI  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CLIPSegConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `True`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `True`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                     deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rZ   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rg   rh   rZ   r   
ModuleListranger  r   layersgradient_checkpointing)r>   rZ   r   rz   s      r$   rh   zCLIPSegEncoder.__init__R  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %ks   A#r   r   r   output_hidden_statesreturn_dictr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]b  \  }
}|r||	fz   }| j                  r,| j                  r | j                  |j                  |	|||      }n ||	|||      }|d   }	|sZ||d   fz   }d |r||	fz   }|st        d |	||fD              S t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrL   )r   r   r   c              3   &   K   | ]	  }||  y wr   rL   r<   vs     r$   r?   z)CLIPSegEncoder.forward.<locals>.<genexpr>  s     eqWXWde   )last_hidden_staterO   rP   )rZ   r   r  use_return_dict	enumerater  r  r   _gradient_checkpointing_func__call__rB   r   )r>   r   r   r   r   r  r  encoder_statesall_attentionsrO   idxencoder_layerlayer_outputss                r$   r   zCLIPSegEncoder.forwardX  sH   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+>Vd
 	
r&   NNNNN)rF   rG   rH   rI   r   rh   r   r!   r   r   r   r   r   r   r   r   s   @r$   r  r  I  s    ,} , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
r&   r  c                        e Zd Zdef fdZ ee       eee      	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e   de	e   d	e	e   d
eeef   fd              Z xZS )CLIPSegTextTransformerrZ   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        |j                  | _        y r   )rg   rh   rZ   ri   r   r{   r  encoderr   r   r   final_layer_normeos_token_idr   s      r$   rh   zCLIPSegTextTransformer.__init__  sa    &&	/7%f- "YF<Q<Q R #//r&   output_typer  r   r   rb   r   r  r  r   c                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j                         }|j                  d|d         }| j                  ||      }t        ||j                  |j                        }	|t        ||j                        }| j                  |||	|||      }
|
d   }| j                  |      }| j                  dk(  rm|t        j                   |j"                  d   |j                        |j%                  t        j&                  |j                  	      j)                  d
      f   }n|t        j                   |j"                  d   |j                        |j%                  t        j&                  |j                  	      | j                  k(  j'                         j)                  d
      f   }|s
||f|
dd z   S t+        |||
j,                  |
j.                        S )
        Returns:

        NzYou have to specify input_idsrd   )r   rb   r   )r   r   r   r   r  r  r   ra   )dtyper   r   r   r#  pooler_outputrO   rP   )rZ   r   r  r$  r   r   r   r{   r   r8  r   r   r1  r2  r3  r!   r"   r   tor   argmaxr   rO   rP   )r>   r   r   rb   r   r  r  input_shaperO   r   encoder_outputsr#  rT   s                r$   r   zCLIPSegTextTransformer.forward  s.     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&NN2{27	),W !A,,]5I5I!
 %7H[H[\N,,')"7/!5# ' 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %}58KKK)/')77&11	
 	
r&   NNNNNN)rF   rG   rH   r   rh   r   CLIPSEG_TEXT_INPUTS_DOCSTRINGr   r   r   r!   r   r   r   r   r   r   r   s   @r$   r/  r/    s    	00 	0 ++HI+ETef -115/3,0/3&*O
ELL)O
 !.O
 u||,	O

 $D>O
 'tnO
 d^O
 
u00	1O
 g JO
r&   r/  c                   "    e Zd ZeZddgZdef fdZdej                  fdZ	d Z
 ee       eee      	 	 	 	 	 	 dd	eej"                     d
eej"                     deej"                     dee   dee   dee   deeef   fd              Z xZS )CLIPSegTextModelr   r   rZ   c                 d    t         |   |       t        |      | _        | j	                          y r   )rg   rh   r/  
text_model	post_initry   s     r$   rh   zCLIPSegTextModel.__init__  s&     08r&   r   c                 B    | j                   j                  j                  S r   rD  r{   r   rD   s    r$   get_input_embeddingsz%CLIPSegTextModel.get_input_embeddings  s    ))999r&   c                 :    || j                   j                  _        y r   rG  )r>   values     r$   set_input_embeddingsz%CLIPSegTextModel.set_input_embeddings  s    5:""2r&   r4  r   r   rb   r   r  r  c                 0    | j                  ||||||      S )aM  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rb   r   r  r  )rD  )r>   r   r   rb   r   r  r  s          r$   r   zCLIPSegTextModel.forward  s,    8 )%/!5#  
 	
r&   r?  )rF   rG   rH   r   r  _no_split_modulesrh   r   ModulerH  rK  r   r@  r   r   r   r!   r   r   r   r   r   r   r   s   @r$   rB  rB    s    $L02GH0 :bii :; ++HI+ETef -115/3,0/3&*!
ELL)!
 !.!
 u||,	!

 $D>!
 'tn!
 d^!
 
u00	1!
 g J!
r&   rB  c                        e Zd Zdef fdZ ee       eee      	 	 	 	 dde	e
j                     de	e   de	e   de	e   de	e   d	eeef   fd
              Z xZS )CLIPSegVisionTransformerrZ   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rg   rh   rZ   ri   rY   r{   r   r   r   pre_layrnormr  r1  post_layernormr   s      r$   rh   z!CLIPSegVisionTransformer.__init__D  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr&   r4  r   r   r  r  r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  |      }| j                  ||||      }|d   }|dddddf   }	| j                  |	      }	|s
||	f|dd z   S t        ||	|j                  |j                        S )r7  N)r   )r   r   r  r  r   r   r9  )rZ   r   r  r$  r{   rS  r1  rT  r   rO   rP   )
r>   r   r   r  r  r   rO   r>  r#  rT   s
             r$   r   z CLIPSegVisionTransformer.forwardN  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r&   )NNNT)rF   rG   rH   r   rh   r   CLIPSEG_VISION_INPUTS_DOCSTRINGr   r   r   r!   rJ   r   r   r   r   r   r   s   @r$   rQ  rQ  B  s    Q2 Q ++JK+ETgh -1/3&*37(
u001(
 $D>(
 'tn	(

 d^(
 #+4.(
 
u00	1(
 i L(
r&   rQ  c                        e Zd ZeZdZdef fdZdej                  fdZ	 e
e       eee      	 	 	 	 	 ddeej                      dee   dee   d	ee   d
ee   deeef   fd              Z xZS )CLIPSegVisionModelr   rZ   c                 d    t         |   |       t        |      | _        | j	                          y r   )rg   rh   rQ  vision_modelrE  ry   s     r$   rh   zCLIPSegVisionModel.__init__  s'     4V<r&   r   c                 B    | j                   j                  j                  S r   )rZ  r{   rr   rD   s    r$   rH  z'CLIPSegVisionModel.get_input_embeddings  s      ++;;;r&   r4  r   r  r   r  c                 .    | j                  |||||      S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r   r  )rZ  )r>   r   r   r  r   r  s         r$   r   zCLIPSegVisionModel.forward  s,    @   %/!5%=# ! 
 	
r&   NNNTN)rF   rG   rH   r   r  main_input_namerh   r   rO  rH  r   rV  r   r   r   r!   rJ   r   r   r   r   r   r   s   @r$   rX  rX  {  s    &L$O2 <bii < ++JK+ETgh 59,0/337&*$
u001$
 $D>$
 'tn	$

 #+4.$
 d^$
 
u00	1$
 i L$
r&   rX  c                   d    e Zd ZeZdef fdZ ee      	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee   dee   dee   d	e	j                  fd
       Z ee      	 	 	 	 	 ddee	j                     dee   dee   dedee   d	e	j                  fd       Z ee       eee      	 	 	 	 	 	 	 	 	 ddee	j&                     dee	j                     dee	j                     dee	j&                     dee   dee   dee   dedee   d	eeef   fd              Z xZS )r  rZ   c                 P   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        |      | _        t        |      | _        t#        j$                  | j                  | j                  d      | _        t#        j$                  | j                  | j                  d      | _        t#        j*                  t-        j.                  | j0                  j2                              | _        | j7                          y )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)r`   )rg   rh   r   text_configr   	TypeErrortypevision_configr   projection_dimri   r  r	  r/  rD  rQ  rZ  r   r   r  r  rm   r!   r   rZ   logit_scale_init_valuelogit_scalerE  )r>   rZ   rc  rf  rz   s       r$   rh   zCLIPSegModel.__init__  sW    &,,.?@++,-Q0 
 &..0CD--./q2 
 ((,,$33)55 - 9 90=4]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r&   r   r   rb   r   r  r  r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||||      }|d   }| j                  |      }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```rM  r   )rZ   r   r  r$  rD  r  )
r>   r   r   rb   r   r  r  text_outputsrT   text_featuress
             r$   get_text_featureszCLIPSegModel.get_text_features  s    6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])%/!5# ' 
 %Q,,];r&   r   r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      }|d   }| j                  |      }|S )aI  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```r]  r   )rZ   r   r  r$  rZ  r  )	r>   r   r   r  r   r  vision_outputsrT   image_featuress	            r$   get_image_featureszCLIPSegModel.get_image_features  s    @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 'q)//>r&   r4  return_lossc
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }
| j                  ||||||	      }|
d   }| j                  |      }|d   }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                         }d}|rt        |      }|	s||||||
f}||f|z   S |S t        |||||||
	      S )
a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr]  rM  r   ra   rd   T)r   r   keepdim)r/   r0   r1   r2   r3   r4   r5   )rZ   r   r  r$  rZ  rD  r  r  normri  expr!   matmulr)   r,   r.   )r>   r   r   r   rb   rr  r   r  r   r  ro  rk  r3   r2   ri  r1   r0   r/   outputs                      r$   r   zCLIPSegModel.forward7  s   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )%/!5# ' 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,.0D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r&   r?  r^  )	NNNNNNNTN)rF   rG   rH   r   r  rh   r   r@  r   r!   r   r   rJ   rm  rV  rq  CLIPSEG_INPUTS_DOCSTRINGr   r.   r   r   r   r   r   r   s   @r$   r  r    s4    L} @ ++HI -115/3,0/3&*,ELL), !., u||,	,
 $D>, 'tn, d^, 
		, J,\ ++JK 59,0/3)-&*0u0010 $D>0 'tn	0
 #'0 d^0 
		0 L0d ++CD=}U 15481537&*,0/3)-&*[
E,,-[
 u001[
 !.	[

 u//0[
 d^[
 $D>[
 'tn[
 #'[
 d^[
 
um#	$[
 V E[
r&   r  c                        e Zd ZdZdef fdZ	 d
dej                  dej                  dej                  dee	   de
ej                     f
d	Z xZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rZ   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   r   ry   s     r$   rh   zCLIPSegDecoderLayer.__init__  r   r&   rO   r   r   r   r   c                     |}| j                  ||||      \  }}||z   }| j                  |      }|}| j                  |      }||z   }| j                  |      }|f}|r||fz  }|S r   )r   r   r   r   r   s           r$   r   zCLIPSegDecoderLayer.forward  s    " !&*nn')"7/	 '5 '
#| !=0((7 / =0((7 "&Gr&   r   )rF   rG   rH   rI   r   rh   r!   r   r   r   r   rJ   r   r   r   s   @r$   r{  r{    sk    S} S -2'||' '  %||	'
 $D>' 
u  	!'r&   r{  c                        e Zd Zdef fdZ	 	 	 d	deej                     dej                  dee	   dee	   dee	   f
dZ
 xZS )
CLIPSegDecoderrZ   c                    t         |   |       |j                  | _        t        j                  |j
                  |j                        | _        t        j                  |j
                  |j                        | _        |j                  r|j                  j                  dz  |j                  j                  dz  f}t        j                  t        j                  |j                  |j                  dd      t        j                         t        j                  |j                  |j                  dz  |d   |d         t        j                         t        j                  |j                  dz  d|d   |d               | _        nPt        j                  |j                  d|j                  j                  |j                  j                        | _        t#        |j$                        }t        j&                  t)        |      D cg c]6  }t        j                  |j                  j*                  |j                        8 c}      | _        t/        j0                  |j                        }|j                  |_        |j2                  |_        |j6                  |_        d	|_        t        j&                  t)        t#        |j$                              D cg c]  }t=        |       c}      | _        y c c}w c c}w )
N   r	   r   )r^   paddingra   r   )r^   r_   )r_   relu) rg   rh   conditional_layerr   r   rg  
reduce_dimfilm_mulfilm_add"use_complex_transposed_convolutionrf  rl   
Sequentialrp   ReLUConvTranspose2dtransposed_convolutionr#   extract_layersr  r  ri   reducescopydeepcopydecoder_num_attention_headsr   decoder_intermediate_sizer   r   r{  r  )r>   rZ   transposed_kernelsdepthr   decoder_configrz   s         r$   rh   zCLIPSegDecoder.__init__  sQ    !'!9!9		&"7"79J9JK		&"7"79J9JK44"("6"6"A"AQ"FH\H\HgHgklHl!m*,--		&++V->->AWXY	""%%%%* 21 5-a0	 	""%%*A;Ma;PYklmYn+D' +-*<*<!!1f&:&:&E&EfNbNbNmNm+D' F))*}}UZ[`UabPQRYYv++779J9JKb
 v';';<%+%6%6"-3-O-O*+1+K+K($*!mmRWX[\b\q\qXrRs$tQ%8%H$tu c %us   ;K9K>rO   rS   r   r  r  c                 @   |rdnd }|rdnd }|d d d   }d }	t        t        || j                  | j                              D ]  \  }
\  }}}|	 ||      |	z   }	n ||      }	|
| j                  k(  rJ| j                  |      |	j                  ddd      z  | j                  |      z   }	|	j                  ddd      }	 ||	d d |      }|d   }	|r||	fz  }|s||d   fz  } |	d d dd d d f   j                  ddd      }	t        t        j                  |	j                  d               }|j                  d   }|	j                  ||	j                  d   ||      }	| j                  |	      j                  d      }|st        d |||fD              S t!        |||      S )	NrL   rd   r   r   ra   )r   r   r   c              3   &   K   | ]	  }||  y wr   rL   r   s     r$   r?   z)CLIPSegDecoder.forward.<locals>.<genexpr>+  s     aqSTS`ar"  )r   rO   rP   )r%  zipr  r  r  r  r   r  r   mathsqrtr   r   r  squeezerB   rN   )r>   rO   rS   r   r  r  all_hidden_statesr)  activationsrx  i
activationlayerreducer,  r   r   r   s                     r$   r   zCLIPSegDecoder.forward  s    #7BD0d#DbD).7KVZVbVb8c.d 	6*A*
E6!
+f4
+D***'=>PQSTVWAXX[_[h[h*\   1a0!t4[lM #1%F#!fY.! =#3"55-	60 12q!))!Q2499V\\!_-.+11!4
Za$E,,V4<<Q?aV->$Oaaa#+%
 	
r&   )NNT)rF   rG   rH   r   rh   r   r!   r   r   r   r   r   r   s   @r$   r  r    sk    (v} (v\ -1/3&*6
U\\*6
 !&6
 $D>	6

 'tn6
 d^6
r&   r  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    c                   
    e Zd ZeZdef fdZ	 	 	 	 	 ddedeej                     deej                     deej                     deej                     f
dZ
 ee       eee	      	 	 	 	 	 	 	 	 	 	 	 ddeej                      d
eej                      deej                      deej                      deej                     deej"                     deej"                     dee   dee   dedee   deeef   fd              Z xZS )CLIPSegForImageSegmentationrZ   c                     t         |   |       || _        t        |      | _        |j
                  | _        t        |      | _        | j                          y r   )	rg   rh   rZ   r  r   r  r  decoderrE  ry   s     r$   rh   z$CLIPSegForImageSegmentation.__init__=  sI      (	$33%f- 	r&   r   r   r   rb   conditional_pixel_valuesc                    |Vt        |      |k7  rt        d      t        j                         5  | j                  j                  |||      }d d d        |S |St        |      |k7  rt        d      t        j                         5  | j                  j                  |      }d d d        |S t        d      # 1 sw Y   S xY w# 1 sw Y   S xY w)Nz@Make sure to pass as many prompt texts as there are query images)r   rb   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r#   r   r!   no_gradr   rm  rq  )r>   r   r   r   rb   r  rS   s          r$   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddingsJ  s      9~+ !cdd )-)D)Dn< *E *& &% &1+,
: !dee `)-)E)EF^)_&` &%	 m  &%` &%s   B9C9CCr4  r   rS   labelsr   r  r   r  r   c                 "   ||n| j                   j                  }t        j                         5  | j                  j                  ||d|
|      }| j                  j                  |d         }|r|j                  n|d   }| j                  D cg c]
  }||dz       }}|r<t        |j                  |j                  |	r|j                  nd|j                        }n|	s|dd |dd z   n|}ddd       |$| j                  |j                  d   ||||	      }n[|j                  d   |j                  d   k7  rt        d
      |j                  d   | j                   j                   k7  rt        d      | j#                  |||	|      }|r|j$                  n|d   }d}|8|j'                  |j(                        }t+        j,                         } |||      }|s|||f}||f|z   S |S t/        ||||      S c c}w # 1 sw Y   xY w)a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr]  r   ra   r9  r	   r   )r   r   r   rb   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r  r  )r/   r   rS   rT   r5   rU   )rZ   r$  r!   r  r   rZ  r  rO   r  r   r#  r:  rP   r  r   r   rg  r  r   r;  r   r   BCEWithLogitsLossrR   )r>   r   r   r  rS   r   rb   r  r   r  r   r  ro  rT   rO   r  r  decoder_outputsr   r/   loss_fnrx  s                         r$   r   z#CLIPSegForImageSegmentation.forwardg  sn   Z &1%<k$++B]B] ]]_ 	!YY33)"3%))A' 4 N !II77q8IJM<GN88^\]M^M9=9L9LMA=Q/MKM !;&4&F&F"0">">BV.">">\`-88	" DXN2A&);;]k /	8 ")%)%D%D'--a0#-))A &E &" &++A.,2D2DQ2GG m  &++A.$++2L2LL 0  ,,"/!5# ' 
 ,7''OA<NYYv}}-F**,G66*D4m^UdeF)-)9TGf$EvE-#9' .*
 	
q N	 	s   A HG?AH?HHr-  )NNNNNNNNNTN)rF   rG   rH   r   r  rh   r   r   r!   r   r  r   ry  r   rR   r   rJ   r   r   r   r   r.   r   r   r   s   @r$   r  r  4  s    !L}  ,015/3;?&& ELL)& !.	&
 u||,& #+5<<"8&: ++CD+IXij 2648@D>B1537-1,0/3)-&*y
E--.y
 u001y
 #+5+<+<"=	y

 !)):): ;y
 !.y
 u//0y
 ))*y
 $D>y
 'tny
 #'y
 d^y
 
um#	$y
 k Ey
r&   r  )@rI   r  r  dataclassesr   typingr   r   r   r   r!   torch.utils.checkpointr   r  r
   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   r   configuration_clipsegr   r   r   
get_loggerrF   logger_CHECKPOINT_FOR_DOCr   r%   r,   r.   rN   rR   rO  rY   r   r   r   r   r   CLIPSEG_START_DOCSTRINGr@  rV  ry  r  r/  rB  rQ  rX  r  r{  r  r  rL   r&   r$   <module>r     s7      ! . .    ! d K -  Y X 
		H	% 3 
`U\\ `ell `
-U\\ -ell - !
K !
 !
H :; : :& 
[ 
 
0Pbii PhBII De2ryy e2R  /")) /d1%_ 1%h	 ! @# "% R^
RYY ^
B^
RYY ^
B4
- 4
n6
ryy 6
r3
/ 3
l -.b
) b
 /b
J6")) 6ra
+ a
H  	h
"8 h
h
r&   