
"""PyTorch GroupViT model."""

import collections.abc
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


def hard_softmax(logits: torch.Tensor, dim: int):
    y_soft = logits.softmax(dim)
    # Straight through.
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
    ret = y_hard - y_soft.detach() + y_soft

    return ret


def gumbel_softmax(logits: torch.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> torch.Tensor:
    gumbel_dist = torch.distributions.gumbel.Gumbel(
        torch.tensor(0.0, device=logits.device, dtype=logits.dtype),
        torch.tensor(1.0, device=logits.device, dtype=logits.dtype),
    )
    gumbels = gumbel_dist.sample(logits.shape)

    # ~Gumbel(logits, tau)
    gumbels = (logits + gumbels) / tau

    y_soft = gumbels.softmax(dim)

    if hard:
        # Straight through.
        index = y_soft.max(dim, keepdim=True)[1]
        y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
        ret = y_hard - y_soft.detach() + y_soft
    else:
        # Reparametrization trick.
        ret = y_soft

    return ret


def resize_attention_map(attentions, height, width, align_corners=False):
    """
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corners` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
    """
    scale = (height * width // attentions.shape[2]) ** 0.5
    if height > width:
        feat_width = int(np.round(width / scale))
        feat_height = attentions.shape[2] // feat_width
    else:
        feat_height = int(np.round(height / scale))
        feat_width = attentions.shape[2] // feat_height

    batch_size = attentions.shape[0]
    # number of group tokens
    groups = attentions.shape[1]
    # [batch_size, groups, height x width] -> [batch_size, groups, height, width]
    attentions = attentions.reshape(batch_size, groups, feat_height, feat_width)
    attentions = nn.functional.interpolate(
        attentions, size=(height, width), mode="bilinear", align_corners=align_corners
    )
    return attentions


def get_grouping_from_attentions(attentions, hw_shape):
    """
    Args:
        attentions (`tuple(torch.FloatTensor)`): tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    """

    attn_maps = []
    with torch.no_grad():
        prev_attn_masks = None
        for attn_masks in attentions:
            # [batch_size, num_groups, height x width] -> [batch_size, height x width, num_groups]
            attn_masks = attn_masks.permute(0, 2, 1).contiguous()
            if prev_attn_masks is None:
                prev_attn_masks = attn_masks
            else:
                prev_attn_masks = prev_attn_masks @ attn_masks
            # [batch_size, height x width, num_groups] -> [batch_size, num_groups, height, width]
            cur_attn_map = resize_attention_map(prev_attn_masks.permute(0, 2, 1).contiguous(), *hw_shape)
            attn_maps.append(cur_attn_map)

    # [batch_size, num_groups, height, width]
    final_grouping = attn_maps[-1]

    return final_grouping


class GroupViTCrossAttentionLayer(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.attn = GroupViTAttention(config)
        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = GroupViTMLP(config)
        self.norm_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, query, key):
        x = query
        x = x + self.attn(query, encoder_hidden_states=key)[0]
        x = x + self.mlp(self.norm2(x))
        x = self.norm_post(x)
        return x


class GroupViTAssignAttention(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()
        self.scale = config.hidden_size**-0.5

        self.q_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.assign_eps = config.assign_eps

    def get_attn(self, attn, gumbel=True, hard=True):
        if gumbel and self.training:
            attn = gumbel_softmax(attn, dim=-2, hard=hard)
        else:
            if hard:
                attn = hard_softmax(attn, dim=-2)
            else:
                attn = nn.functional.softmax(attn, dim=-2)

        return attn

    def forward(self, query, key):
        value = key
        # [batch_size, query_length, channels]
        query = self.q_proj(query)

        # [batch_size, key_length, channels]
        key = self.k_proj(key)

        # [batch_size, key_length, channels]
        value = self.v_proj(value)

        # [batch_size, query_length, key_length]
        raw_attn = (query @ key.transpose(1, 2)) * self.scale

        attn = self.get_attn(raw_attn)
        soft_attn = self.get_attn(raw_attn, gumbel=False, hard=False)

        attn = attn / (attn.sum(dim=-1, keepdim=True) + self.assign_eps)

        out = attn @ value

        out = self.proj(out)

        return out, soft_attn


class GroupViTTokenAssign(nn.Module):
    def __init__(self, config: GroupViTVisionConfig, num_group_token, num_output_group):
        super().__init__()
        self.num_output_group = num_output_group
        # norm on group_tokens
        self.norm_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        assign_mlp_ratio = (
            config.assign_mlp_ratio
            if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
            else (config.assign_mlp_ratio, config.assign_mlp_ratio)
        )
        tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
        self.mlp_inter = GroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group)
        self.norm_post_tokens = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # norm on x
        self.norm_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pre_assign_attn = GroupViTCrossAttentionLayer(config)

        self.assign = GroupViTAssignAttention(config)
        self.norm_new_x = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp_channels = GroupViTMLP(config, config.hidden_size, channels_dim, config.hidden_size)

    def project_group_token(self, group_tokens):
        """
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        """
        # [B, num_output_groups, C] <- [B, num_group_tokens, C]
        projected_group_tokens = self.mlp_inter(group_tokens)
        projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
        return projected_group_tokens

    def forward(self, image_tokens, group_tokens):
        """
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        """

        group_tokens = self.norm_tokens(group_tokens)
        image_tokens = self.norm_x(image_tokens)
        # [batch_size, num_output_groups, channels]
        projected_group_tokens = self.project_group_token(group_tokens)
        projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
        new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
        new_image_tokens += projected_group_tokens

        new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))

        return new_image_tokens, attention
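
# Illustrative sketch, not part of the original file: `GroupViTAssignAttention` above
# relies on the straight-through helpers defined earlier. With assumed shapes
# [batch, groups, tokens], both return a distribution over groups (dim=-2) that is
# discrete in the forward pass while keeping gradients from the soft softmax:
#
#     logits = torch.randn(2, 8, 196)
#     stochastic = gumbel_softmax(logits, tau=1.0, hard=True, dim=-2)  # training path
#     deterministic = hard_softmax(logits, dim=-2)  # inference path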

@dataclass
class GroupViTModelOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    segmentation_logits: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class GroupViTPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        num_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x


class GroupViTVisionEmbeddings(nn.Module):
    def __init__(self, config: GroupViTVisionConfig):
        super().__init__()

        self.patch_embeddings = GroupViTPatchEmbeddings(
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.hidden_size,
        )
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches, config.hidden_size))
        self.dropout = nn.Dropout(config.dropout)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1]
        num_positions = self.position_embeddings.shape[1]

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        patch_pos_embed = self.position_embeddings

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        embeddings = self.layernorm(embeddings)

        batch_size, seq_len, _ = embeddings.size()

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class GroupViTTextEmbeddings(nn.Module):
    def __init__(self, config: GroupViTTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class GroupViTStage(nn.Module):
    """This corresponds to the `GroupingLayer` class in the GroupViT implementation."""

    def __init__(
        self,
        config: GroupViTVisionConfig,
        depth: int,
        num_prev_group_token: int,
        num_group_token: int,
        num_output_group: int,
    ):
        super().__init__()
        self.depth = depth
        self.num_group_token = num_group_token
        if num_group_token > 0:
            self.group_token = nn.Parameter(torch.zeros(1, num_group_token, config.hidden_size))
        else:
            self.group_token = None
        self.layers = nn.ModuleList([GroupViTEncoderLayer(config) for _ in range(depth)])

        if num_group_token > 0:
            self.downsample = GroupViTTokenAssign(
                config=config,
                num_group_token=num_group_token,
                num_output_group=num_output_group,
            )
        else:
            self.downsample = None

        if num_prev_group_token > 0 and num_group_token > 0:
            self.group_projector = nn.Sequential(
                nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps),
                GroupViTMixerMLP(config, num_prev_group_token, config.hidden_size // 2, num_group_token),
            )
        else:
            self.group_projector = None

    @property
    def with_group_token(self):
        return self.group_token is not None

    def split_x(self, x):
        if self.with_group_token:
            return x[:, : -self.num_group_token], x[:, -self.num_group_token :]
        else:
            return x, None

    def concat_x(self, x: torch.Tensor, group_token: Optional[torch.Tensor] = None) -> torch.Tensor:
        if group_token is None:
            return x
        return torch.cat([x, group_token], dim=1)

    def forward(
        self,
        hidden_states: torch.Tensor,
        prev_group_token: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        """
        if self.with_group_token:
            group_token = self.group_token.expand(hidden_states.size(0), -1, -1)
            if self.group_projector is not None:
                group_token = group_token + self.group_projector(prev_group_token)
        else:
            group_token = None

        x = hidden_states

        cat_x = self.concat_x(x, group_token)
        for layer in self.layers:
            layer_out = layer(cat_x, attention_mask=None, causal_attention_mask=None)
            cat_x = layer_out[0]

        x, group_token = self.split_x(cat_x)

        attention = None
        if self.downsample is not None:
            x, attention = self.downsample(x, group_token)

        outputs = (x, group_token)

        if output_attentions:
            outputs = outputs + (attention,)

        return outputs


class GroupViTMLP(nn.Module):
    def __init__(
        self,
        config: GroupViTVisionConfig,
        hidden_size: Optional[int] = None,
        intermediate_size: Optional[int] = None,
        output_size: Optional[int] = None,
    ):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
        output_size = output_size if output_size is not None else hidden_size
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, output_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class GroupViTMixerMLP(GroupViTMLP):
    def forward(self, x):
        x = super().forward(x.transpose(1, 2))
        return x.transpose(1, 2)


class GroupViTAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()
        is_cross_attention = encoder_hidden_states is not None

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        if is_cross_attention:
            key_states = self._shape(self.k_proj(encoder_hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(encoder_hidden_states), -1, bsz)
        else:
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient; to do so, attn_weights have to be reshaped twice and reused
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class GroupViTEncoderLayer(nn.Module):
    def __init__(self, config: GroupViTConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = GroupViTAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = GroupViTMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class GroupViTPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = GroupViTConfig
    base_model_prefix = "groupvit"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        init_range = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=init_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        factor = self.config.initializer_factor
        if isinstance(module, GroupViTTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, GroupViTAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, GroupViTMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
aE  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   |     e Zd Zdeddf fdZ	 	 	 d
dej                  dee   dee   dee   de	e
ef   f
d	Z xZS )GroupViTVisionEncoderrr   r   Nc                 h   t         |           || _        t        j                  t        t        |j                              D cg c]P  }t        ||j                  |   |j                  |   |j                  |   |dkD  r|j                  |dz
     nd      R c}      | _        d| _        y c c}w )Nr   r   )rr   r!  r   r   r"  F)rw   rx   rr   r   r%  r&  r#   depthsr   num_group_tokensnum_output_groupsstagesgradient_checkpointing)r   rr   ir   s      r$   rx   zGroupViTVisionEncoder.__init__x  s    mm s6==12	  ! --*$*$;$;A$>%+%=%=a%@LMPQE)A)A!a%)HWX	
 ',#	s   AB/r4  output_hidden_statesr6  return_dictc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd }|rdnd }d }t	        | j
                        D ]3  \  }}	|r||fz   } |	|||      }
|
d   }|
d   }|s%|
d   +||
d   fz   }5 |r||fz   }|st        d |||fD              S t        |||      S )Nr   r   r   rN   c              3   &   K   | ]	  }||  y wr-  r   r   vs     r$   r   z0GroupViTVisionEncoder.forward.<locals>.<genexpr>  s     gqYZYfg   last_hidden_stater4  rZ   )rr   r6  r  use_return_dict	enumerater  r   r   )r   r4  r  r6  r  all_hidden_statesall_groupingsr   r  stagelayer_outputss              r$   r   zGroupViTVisionEncoder.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"6BD/T!$++. 
	DHAu#$58H$H!!-?PQM)!,M(+L ]1%5%A -q1A0C C
	D   1]4D Dg]4E}$Uggg+;LYf
 	
r&   r  )r   r   r   r   rx   r!   r   r   r   r   r   r   r   r   r   s   @r$   r  r  w  sq    ,3 , ,( 04,0&*%
||%
 'tn%
 $D>	%

 d^%
 
uo%	&%
r&   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                     deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )GroupViTTextEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    rr   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r>  )
rw   rx   rr   r   r%  r&  ry  r'  r(  r  )r   rr   r  r   s      r$   rx   zGroupViTTextEncoder.__init__  sP    mm5QWQiQiKj$ka%9&%A$kl&+# %ls   A#r8  r9  r6  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]b  \  }
}|r||	fz   }| j                  r,| j                  r | j                  |j                  |	|||      }n ||	|||      }|d   }	|sZ||d   fz   }d |r||	fz   }|st        d |	||fD              S t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr   )r6  r   r   c              3   &   K   | ]	  }||  y wr-  r   r  s     r$   r   z.GroupViTTextEncoder.forward.<locals>.<genexpr>  s     eqWXWder  r  )rr   r6  r  r  r  r(  r  r   _gradient_checkpointing_func__call__r   r   )r   r  r8  r9  r6  r  r  encoder_statesall_attentionsr4  idxencoder_layerr  s                r$   r   zGroupViTTextEncoder.forward  sH   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+>Vd
 	
r&   )NNNNN)r   r   r   r   r   rx   r   r!   r   r   r   r   r   r   r   r   s   @r$   r  r    s    ,1 , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
r&   r  c                        e Zd Zdef fdZ ee       eee      	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e   de	e   d	e	e   d
eeef   fd              Z xZS )GroupViTTextTransformerrr   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        |j                  | _        y rt   )rw   rx   rr   r|   r  r   r  encoderr   r{   r}   final_layer_normeos_token_idr  s      r$   rx   z GroupViTTextTransformer.__init__  sa    &&	08*62 "YF<Q<Q R #//r&   output_typer  r  r8  r  r6  r  r  r   c                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j                         }|j                  d|d         }| j                  ||      }t        ||j                  |j                        }	|t        ||j                        }| j                  |||	|||      }
|
d   }| j                  |      }| j                  dk(  rm|t        j                   |j"                  d   |j                        |j%                  t        j&                  |j                  	      j)                  d
      f   }n|t        j                   |j"                  d   |j                        |j%                  t        j&                  |j                  	      | j                  k(  j'                         j)                  d
      f   }|s
||f|
dd z   S t+        |||
j,                  |
j.                        S )
        Returns:

        NzYou have to specify input_idsrd   )r  r  r   )r  r8  r9  r6  r  r  r   rN   )rC   r   r   r   r  pooler_outputr4  rZ   )rr   r6  r  r  r   rR   r  r   r   rC   r   r   r  r  r  r!   r"   rI   torU   argmaxr   r4  rZ   )r   r  r8  r  r6  r  r  input_shaper4  r9  encoder_outputsr  pooled_outputs                r$   r   zGroupViTTextTransformer.forward  s.    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&NN2{27	),W !A,,]5I5I!

 %7H[H[\N,,')"7/!5# ' 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %}58KKK)/')77&11	
 	
r&   NNNNNN)r   r   r   r   rx   r   GROUPVIT_TEXT_INPUTS_DOCSTRINGr   r   r   r!   r   r   r   r   r   r   r   s   @r$   r  r    s    	01 	0 ++IJ+ETfg -115/3,0/3&*P
ELL)P
 !.P
 u||,	P

 $D>P
 'tnP
 d^P
 
u00	1P
 h KP
r&   r  c                       e Zd ZeZdef fdZdej                  fdZd Z	 e
e       eee      	 	 	 	 	 	 ddeej                      deej                      d	eej                      d
ee   dee   dee   deeef   fd              Z xZS )GroupViTTextModelrr   c                 d    t         |   |       t        |      | _        | j	                          y r-  )rw   rx   r  
text_model	post_initr   s     r$   rx   zGroupViTTextModel.__init__v  s&     1&9r&   r   c                 B    | j                   j                  j                  S r-  r  r   r  r   s    r$   get_input_embeddingsz&GroupViTTextModel.get_input_embeddings|  s    ))999r&   c                 :    || j                   j                  _        y r-  r  )r   r   s     r$   set_input_embeddingsz&GroupViTTextModel.set_input_embeddings  s    5:""2r&   r  r  r8  r  r6  r  r  c                 0    | j                  ||||||      S )aK  
        Returns:

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r  r8  r  r6  r  r  )r  )r   r  r8  r  r6  r  r  s          r$   r   zGroupViTTextModel.forward  s,    8 )%/!5#  
 	
r&   r  )r   r   r   r   r  rx   r   Moduler  r  r   r  r   r   r   r!   r   r   r   r   r   r   r   s   @r$   r  r  s  s    %L1 :bii :; ++IJ+ETfg -115/3,0/3&*!
ELL)!
 !.!
 u||,	!

 $D>!
 'tn!
 d^!
 
u00	1!
 h K!
r&   r  c                        e Zd Zdef fdZ ee       eee      	 	 	 	 d
de	e
j                     de	e   de	e   de	e   deeef   f
d	              Z xZS )GroupViTVisionTransformerrr   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        y rt   )rw   rx   rr   r|   r   r   r  r  r   r{   r}   r   r  s      r$   rx   z"GroupViTVisionTransformer.__init__  sP    &&	26:,V4iV5J5JKr&   r  r   r  r6  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |      }| j                  ||||      }|d   }| j                  |      }|j                  d      }|s
||f|dd z   S t        |||j                  |j                        S )r  Nz You have to specify pixel_values)r4  r  r6  r  r   r   r   r  )rr   r6  r  r  r   r   r  r   ro  r   r4  rZ   )	r   r   r  r6  r  r4  r  r  r  s	            r$   r   z!GroupViTVisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@5,,'!5/#	 ' 
 ,A. !NN+<=)..1.5%}58KKK)/')77&11	
 	
r&   NNNN)r   r   r   r   rx   r    GROUPVIT_VISION_INPUTS_DOCSTRINGr   r   r   r!   r   r   r   r   r   r   r   s   @r$   r  r    s    L3 L ++KL+EThi 59/3,0&*+
u001+
 'tn+
 $D>	+

 d^+
 
u00	1+
 j M+
r&   r  c                        e Zd ZeZdZdef fdZdefdZ e	e
       eee      	 	 	 	 ddeej                     dee   dee   d	ee   deeef   f
d
              Z xZS )GroupViTVisionModelr   rr   c                 d    t         |   |       t        |      | _        | j	                          y r-  )rw   rx   r  vision_modelr  r   s     r$   rx   zGroupViTVisionModel.__init__  s'     5f=r&   r   c                 B    | j                   j                  j                  S r-  )r  r   r   r   s    r$   r  z(GroupViTVisionModel.get_input_embeddings  s      ++<<<r&   r  r6  r  r  c                 ,    | j                  ||||      S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled states (mean over all tokens)
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(GROUPVIT_START_DOCSTRING)
class GroupViTModel(GroupViTPreTrainedModel):
    config_class = GroupViTConfig

    def __init__(self, config: GroupViTConfig):
        super().__init__(config)

        if not isinstance(config.text_config, GroupViTTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type GroupViTTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, GroupViTVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.projection_intermediate_dim = config.projection_intermediate_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = GroupViTTextTransformer(text_config)
        self.vision_model = GroupViTVisionTransformer(vision_config)

        self.visual_projection = nn.Sequential(
            nn.Linear(self.vision_embed_dim, self.projection_intermediate_dim, bias=True),
            nn.BatchNorm1d(self.projection_intermediate_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
        )
        self.text_projection = nn.Sequential(
            nn.Linear(self.text_embed_dim, self.projection_intermediate_dim, bias=True),
            nn.BatchNorm1d(self.projection_intermediate_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
        )
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTTextModel`].

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
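        >>> # illustrative follow-up, not in the original example: L2-normalize the
        >>> # embeddings, as `forward` does before computing cosine-similarity logits
        >>> text_features = text_features / text_features.norm(dim=-1, keepdim=True)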
        ```"""
        # Use GroupViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
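        >>> # illustrative follow-up, not in the original example: L2-normalize the
        >>> # embeddings so they can be compared to text features via dot products
        >>> image_features = image_features / image_features.norm(dim=-1, keepdim=True)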
        ```"""
        # Use GroupViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=GroupViTModelOutput, config_class=GroupViTConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_segmentation: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, GroupViTModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
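        >>> # illustrative extension, not in the original example: per-pixel group logits
        >>> # for zero-shot segmentation, shaped [batch, num_texts, height, width]
        >>> seg_outputs = model(**inputs, output_segmentation=True)
        >>> seg_logits = seg_outputs.segmentation_logits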
        ```"""
        # Use GroupViT model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_segmentation = (
            output_segmentation if output_segmentation is not None else self.config.output_segmentation
        )
        if output_segmentation:
            output_attentions = True
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        seg_logits = None
        if output_segmentation:
            # grouped features: [batch_size_image, num_group, hidden_size]
            image_group_embeds = vision_outputs[0]
            # [batch_size_image * num_group, hidden_size]
            image_group_embeds = self.visual_projection(image_group_embeds.reshape(-1, image_group_embeds.shape[-1]))
            if output_hidden_states:
                attentions = vision_outputs[3]
            else:
                attentions = vision_outputs[2]
            # [batch_size_image, num_group, height, width]
            grouping = get_grouping_from_attentions(attentions, pixel_values.shape[2:])

            # normalized features
            image_group_embeds = image_group_embeds / image_group_embeds.norm(dim=-1, keepdim=True)
            # [batch_size_image * num_group, batch_size_text]
            logits_per_image_group = torch.matmul(image_group_embeds, text_embeds.t()) * logit_scale
            # [batch_size_image, batch_size_text, num_group]
            logits_per_image_group = logits_per_image_group.reshape(
                image_embeds.shape[0], -1, text_embeds.shape[0]
            ).permute(0, 2, 1)

            # [batch_size_image, batch_size_text, height * width]
            flatten_grouping = grouping.reshape(grouping.shape[0], grouping.shape[1], -1)

            # [batch_size_image, batch_size_text, height, width]
            seg_logits = torch.matmul(logits_per_image_group, flatten_grouping) * logit_scale
            seg_logits = seg_logits.reshape(
                seg_logits.shape[0], seg_logits.shape[1], grouping.shape[2], grouping.shape[3]
            )

        loss = None
        if return_loss:
            loss = groupvit_loss(logits_per_text)

        if not return_dict:
            if seg_logits is not None:
                output = (
                    logits_per_image,
                    logits_per_text,
                    seg_logits,
                    text_embeds,
                    image_embeds,
                    text_outputs,
                    vision_outputs,
                )
            else:
                output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return GroupViTModelOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            segmentation_logits=seg_logits,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
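

# ---------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module: zero-shot
# segmentation with `GroupViTModel`. It assumes network access to download the
# `nvidia/groupvit-gcc-yfcc` checkpoint and a sample COCO image, and uses only
# the public API defined above plus standard PyTorch ops.
if __name__ == "__main__":
    import requests
    from PIL import Image

    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
    model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    texts = ["a photo of a cat", "a photo of a remote control"]

    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs, output_segmentation=True)

    # `segmentation_logits` is [batch, num_texts, height, width]; upsample to the
    # input resolution and take the per-pixel argmax over the text prompts.
    seg_logits = nn.functional.interpolate(
        outputs.segmentation_logits, size=image.size[::-1], mode="bilinear"
    )
    label_map = seg_logits.argmax(dim=1)[0]  # [height, width] indices into `texts`
    print(label_map.shape, label_map.unique().tolist())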