
    sg y                       d Z ddlZddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlZddlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'  ejP                  e)      Z*dZ+dZ,dZ-dZ.dZ/g dZ0dZ1dZ2ee'e%e&f   Z3e G d de             Z4e G d de             Z5e G d de             Z6 G d dejn                        Z8 G d dejn                        Z9 G d d ejn                        Z: G d! d"ejn                        Z; G d# d$ejn                        Z< G d% d&ejn                        Z= G d' d(ejn                        Z> G d) d*ejn                        Z? G d+ d,ejn                        Z@ G d- d.ejn                        ZA G d/ d0ejn                        ZBd1ZCd2ZDd3ZEeEeDz   ZFd4ZGeGeDz   ZHd5eDz   ZId6ZJeEeGz   eDz   eJz   ZKd7eGz   eEz   d8z   eDz   ZLd9ZM G d: d;e      ZN ed<eCj                  d=             G d> d?eN             ZP ed@eCj                  d=             G dA dBeN             ZQ edCeCj                  d=             G dD dEeN             ZR edFeCj                  dG=             G dH dIeN             ZS G dJ dKejn                        ZT G dL dMejn                        ZU G dN dOejn                        ZV edPeCj                  dQ=             G dR dSeN             ZW G dT dUejn                        ZX G dV dWejn                        ZY G dX dYejn                        ZZ G dZ d[ejn                        Z[ ed\eCj                  dG=      eMz          G d] d^eN             Z\y)_zPyTorch FLAVA model.    N)OrderedDict)	dataclass)AnyDictListOptionalSetTupleUnion)nn   )ACT2FN)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings	torch_int   )FlavaConfigFlavaImageCodebookConfigFlavaImageConfigFlavaMultimodalConfigFlavaTextConfigzfacebook/flava-fullzfacebook/flava-image-codebookr   r    r   )r         g$(~k@c                       e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeej                     ed<   dZee
   ed<   dZeej                     ed<   dZee
   ed<   d	ee   fd
Zy)FlavaModelOutputa  
    Output from FlavaModel containing embeddings and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
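
    For example, a minimal retrieval-style sketch (illustrative only; it assumes a [`FlavaModel`] instance named
    `model` and an `output` of this class with both modalities present, and mirrors how [`FlavaForPreTraining`]
    takes the first (classifier) token before projecting):

    ```python
    import torch.nn.functional as F

    # hypothetical helper code, not part of the API
    image_features = F.normalize(model.image_projection(output.image_embeddings[:, 0, :]), dim=-1)
    text_features = F.normalize(model.text_projection(output.text_embeddings[:, 0, :]), dim=-1)
    similarity = text_features @ image_features.t()
    ```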

    Args:
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].
    """

    image_embeddings: Optional[torch.FloatTensor] = None
    image_output: Optional[BaseModelOutputWithPooling] = None
    text_embeddings: Optional[torch.FloatTensor] = None
    text_output: Optional[BaseModelOutputWithPooling] = None
    multimodal_embeddings: Optional[torch.FloatTensor] = None
    multimodal_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_output", "image_output", "multimodal_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class FlavaLosses(ModelOutput):
    """Class representing pretraining losses from FLAVA model

    Args:
        mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.:
            Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
        mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.:
            Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
        itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.:
            Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
            masked pairs in FLAVA.
        global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.:
            Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
            data. This is calculated on unmasked images and texts.
        mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.:
            Masked Multimodal Modeling loss's image component calculated on paired image-text data.
        mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.:
            Masked Multimodal Modeling loss's text component calculated on paired image-text data.
    """

    mim: Optional[torch.FloatTensor] = None
    mlm: Optional[torch.FloatTensor] = None
    itm: Optional[torch.FloatTensor] = None
    global_contrastive: Optional[torch.FloatTensor] = None
    mmm_image: Optional[torch.FloatTensor] = None
    mmm_text: Optional[torch.FloatTensor] = None

    def all_none(self) -> bool:
        all_none = True
        for v in self.values():
            if v is not None:
                all_none = False
                break
        return all_none


@dataclass
class FlavaForPreTrainingOutput(ModelOutput):
    """
    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
            Total loss calculated for this model.
        loss_info (`FlavaLosses`):
            Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
            the keys.
        image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`].
        image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`].
        text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
            The output of the [`FlavaTextModel`].
        multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
            The output of the [`FlavaMultimodalModel`].

        image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
            The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
            to create masked images.
        image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
            The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
        text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
            The text embeddings which are basically the pooled output of [`FlavaTextModel`].
        text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
            The output of the [`FlavaTextModel`].
        multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
            The multimodal embeddings which are basically the pooled output of [`FlavaMultimodalModel`].
        multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
            The output of the [`FlavaMultimodalModel`].

        mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
                The logits for MIM unimodal loss. Uses `book_masked_pos` to get masked patches. The flattened output is
                returned when `bool_masked_pos` has some of the patches masked.
        mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
                The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
                the tokens masked.
        itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
                The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
        mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
                The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened
                output is returned when `bool_masked_pos` has some of the patches masked.
        mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
                The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
                some of the tokens masked.
        contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
            `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
            scores. This is calculated on unmasked images and texts.
        contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
            `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
            texts.
    """

    loss: Optional[torch.FloatTensor] = None
    loss_info: FlavaLosses = None
    image_embeddings: Optional[torch.FloatTensor] = None
    image_output: Optional[BaseModelOutputWithPooling] = None
    text_embeddings: Optional[torch.FloatTensor] = None
    text_output: Optional[BaseModelOutputWithPooling] = None
    multimodal_embeddings: Optional[torch.FloatTensor] = None
    multimodal_output: Optional[BaseModelOutputWithPooling] = None
    image_masked_embeddings: Optional[torch.FloatTensor] = None
    image_masked_output: Optional[BaseModelOutputWithPooling] = None
    text_masked_embeddings: Optional[torch.FloatTensor] = None
    text_masked_output: Optional[BaseModelOutputWithPooling] = None
    multimodal_masked_embeddings: Optional[torch.FloatTensor] = None
    multimodal_masked_output: Optional[BaseModelOutputWithPooling] = None
    mim_logits: Optional[torch.FloatTensor] = None
    mlm_logits: Optional[torch.FloatTensor] = None
    itm_logits: Optional[torch.FloatTensor] = None
    contrastive_logits_per_image: Optional[torch.FloatTensor] = None
    contrastive_logits_per_text: Optional[torch.FloatTensor] = None
    mmm_image_logits: Optional[torch.FloatTensor] = None
    mmm_text_logits: Optional[torch.FloatTensor] = None

    def to_tuple(self) -> Tuple[Any]:
        transformer_outputs = [
            "text_output",
            "image_output",
            "multimodal_output",
            "text_masked_output",
            "image_masked_output",
            "multimodal_masked_output",
        ]
        return tuple(self[k] if k not in transformer_outputs else getattr(self, k).to_tuple() for k in self.keys())


class FlavaImageEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        use_mask_token = use_mask_token or config.mask_token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = PatchEmbeddings(
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.hidden_size,
        )
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        batch_size, seq_len, _ = embeddings.size()
        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # B X H X W = B X HW
            if bool_masked_pos.dim() == 3:
                bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


class PatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self,
        image_size: int = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        num_channels: int = 3,
        embed_dim: int = 768,
    ):
        super().__init__()
        if not isinstance(image_size, collections.abc.Iterable):
            image_size = (image_size, image_size)
        if not isinstance(patch_size, collections.abc.Iterable):
            patch_size = (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return x


class FlavaTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ):
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Use the registered all-zeros buffer when token_type_ids is not passed; this helps users
        # trace the model without providing token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class FlavaSelfAttention(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class FlavaSelfOutput(nn.Module):
    """
    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
    models), due to the layernorm applied before each block.
    """

    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class FlavaAttention(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.attention = FlavaSelfAttention(config)
        self.output = FlavaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(
            hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class FlavaIntermediate(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class FlavaOutput(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class FlavaLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: FlavaPossibleConfigs) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = FlavaAttention(config)
        self.intermediate = FlavaIntermediate(config)
        self.output = FlavaOutput(config)

        # in ViT, layernorm is applied before self-attention
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class FlavaEncoder(nn.Module):
    def __init__(self, config: FlavaConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
        )


class FlavaPooler(nn.Module):
    def __init__(self, config: FlavaPossibleConfigs):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


FLAVA_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`{config}`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
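
    For example, a minimal loading sketch (the checkpoint below is the one referenced throughout this module's
    docstrings; substitute the concrete model class you need):

    ```python
    from transformers import FlavaModel

    model = FlavaModel.from_pretrained("facebook/flava-full")
    ```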
"""

FLAVA_INPUTS_DOCSTRING_COMMON = r"""
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
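
        As a short, illustrative sketch of the two return modes (`model` and `inputs` here are placeholders, not
        part of the API):

        ```python
        outputs = model(**inputs, return_dict=True)
        hidden = outputs.last_hidden_state
        hidden_from_tuple = model(**inputs, return_dict=False)[0]  # same tensor, tuple-style access
        ```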
"""

FLAVA_IMAGE_INPUTS_DOCSTRING_BASE = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`FlavaImageProcessor.__call__`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        interpolate_pos_encoding (`bool`, *optional*):
            Whether to interpolate the pre-trained position encodings.
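
            For instance, a hedged sketch of enabling it for inputs larger than the pretraining resolution
            (`model` and `high_res_pixel_values` are placeholders):

            ```python
            # positions are bicubically resized to the new patch grid, see FlavaImageEmbeddings
            outputs = model(pixel_values=high_res_pixel_values, interpolate_pos_encoding=True)
            ```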
"""

FLAVA_IMAGE_INPUTS_DOCSTRING = FLAVA_IMAGE_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON

FLAVA_TEXT_INPUTS_DOCSTRING_BASE = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
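
        A minimal sketch of producing both tensors with the checkpoint's tokenizer (illustrative):

        ```python
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained("facebook/flava-full")
        encoding = tokenizer(["a photo of a cat"], return_tensors="pt", padding=True)
        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
        ```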
"""

FLAVA_TEXT_INPUTS_DOCSTRING = FLAVA_TEXT_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON

FLAVA_MULTIMODAL_INPUTS_DOCSTRING_BASE = r"""
    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
            The concatenated hidden states of unimodal encoders.
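
            A hedged sketch of how this tensor is assembled inside [`FlavaModel`] (`flava` and the hidden states
            are placeholders; the projections are the attributes defined on that model):

            ```python
            image_mm = flava.image_to_mm_projection(image_hidden_states)
            text_mm = flava.text_to_mm_projection(text_hidden_states)
            hidden_states = torch.cat([image_mm, text_mm], dim=1)
            ```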
"""

FLAVA_MULTIMODAL_INPUTS_DOCSTRING = FLAVA_MULTIMODAL_INPUTS_DOCSTRING_BASE + FLAVA_INPUTS_DOCSTRING_COMMON

FLAVA_MODEL_INPUTS_DOCSTRING_BASE = r"""
    Args:
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
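
            For example (illustrative; skipping saves the multimodal forward pass when only unimodal features are
            needed):

            ```python
            outputs = flava(input_ids=input_ids, pixel_values=pixel_values, skip_multimodal_encoder=True)
            # multimodal fields are then expected to be unset
            assert outputs.multimodal_embeddings is None
            ```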
"""

FLAVA_MODEL_INPUTS_DOCSTRING = (
    FLAVA_IMAGE_INPUTS_DOCSTRING_BASE
    + FLAVA_TEXT_INPUTS_DOCSTRING_BASE
    + FLAVA_INPUTS_DOCSTRING_COMMON
    + FLAVA_MODEL_INPUTS_DOCSTRING_BASE
)

FLAVA_PRETRAINING_INPUTS_DOCSTRING_BASE = r"""
    Args:
        input_ids_masked (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)

"""

FLAVA_PRETRAINING_INPUTS_DOCSTRING_EXTRA = r"""
        image_attention_mask (`torch.FloatTensor` of shape `({1})`, *optional*):
            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
            in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        skip_unmasked_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
            multimodal embeddings or outputs as of now.

        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
            ..., text_config.vocab_size - 1]`.

        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
            generated automatically using the image codebook assigned to the model. By default, it uses
            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.

        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.

        return_loss (`bool`, *optional*, defaults to `None`):
            Whether to return calculated loss or not.
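
        A hedged sketch of building `input_ids_masked` and `mlm_labels` with the standard MLM collator (exact
        FLAVA preprocessing may differ):

        ```python
        from transformers import AutoTokenizer, DataCollatorForLanguageModeling

        tokenizer = AutoTokenizer.from_pretrained("facebook/flava-full")
        collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)
        batch = collator([tokenizer("a photo of a cat")])
        input_ids_masked, mlm_labels = batch["input_ids"], batch["labels"]
        ```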
"""

FLAVA_PRETRAINING_INPUTS_DOCSTRING = (
    FLAVA_PRETRAINING_INPUTS_DOCSTRING_BASE
    + FLAVA_TEXT_INPUTS_DOCSTRING_BASE
    + FLAVA_IMAGE_INPUTS_DOCSTRING_BASE
    + FLAVA_PRETRAINING_INPUTS_DOCSTRING_EXTRA
    + FLAVA_INPUTS_DOCSTRING_COMMON
)

FLAVA_PRETRAINING_START_DOCSTRING_EXTRA = r"""
    Parameters:
        image_codebook ([`nn.Module`]): If passed, the image codebook will be set to this. Otherwise, it will
            be initialized using the image_codebook_config defined in the config first as the first parameter.
"""


class FlavaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = FlavaConfig
    base_model_prefix = "flava"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@add_start_docstrings(
    "The bare FLAVA Image Model transformer outputting raw hidden-states without any specific head on top.",
    FLAVA_START_DOCSTRING.format(config="FlavaImageConfig"),
)
class FlavaImageModel(FlavaPreTrainedModel):
    config_class = FlavaImageConfig
    # This override allows loading FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints.
    base_model_prefix = "flava.image_model"
    main_input_name = "pixel_values"

    def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True):
        super().__init__(config)

        self.config = config

        self.embeddings = FlavaImageEmbeddings(config)
        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embeddings

    def set_input_embeddings(self, value: nn.Module):
        self.embeddings.patch_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(FLAVA_IMAGE_INPUTS_DOCSTRING.format("batch_size, image_num_patches"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_CLASS_FOR_IMAGE_MODEL_DOC,
        modality="vision",
        expected_output=_EXPECTED_IMAGE_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    "The bare FLAVA Text Model transformer outputting raw hidden-states without any specific head on top.",
    FLAVA_START_DOCSTRING.format(config="FlavaTextConfig"),
)
class FlavaTextModel(FlavaPreTrainedModel):
    config_class = FlavaTextConfig
    # This override allows loading FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints.
    base_model_prefix = "flava.text_model"

    def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True):
        super().__init__(config)
        self.config = config

        self.embeddings = FlavaTextEmbeddings(config)
        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Module):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(FLAVA_TEXT_INPUTS_DOCSTRING.format("batch_size, text_seq_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_CLASS_FOR_TEXT_MODEL_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=input_ids.device)

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, input_shape, input_ids.device
        )

        embedding_output = self.embeddings(
            input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    "The bare FLAVA Multimodal Model transformer outputting raw hidden-states without any specific head on top.",
    FLAVA_START_DOCSTRING.format(config="FlavaMultimodalConfig"),
)
class FlavaMultimodalModel(FlavaPreTrainedModel):
    config_class = FlavaMultimodalConfig
    # This override allows loading FlavaMultimodalModel from FlavaModel/FlavaForPreTraining checkpoints.
    base_model_prefix = "flava.multimodal_model"
    main_input_name = "hidden_states"

    def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer=True):
        super().__init__(config)
        self.config = config
        self.use_cls_token = self.config.use_cls_token
        if self.use_cls_token:
            self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))

        self.encoder = FlavaEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = FlavaPooler(config) if add_pooling_layer else None

        self.post_init()

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(
        FLAVA_MULTIMODAL_INPUTS_DOCSTRING.format("batch_size, image_num_patches + text_seq_len")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_CLASS_FOR_MULTIMODAL_MODEL_DOC,
    )
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size, seq_length, _ = hidden_states.size()

        if self.use_cls_token:
            cls_tokens = self.cls_token.expand(batch_size, -1, -1)
            hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
            seq_length += 1

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device)

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
            attention_mask, (batch_size, seq_length), hidden_states.device
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    "The bare FLAVA Model transformer outputting raw hidden-states without any specific head on top.",
    FLAVA_START_DOCSTRING.format(config="FlavaConfig"),
)
class FlavaModel(FlavaPreTrainedModel):
    config_class = FlavaConfig

    def __init__(self, config: FlavaConfig):
        super().__init__(config)

        if not isinstance(config.text_config, FlavaTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type FlavaTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.image_config, FlavaImageConfig):
            raise TypeError(
                "config.image_config is expected to be of type FlavaImageConfig but is of type"
                f" {type(config.image_config)}."
            )

        if not isinstance(config.multimodal_config, FlavaMultimodalConfig):
            raise TypeError(
                "config.multimodal_config is expected to be of type FlavaMultimodalConfig but "
                f"is of type {type(config.multimodal_config)}."
            )

        text_config = config.text_config
        image_config = config.image_config
        multimodal_config = config.multimodal_config

        self.projection_dim = config.projection_dim
        self.text_hidden_size = text_config.hidden_size
        self.image_hidden_size = image_config.hidden_size
        self.mm_hidden_size = multimodal_config.hidden_size

        self.text_model = FlavaTextModel(text_config)
        self.image_model = FlavaImageModel(image_config)
        self.multimodal_model = FlavaMultimodalModel(multimodal_config)

        self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
        self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
        self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)
        self.post_init()
 &--/?@,,-.a1 
 &224IJ_V%=%= >?qAB 
 ((**"44$33 + 7 7!-!9!9/;;(5*<8 45F G "		$*@*@$BUBU V!yy)>)>@S@ST<<T[[5W5W(XY&(ii0F0FH[H[&\#%'YYt/D/DdFYFY%Z"r;   r  r   r   r   r   r   rG  rH  r+   c           	          dj                  t               | j                  |||||||      }|d   }	| j                  |	      }
|
S )Na  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`FlavaTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```)r   r   r   r   r   rG  rH  r   )r  r  r  r  )r3   r   r   r   r   r   rG  rH  text_outputsr`  text_featuress              r4   get_text_featureszFlavaModel.get_text_features  s]    	" v)*))%/!5# ' 
 %Q,,];r;   r  r   r   r   r   c	           
          dj                  t               | j                  ||||||||      }	|	d   }
| j                  |
      }|S )Na  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`FlavaImageModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```)r   r   r   r   r   rG  r   rH  r   )r  r  r  r  )r3   r   r   r   r   r   r   rG  rH  image_outputsr`  image_featuress               r4   get_image_featureszFlavaModel.get_image_features  sc    	* v)*((%+)/!5%=# ) 	
 &a(..}=r;   r  r  ro  image_attention_maskskip_multimodal_encoderc           	         ||n| j                   j                  }|
st        d      d}d}d}d}|5| j                  ||||	|
|      }|d   |d   }}| j	                  |d         }d}d}d}d}|6| j                  |||||	|
|      }|d   |d   }}| j                  |d         }d}d}|||s|g|j                  \  }}}| j                  j                  r|dz  }t        j                  |||j                  	      }t        j                  ||gd
      }nd}t        j                  ||gd
      }| j                  |||      }|d   }|s||||||fS t        ||||||      S )a  
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)

        >>> image_embeddings = outputs.image_embeddings
        >>> text_embeddings = outputs.text_embeddings
        >>> multimodal_embeddings = outputs.multimodal_embeddings

        >>> outputs.image_embeddings.shape
        torch.Size([1, 197, 768])

        >>> text_embeddings.shape
        torch.Size([1, 7, 768])

        >>> multimodal_embeddings.shape
        torch.Size([1, 205, 768])
        ```
class FlavaImageCodebookResPath(nn.Module):
    def __init__(self, in_size: int, out_size: int, **kwargs):
        super().__init__()
        hid_size = out_size // 4

        path = OrderedDict()
        path["relu_1"] = nn.ReLU()
        path["conv_1"] = nn.Conv2d(in_size, hid_size, kernel_size=3, padding=1)
        path["relu_2"] = nn.ReLU()
        path["conv_2"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
        path["relu_3"] = nn.ReLU()
        path["conv_3"] = nn.Conv2d(hid_size, hid_size, kernel_size=3, padding=1)
        path["relu_4"] = nn.ReLU()
        path["conv_4"] = nn.Conv2d(hid_size, out_size, kernel_size=1, padding=0)

        self.path = nn.Sequential(path)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.path(x)


class FlavaImageCodebookBlock(nn.Module):
    def __init__(self, in_size: int, out_size: int, num_layers: int, **kwargs):
        super().__init__()

        # Scale the residual branch down with the total layer count, as in DALL-E's encoder
        self.post_gain = 1 / (num_layers**2)

        if in_size != out_size:
            self.id_path = nn.Conv2d(in_size, out_size, kernel_size=1, padding=0)
        else:
            self.id_path = nn.Identity()

        self.res_path = FlavaImageCodebookResPath(in_size, out_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.id_path(x) + self.post_gain * self.res_path(x)


class FlavaImageCodebookLayerGroup(nn.Module):
    def __init__(self, num_blocks: int, num_layers: int, in_size: int, out_size: int, use_pool: bool = True):
        super().__init__()
        blocks = OrderedDict()
        for i in range(num_blocks):
            if i == 0:
                blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(in_size, out_size, num_layers)
            else:
                blocks[f"block_{i + 1}"] = FlavaImageCodebookBlock(out_size, out_size, num_layers)

        if use_pool:
            blocks["pool"] = nn.MaxPool2d(kernel_size=2)

        self.group = nn.Sequential(blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.group(x)
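# A small standalone check (illustrative sketch, not part of the modeling code) of the
# codebook's spatial arithmetic: groups 1-3 each end in a stride-2 max pool while group 4
# does not, so a 112x112 codebook input is downsampled by a factor of 8 to a 14x14 grid,
# i.e. the 196 token indices per image seen in the example docstrings below. The helper
# name `_sketch_codebook_grid_size` is hypothetical.
def _sketch_codebook_grid_size():
    import torch
    from torch import nn

    pools = nn.Sequential(*[nn.MaxPool2d(kernel_size=2) for _ in range(3)])
    grid = pools(torch.randn(1, 1, 112, 112))
    assert grid.shape[-2:] == (14, 14)  # 14 * 14 = 196 codebook tokens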
@add_start_docstrings(
    """
    The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
    `get_codebook_indices` to get image tokens for an image.
    """,
    FLAVA_START_DOCSTRING.format(config="FlavaImageCodebookConfig"),
)
class FlavaImageCodebook(FlavaPreTrainedModel):
    base_model_prefix = ""
    config_class = FlavaImageCodebookConfig
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False
j                  de
    def __init__(self, config: FlavaImageCodebookConfig, **kwargs: Any):
        super().__init__(config)

        self.config = config
        self.num_groups = config.num_groups
        self.input_channels = config.input_channels
        self.num_blocks_per_group = config.num_blocks_per_group
        self.hidden_size = config.hidden_size
        self.vocab_size = config.vocab_size

        num_layers = self.num_groups * self.num_blocks_per_group

        output_blocks = OrderedDict()
        output_blocks["relu"] = nn.ReLU()
        output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0)

        blocks = OrderedDict()
        blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3)
        blocks["group_1"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size
        )
        blocks["group_2"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size
        )
        blocks["group_3"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size
        )
        blocks["group_4"] = FlavaImageCodebookLayerGroup(
            self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False
        )
        blocks["output"] = nn.Sequential(output_blocks)

        self.blocks = nn.Sequential(blocks)

        self.post_init()

        if self.config.freeze:
            for param in self.parameters():
                param.requires_grad = False

    def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("{0}")
        >>> image_processor = AutoImageProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model.get_codebook_indices(**inputs)
        ```""".format(_CHECKPOINT_FOR_CODEBOOK_DOC)
        z_logits = self.blocks(pixel_values)
        return torch.argmax(z_logits, axis=1)

    def get_codebook_probs(self, pixel_values: torch.Tensor) -> torch.Tensor:
        z_logits = self.blocks(pixel_values)
        return nn.Softmax(dim=1)(z_logits)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("{0}")
        >>> image_processor = AutoImageProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model(**inputs)
        >>> print(outputs.shape)
        (1, 196)
        ```""".format(_CHECKPOINT_FOR_CODEBOOK_DOC)
        if len(pixel_values.shape) != 4:
            raise ValueError(f"input shape {pixel_values.shape} is not 4d")
        if pixel_values.shape[1] != self.input_channels:
            raise ValueError(f"input has {pixel_values.shape[1]} channels but model built for {self.input_channels}")
        return self.blocks(pixel_values)


class FlavaPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class FlavaMaskedPredictionHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = FlavaPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

        # Need a link between the two variables so that the bias is correctly resized with
        # `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


class FlavaITMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.pooler = FlavaPooler(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, x):
        x = self.pooler(x)
        x = self.seq_relationship(x)
        return x


class FlavaGlobalContrastiveHead(nn.Module):
    def __init__(self, config: FlavaConfig):
        super().__init__()
        self.config = config
        self.global_backprop_contrastive = config.global_backprop_contrastive

    def forward(self, image_embeddings, text_embeddings, logit_scale):
        temperature = torch.exp(logit_scale)
        if not torch.distributed.is_available() or not torch.distributed.is_initialized():
            labels = torch.arange(image_embeddings.size(0), device=image_embeddings.device)
            image_embeddings_all = [image_embeddings]
            text_embeddings_all = [text_embeddings]
        else:
            local_batch_size = image_embeddings.size(0)
            world_size = torch.distributed.get_world_size()

            if self.global_backprop_contrastive:
                # `all_gather` from `torch.distributed.nn.functional` keeps the autograd graph
                # so gradients flow back through the gathered embeddings
                image_embeddings_all = torch.distributed.nn.functional.all_gather(image_embeddings)
                text_embeddings_all = torch.distributed.nn.functional.all_gather(text_embeddings)
            else:
                image_embeddings_all = [torch.zeros_like(image_embeddings) for _ in range(world_size)]
                text_embeddings_all = [torch.zeros_like(text_embeddings) for _ in range(world_size)]
                torch.distributed.all_gather(image_embeddings_all, image_embeddings)
                torch.distributed.all_gather(text_embeddings_all, text_embeddings)

            labels = local_batch_size * torch.distributed.get_rank() + torch.arange(
                local_batch_size, device=image_embeddings.device
            )

        image_embeddings_all = torch.cat(image_embeddings_all)
        text_embeddings_all = torch.cat(text_embeddings_all)

        logits_per_image = torch.matmul(image_embeddings, text_embeddings_all.transpose(0, 1)) * temperature
        logits_per_text = torch.matmul(text_embeddings, image_embeddings_all.transpose(0, 1)) * temperature

        return logits_per_image, logits_per_text, labels


@add_start_docstrings(
    """
    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
    """,
    FLAVA_START_DOCSTRING.format(config="FlavaConfig") + FLAVA_PRETRAINING_START_DOCSTRING_EXTRA,
)
class FlavaForPreTraining(FlavaPreTrainedModel):
    # Those are linked to xxx.bias
    _tied_weights_keys = [
        "mmm_text_head.decoder.bias",
        "mmm_image_head.decoder.bias",
        "mlm_head.decoder.bias",
        "mim_head.decoder.bias",
    ]

    def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
        super().__init__(config)
        self.flava = FlavaModel(config)

        self.image_codebook = image_codebook
        if self.image_codebook is None and config.init_codebook:
            self.image_codebook = FlavaImageCodebook(config.image_codebook_config)

        self.mim_head = FlavaMaskedPredictionHead(config.image_config)
        self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
        self.itm_head = FlavaITMHead(config)
        self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
        self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
        self.global_contrastive_head = FlavaGlobalContrastiveHead(config)

        self.image_vocab_size = config.image_config.vocab_size
        self.text_vocab_size = config.text_config.vocab_size
        self.mlm_weight = config.mlm_weight
        self.mim_weight = config.mim_weight
        self.global_contrastive_weight = config.global_contrastive_weight
        self.ce_ignore_index = config.ce_ignore_index
        self.itm_weight = config.itm_weight
        self.mmm_image_weight = config.mmm_image_weight
        self.mmm_text_weight = config.mmm_text_weight
        self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder

        self.post_init()

    def _resize_to_2d(self, x: torch.Tensor) -> torch.Tensor:
        if x.dim() > 2:
            x = x.view(x.size(0), -1)
        return x

    @add_start_docstrings_to_model_forward(
        FLAVA_PRETRAINING_INPUTS_DOCSTRING.format("batch_size, text_seq_len", "batch_size, image_num_patches")
    )
    @replace_return_docstrings(output_type=FlavaForPreTrainingOutput, config_class=FlavaConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_ids_masked: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        codebook_pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        skip_unmasked_multimodal_encoder: Optional[bool] = None,
        mlm_labels: Optional[torch.Tensor] = None,
        mim_labels: Optional[torch.Tensor] = None,
        itm_labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: bool = True,
        return_dict: Optional[bool] = None,
        return_loss: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], FlavaForPreTrainingOutput]:
        r"""
        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import FlavaForPreTraining, AutoProcessor

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> text = ["a photo of a cat"]

        >>> inputs = processor(
        ...     images=[image],
        ...     text=text,
        ...     return_masks=True,
        ...     return_codebook_pixels=True,
        ...     padding=True,
        ...     max_length=77,
        ...     return_tensors="pt",
        ... )

        >>> output = model(**inputs)
        ```

        Return:
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        return_loss = return_loss if return_loss is not None else self.config.return_loss

        skip_unmasked_multimodal_encoder = (
            skip_unmasked_multimodal_encoder
            if skip_unmasked_multimodal_encoder is not None
            else self.skip_unmasked_multimodal_encoder
        )

        if input_ids_masked is None and input_ids is not None:
            logger.warning(
                "`input_ids_masked` isn't passed which means MLM loss won't be calculated correctly. Setting it to"
                " `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if"
                " you are doing inference on unmasked text..."
            )
            input_ids_masked = input_ids

        flava_output = self.flava(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            image_attention_mask=image_attention_mask,
            # Don't need unmasked multimodal embedding for anything so skip it
            # NOTE: ITM uses masked version
            skip_multimodal_encoder=skip_unmasked_multimodal_encoder,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # Pass true to have deterministic outputs
            return_dict=True,
        )

        flava_masked_output = self.flava(
            input_ids=input_ids_masked,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            image_attention_mask=image_attention_mask,
            bool_masked_pos=bool_masked_pos,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        pos_mask = None

        image_embeddings = flava_output.image_embeddings
        text_embeddings = flava_output.text_embeddings
        image_masked_embeddings = flava_masked_output.image_embeddings
        text_masked_embeddings = flava_masked_output.text_embeddings
        multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings

        total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None
        mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None
        itm_logits = logits_per_image = logits_per_text = None

        # Calculate mim_labels if necessary from the image_codebook
        if image_masked_embeddings is not None or multimodal_masked_embeddings is not None:
            if mim_labels is None and return_loss:
                if self.image_codebook is None:
                    raise RuntimeError(
                        "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` "
                        "have been passed. Reinstantiate the model with `init_codebook` set to True or "
                        "pass in your custom `mim_labels`"
                    )
                if codebook_pixel_values is None:
                    raise ValueError(
                        "`codebook_pixel_values` are required to generate `mim_labels` if loss is expected. "
                        "Call `AutoProcessor` with `return_codebook_pixels` set to True"
                    )
                mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values)

        # Unimodal MIM Loss
        # If multimodal embeddings are present, MMM loss is calculated instead
        if self.mim_weight > 0 and image_masked_embeddings is not None and multimodal_masked_embeddings is None:
            sequence_for_image = image_masked_embeddings

            if mim_labels is not None:
                mim_labels = self._resize_to_2d(mim_labels)
                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index

                sequence_for_image = sequence_for_image[:, -mim_labels.size(1) :, :]
                masked_tokens = mim_labels.ne(self.ce_ignore_index)
                mim_labels_filtered = mim_labels[masked_tokens]
                sequence_for_image = sequence_for_image[masked_tokens, :]
                mim_logits = self.mim_head(sequence_for_image)
                if return_loss:
                    mim_loss = nn.functional.cross_entropy(
                        mim_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
                    )
                    mim_loss *= self.mim_weight
            else:
                mim_logits = self.mim_head(sequence_for_image)

        # Unimodal MLM Loss
        if self.mlm_weight > 0 and text_masked_embeddings is not None and multimodal_masked_embeddings is None:
            sequence_for_text = text_masked_embeddings

            if mlm_labels is not None:
                mlm_labels = self._resize_to_2d(mlm_labels)
                sequence_for_text = sequence_for_text[:, -mlm_labels.size(1) :, :]
                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
                mlm_labels_filtered = mlm_labels[masked_tokens]
                sequence_for_text = sequence_for_text[masked_tokens, :]
                mlm_logits = self.mlm_head(sequence_for_text)
                if return_loss:
                    mlm_loss = nn.functional.cross_entropy(
                        mlm_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
                    )
                    mlm_loss *= self.mlm_weight
            else:
                mlm_logits = self.mlm_head(sequence_for_text)

        # ITM Loss
        if self.itm_weight > 0 and multimodal_masked_embeddings is not None:
            itm_logits = self.itm_head(multimodal_masked_embeddings)

            if itm_labels is not None:
                pos_pairs = itm_labels.ne(0)
                pos_mask = torch.where(pos_pairs.any(), pos_pairs, pos_pairs.new([True]))
                if return_loss:
                    itm_loss = nn.functional.cross_entropy(itm_logits, itm_labels)
                    itm_loss *= self.itm_weight

                if multimodal_masked_embeddings is not None:
                    multimodal_masked_embeddings = multimodal_masked_embeddings[pos_mask]

                if mlm_labels is not None:
                    mlm_labels = mlm_labels[pos_mask]

                if mim_labels is not None:
                    mim_labels = mim_labels[pos_mask]
                    bool_masked_pos = bool_masked_pos[pos_mask]

        # MMM Image Loss
        if multimodal_masked_embeddings is not None and self.mmm_image_weight > 0:
            sequence_for_image = multimodal_masked_embeddings
            end_index = image_masked_embeddings.size(1) - 1
            sequence_for_image = sequence_for_image[:, 2 : 2 + end_index, :]

            if mim_labels is not None:
                mim_labels = self._resize_to_2d(mim_labels)
                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index

                masked_tokens = mim_labels.ne(self.ce_ignore_index)
                mim_labels_filtered = mim_labels[masked_tokens]
                sequence_for_image = sequence_for_image[masked_tokens, :]
                mmm_image_logits = self.mmm_image_head(sequence_for_image)
                if return_loss:
                    mmm_image_loss = nn.functional.cross_entropy(
                        mmm_image_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
                    )
                    mmm_image_loss *= self.mmm_image_weight
            else:
                mmm_image_logits = self.mmm_image_head(sequence_for_image)

        # MMM Text Loss
        if multimodal_masked_embeddings is not None and self.mmm_text_weight > 0:
            sequence_for_text = multimodal_masked_embeddings
            sequence_for_text = sequence_for_text[:, -text_masked_embeddings.size(1) :, :]

            if mlm_labels is not None:
                mlm_labels = self._resize_to_2d(mlm_labels)
                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
                mlm_labels_filtered = mlm_labels[masked_tokens]
                sequence_for_text = sequence_for_text[masked_tokens, :]
                mmm_text_logits = self.mmm_text_head(sequence_for_text)
                if return_loss:
                    mmm_text_loss = nn.functional.cross_entropy(
                        mmm_text_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
                    )
                    mmm_text_loss *= self.mmm_text_weight
            else:
                mmm_text_logits = self.mmm_text_head(sequence_for_text)

        # Global Contrastive Loss
        if image_embeddings is not None and text_embeddings is not None and self.global_contrastive_weight > 0:
            text_embedding = self.flava.text_projection(text_embeddings[:, 0, :])
            text_embedding = nn.functional.normalize(text_embedding, dim=-1)

            image_embedding = self.flava.image_projection(image_embeddings[:, 0, :])
            image_embedding = nn.functional.normalize(image_embedding, dim=-1)

            self.flava.logit_scale.data.clamp_(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX)

            logits_per_image, logits_per_text, gc_labels = self.global_contrastive_head(
                image_embedding, text_embedding, self.flava.logit_scale
            )

            # Apply ITM negative mask if any
            if pos_mask is not None:
                logits_per_image = logits_per_image[pos_mask]
                logits_per_text = logits_per_text[pos_mask]
                gc_labels = gc_labels[pos_mask]

            if return_loss:
                gc_loss_image = nn.functional.cross_entropy(logits_per_image, gc_labels)
                gc_loss_text = nn.functional.cross_entropy(logits_per_text, gc_labels)
                gc_loss = (gc_loss_image + gc_loss_text) / 2
                gc_loss *= self.global_contrastive_weight

        flava_losses = FlavaLosses(
            mim=mim_loss,
            mlm=mlm_loss,
            itm=itm_loss,
            global_contrastive=gc_loss,
            mmm_image=mmm_image_loss,
            mmm_text=mmm_text_loss,
        )

        if return_loss and not flava_losses.all_none():
            total_loss = sum(loss if loss is not None else 0 for loss in flava_losses.values())

        if not return_dict:
            output = (
                image_embeddings,
                flava_output.image_output.to_tuple() if flava_output.image_output is not None else None,
                text_embeddings,
                flava_output.text_output.to_tuple() if flava_output.text_output is not None else None,
                flava_output.multimodal_embeddings,
                flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None,
                image_masked_embeddings,
                flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None,
                text_masked_embeddings,
                flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None,
                multimodal_masked_embeddings,
                flava_masked_output.multimodal_output.to_tuple()
                if flava_masked_output.multimodal_output is not None
                else None,
                mim_logits,
                mlm_logits,
                itm_logits,
                logits_per_image,
                logits_per_text,
                mmm_image_logits,
                mmm_text_logits,
            )

            if return_loss and not flava_losses.all_none():
                output = (total_loss, flava_losses) + output

            # Filter out None entries as the default tuple output won't handle them
            return tuple(x for x in output if x is not None)

        return FlavaForPreTrainingOutput(
            loss=total_loss,
            loss_info=flava_losses,
            image_embeddings=image_embeddings,
            image_output=flava_output.image_output,
            text_embeddings=text_embeddings,
            text_output=flava_output.text_output,
            multimodal_embeddings=flava_output.multimodal_embeddings,
            multimodal_output=flava_output.multimodal_output,
            image_masked_embeddings=image_masked_embeddings,
            image_masked_output=flava_masked_output.image_output,
            text_masked_embeddings=text_masked_embeddings,
            text_masked_output=flava_masked_output.text_output,
            multimodal_masked_embeddings=multimodal_masked_embeddings,
            multimodal_masked_output=flava_masked_output.multimodal_output,
            mim_logits=mim_logits,
            mlm_logits=mlm_logits,
            itm_logits=itm_logits,
            contrastive_logits_per_image=logits_per_image,
            contrastive_logits_per_text=logits_per_text,
            mmm_image_logits=mmm_image_logits,
            mmm_text_logits=mmm_text_logits,
        )
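# Standalone sketch (illustrative only, not exported API) of the single-process path in
# `FlavaGlobalContrastiveHead`: embeddings are L2-normalized, the learned logit scale is
# exponentiated after clamping to the module-level bounds LOGIT_SCALE_CLAMP_MIN and
# LOGIT_SCALE_CLAMP_MAX defined above, and matched image/text pairs along the diagonal
# serve as cross-entropy targets. The helper name and the example init value are
# assumptions used only for illustration.
def _sketch_contrastive_logits():
    import torch
    from torch import nn

    batch_size, projection_dim = 4, 512
    image_embeddings = nn.functional.normalize(torch.randn(batch_size, projection_dim), dim=-1)
    text_embeddings = nn.functional.normalize(torch.randn(batch_size, projection_dim), dim=-1)

    raw_scale = torch.tensor(2.6593)  # e.g. log(1 / 0.07), a common CLIP-style init
    temperature = torch.exp(raw_scale.clamp(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX))

    # Each row i of logits_per_image scores image i against every text in the batch;
    # the matching text sits at column i, hence arange labels for cross-entropy.
    logits_per_image = torch.matmul(image_embeddings, text_embeddings.transpose(0, 1)) * temperature
    labels = torch.arange(batch_size)
    return nn.functional.cross_entropy(logits_per_image, labels)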