
    sg                   F   d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZmZmZmZmZmZmZ dd	lmZmZ dd
lmZmZmZm Z m!Z!m"Z" ddl#m$Z$  e!jJ                  e&      Z'dZ(dZ)e G d de             Z*e G d de             Z+ G d dejX                  jZ                        Z. G d dejX                  jZ                        Z/ G d dejX                  jZ                        Z0 G d dejX                  jZ                        Z1 G d dejX                  jZ                        Z2 G d dejX                  jZ                        Z3 G d d ejX                  jZ                        Z4 G d! d"ejX                  jZ                        Z5 G d# d$ejX                  jZ                        Z6 G d% d&ejX                  jZ                        Z7 G d' d(ejX                  jZ                        Z8e G d) d*ejX                  jZ                               Z9 G d+ d,e      Z:d-Z;d.Z< ed/e;       G d0 d1e:             Z= G d2 d3ejX                  jZ                        Z> G d4 d5ejX                  jZ                        Z? G d6 d7ejX                  jZ                        Z@ G d8 d9ejX                  jZ                        ZA G d: d;ejX                  jZ                        ZB G d< d=ejX                  jZ                        ZC G d> d?ejX                  jZ                        ZD ed@e;       G dA dBe:             ZEy)CzTF 2.0 LXMERT model.    )annotationsN)	dataclass)DictOptionalTupleUnion   )get_tf_activation)TFModelInputTypeTFPreTrainedModelget_initializerkeraskeras_serializable
shape_listunpack_inputs)check_embeddings_within_boundsstable_softmax)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )LxmertConfigzunc-nlp/lxmert-base-uncasedr   c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded<   dZ	ded	<   dZ
ded
<   dZded<   dZded<   y)TFLxmertModelOutputa  
    Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
    visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
    encoder")


    Args:
        language_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the language encoder.
        vision_output (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the visual encoder.
        pooled_output (`tf.Tensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification, CLS, token) further processed
            by a Linear layer and a Tanh activation function. The Linear
        language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Ntf.Tensor | Nonelanguage_outputvision_outputpooled_outputTuple[tf.Tensor] | Nonelanguage_hidden_statesvision_hidden_stateslanguage_attentionsvision_attentionscross_encoder_attentions)__name__
__module____qualname____doc__r   __annotations__r    r!   r#   r$   r%   r&   r'        `/var/www/html/venv/lib/python3.12/site-packages/transformers/models/lxmert/modeling_tf_lxmert.pyr   r   8   sg     D )-O%,&*M#*&*M#*6:3:4818370715.58<5<r.   r   c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded<   dZ	ded	<   dZ
ded
<   dZded<   dZded<   dZded<   y)TFLxmertForPreTrainingOutputa
  
    Output type of [`LxmertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cross_relationship_score (`tf.Tensor` of shape `(batch_size, 2)`):
            Prediction scores of the textual matching objective (classification) head (scores of True/False
            continuation before SoftMax).
        question_answering_score (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
            Prediction scores of question answering objective (classification).
        language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        vision_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
            `(batch_size, sequence_length, hidden_size)`.
        language_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        vision_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.

    Nr   lossprediction_logitscross_relationship_scorequestion_answering_scorer"   r#   r$   r%   r&   r'   )r(   r)   r*   r+   r2   r,   r3   r4   r5   r#   r$   r%   r&   r'   r-   r.   r/   r1   r1   f   su    !F "D
!*.'.15.515.56:3:4818370715.58<5<r.   r1   c                  .     e Zd Z fdZddZddZ xZS )TFLxmertVisualFeatureEncoderc                   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        j                  j                  |j                  d      | _        t        j                  j	                  |j
                  t        |j                        d      | _        t        j                  j                  |j                  d      | _        t        j                  j                  |j                        | _        |j"                  | _        |j&                  | _        || _        y )Nvisn_fckernel_initializernamevisn_layer_normepsilonr<   box_fcbox_layer_normr-   )super__init__r   layersDensehidden_sizer   initializer_ranger9   LayerNormalizationlayer_norm_epsr=   r@   rA   Dropouthidden_dropout_probdropoutvisual_feat_dimfeat_dimvisual_pos_dimpos_dimconfigselfrQ   kwargs	__class__s      r/   rC   z%TFLxmertVisualFeatureEncoder.__init__   s   "6" ||)).v/G/GH * 

  %||>>vG\G\ct>u ll((.v/G/GH ) 

 $ll==fF[F[br=s||++F,F,FG..,,r.   c                    |\  }}| j                  |      }| j                  |      }| j                  |      }| j                  |      }||z   dz  }| j	                  ||      }|S )N   training)r9   r=   r@   rA   rL   )rS   
visn_inputrY   featsboxesxyoutputs           r/   callz!TFLxmertVisualFeatureEncoder.call   sk    !uLL  #KK"a%1fx8r.   c                   | j                   ry d| _         t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   UxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr9   r=   r@   rA   )builtgetattrtf
name_scoper9   r<   buildrN   r=   rQ   rF   r@   rP   rA   rS   input_shapes     r/   rf   z"TFLxmertVisualFeatureEncoder.build   s   ::
4D)5t||001 @""D$#>?@4*D1=t33889 R$$**D$8O8O+PQR44(4t{{//0 >!!4t||"<=>4)40<t22778 Q##))4t{{7N7N*OPQ Q =@ @R R> >Q Qs0   )G23G#)G 
3G,GG G),G5FNr(   r)   r*   rC   r`   rf   __classcell__rU   s   @r/   r7   r7      s    0
Qr.   r7   c                  2     e Zd ZdZ fdZddZddZ xZS )TFLxmertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                V   t        |   di | || _        |j                  | _        |j                  | _        |j
                  | _        t        j                  j                  |j                  d      | _
        t        j                  j                  |j                        | _        y )N	LayerNormr>   )rater-   )rB   rC   rQ   rF   max_position_embeddingsrG   r   rD   rH   rI   rq   rJ   rK   rL   rR   s      r/   rC   zTFLxmertEmbeddings.__init__   s    "6"!--'-'E'E$!'!9!988AVAV]h8i||++1K1K+Lr.   c                   t        j                  d      5  | j                  d| j                  j                  | j
                  gt        | j                              | _        d d d        t        j                  d      5  | j                  d| j                  j                  | j
                  gt        | j                              | _
        d d d        t        j                  d      5  | j                  d| j                  | j
                  gt        | j                              | _        d d d        | j                  ry d| _        t        | d	d       et        j                  | j                  j                         5  | j                  j#                  d d | j                  j
                  g       d d d        y y # 1 sw Y   ]xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)
Nword_embeddingsweight)rG   )r<   shapeinitializertoken_type_embeddings
embeddingsposition_embeddingsTrq   )rd   re   
add_weightrQ   
vocab_sizerF   r   rG   rv   type_vocab_sizery   rs   r{   rb   rc   rq   r<   rf   rg   s     r/   rf   zTFLxmertEmbeddings.build   s   ]],- 	//{{--t/?/?@+d>T>TU * DK	 ]]23 	)-!{{22D4D4DE+d>T>TU *9 *D&	 ]]01 	'+!33T5E5EF+d>T>TU (7 (D$	 ::
4d+7t~~223 L$$dD$++2I2I%JKL L 81	 		 		 	L Ls2   AF?AG,AG3G$?G	GG!$G-c                "   ||J |At        || j                  j                         t        j                  | j
                  |      }t        |      dd }|t        j                  |d      }t        j                  t        j                  d|d         d      }t        j                  | j                  |      }t        j                  | j                  |      }||z   |z   }	| j                  |	      }	| j                  |	|	      }	|	S )
z
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        N)paramsindicesr   )dimsvalue)startlimitaxisinputs)r   rY   )r   rQ   r}   rd   gatherrv   r   fillexpand_dimsranger{   ry   rq   rL   )
rS   	input_idstoken_type_idsinputs_embedsrY   rh   position_idsposition_embedstoken_type_embedsfinal_embeddingss
             r/   r`   zTFLxmertEmbeddings.call   s     %-*?@@ *9dkk6L6LMIIT[[)LM /4!WW+Q?N~~bhhQk"o&NUVW))4+C+C\ZIIT-G-GQ_`(?:=NN>>1A>B<</?(<Sr.   rj   )NNNF)r(   r)   r*   r+   rC   rf   r`   rl   rm   s   @r/   ro   ro      s    QML: r.   ro   c                  4     e Zd Z fdZd ZddZddZ xZS )TFLxmertAttentionc                   t        |   di | |j                  |j                  z  dk7  r%t	        d|j                   d|j                         |j                  | _        |j                  |j                  z  dk(  sJ t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j!                  |j"                        | _        |j                  | _        || _        y )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads (queryr:   keyr   r-   )rB   rC   rF   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rD   rE   r   rG   r   r   r   rJ   attention_probs_dropout_probrL   ctx_dimrQ   rR   s      r/   rC   zTFLxmertAttention.__init__  s   "6" : ::a?#F$6$6#7 8 4457 
 $*#=#= !!F$>$>>!CCC#&v'9'9F<V<V'V#W !558P8PP\\''.v/G/GH ( 


 <<%%.v/G/GH & 

 \\''.v/G/GH ( 

 ||++F,O,OP))r.   c                    t        j                  ||d| j                  | j                  f      }t        j                  |g d      S )Nr   r   rW   r   r	   perm)rd   reshaper   r   	transpose)rS   r]   
batch_sizes      r/   transpose_for_scoresz&TFLxmertAttention.transpose_for_scores5  s8    JJq:r4+C+CTE]E]^_||AL11r.   c                   t        |      d   }| j                  |      }| j                  |      }| j                  |      }	| j	                  ||      }
| j	                  ||      }| j	                  |	|      }t        j                  |
|d      }t        j                  t        |      d   |j                        }|t
        j                  j                  |      z  }|&t        j                  ||j                        }||z   }t        |d      }| j                  ||      }t        j                  ||      }t        j                  |g d	      }t        j                  ||d| j                  f      }|r||f}|S |f}|S )
Nr   T)transpose_br   dtyper   rX   r   r   )r   r   r   r   r   rd   matmulcastr   mathsqrtr   rL   r   r   r   )rS   hidden_statescontextattention_maskoutput_attentionsrY   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresdkattention_probscontext_layeroutputss                     r/   r`   zTFLxmertAttention.call:  sx   .q1
 JJ}5((7+ JJw///0A:N--ozJ	//0A:N 99
 WWZ	*2.6F6L6LM+bggll2.>>%WW^;K;Q;QRN/.@ ))9C ,,,J		/;?]F

JD,>,>?
 7H=/2 O\M]r.   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   ~xY w# 1 sw Y   y xY w)NTr   r   r   )rb   rc   rd   re   r   r<   rf   rQ   rF   r   r   r   rg   s     r/   rf   zTFLxmertAttention.build`  s$   ::
4$'3tzz/ H

  $dkk.E.E!FGH4%1txx}}- ;dDLL9:;4$'3tzz/ =

  $dll!;<= = 4H H; ;= =s$   3E<)E"#)E.E"E+.E7ri   rj   )r(   r)   r*   rC   r   r`   rf   rl   rm   s   @r/   r   r     s    B2
$L=r.   r   c                  ,     e Zd Z fdZd ZddZ xZS )TFLxmertIntermediatec                T   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        |j                  t              r"t        |j                        | _        || _        y |j                  | _        || _        y )Ndenser:   r-   )rB   rC   r   rD   rE   intermediate_sizer   rG   r   
isinstance
hidden_actstrr
   intermediate_act_fnrQ   rR   s      r/   rC   zTFLxmertIntermediate.__init__p  s    "6"\\''$$.v/G/GH ( 


 f''-'89J9J'KD$  (.'8'8D$r.   c                J    | j                  |      }| j                  |      }|S rj   )r   r   rS   r   s     r/   r`   zTFLxmertIntermediate.call}  s&    

=100?r.   c                (   | j                   ry d| _         t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   y xY wNTr   	rb   rc   rd   re   r   r<   rf   rQ   rF   rg   s     r/   rf   zTFLxmertIntermediate.build  }    ::
4$'3tzz/ H

  $dkk.E.E!FGH H 4H H   3BBrj   rk   rm   s   @r/   r   r   o  s    
Hr.   r   c                  .     e Zd Z fdZddZddZ xZS )TFLxmertOutputc                v   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        j                  j                  |j                  d      | _        t        j                  j                  |j                        | _        || _        y Nr   r:   rq   r>   r-   rB   rC   r   rD   rE   rF   r   rG   r   rH   rI   rq   rJ   rK   rL   rQ   rR   s      r/   rC   zTFLxmertOutput.__init__  s    "6"\\''.v/G/GH ( 

 88AVAV]h8i||++F,F,FGr.   c                t    | j                  |      }| j                  ||      }| j                  ||z         }|S rj   r   rL   rq   rS   r   input_tensorrY   s       r/   r`   zTFLxmertOutput.call  s9    

=1]H=}|'CDr.   c                "   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY wNTr   rq   )rb   rc   rd   re   r   r<   rf   rQ   r   rq   rF   rg   s     r/   rf   zTFLxmertOutput.build  s    ::
4$'3tzz/ N

  $dkk.K.K!LMN4d+7t~~223 L$$dD$++2I2I%JKL L 8N NL L   3C9<3D9DDri   rj   rk   rm   s   @r/   r   r     s    
	Lr.   r   c                  .     e Zd Z fdZddZddZ xZS )TFLxmertAttentionOutputc                v   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        j                  j                  |j                  d      | _        t        j                  j                  |j                        | _        || _        y r   r   rR   s      r/   rC   z TFLxmertAttentionOutput.__init__  s    "6"\\''.v/G/GH ( 


 88AVAV]h8i||++F,F,FGr.   c                v    | j                  |      }| j                  ||      }| j                  ||z         }|S )NrX   r   r   s       r/   r`   zTFLxmertAttentionOutput.call  s;    

=1]XF}|'CDr.   c                "   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY wr   
rb   rc   rd   re   r   r<   rf   rQ   rF   rq   rg   s     r/   rf   zTFLxmertAttentionOutput.build      ::
4$'3tzz/ H

  $dkk.E.E!FGH4d+7t~~223 L$$dD$++2I2I%JKL L 8H HL Lr   ri   rj   rk   rm   s   @r/   r   r     s    		Lr.   r   c                  .     e Zd Z fdZddZddZ xZS )TFLxmertSelfAttentionLayerc                l    t        |   di | t        |d      | _        t	        |d      | _        y )NrS   r<   r_   r-   )rB   rC   r   rS   r   attention_outputrR   s      r/   rC   z#TFLxmertSelfAttentionLayer.__init__  s0    "6"%f6:	 7X Nr.   c                t    | j                  ||||      }|r|d   }| j                  |d   |      }|r|fS |fS )Nr   r   )rS   r   )rS   r   r   r   rY   self_outputr   r   s           r/   r`   zTFLxmertSelfAttentionLayer.call  sQ    iilNL]^)!nO00QN6G /2`N^M``r.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTrS   r   )rb   rc   rd   re   rS   r<   rf   r   rg   s     r/   rf   z TFLxmertSelfAttentionLayer.build  s    ::
4&2tyy~~. &		%&4+T2>t4499: 2%%++D12 2 ?& &2 2   C%CCC ri   rj   rk   rm   s   @r/   r   r     s    O
a	2r.   r   c                  2     e Zd Z fdZ	 	 ddZddZ xZS )TFLxmertCrossAttentionLayerc                l    t        |   di | t        |d      | _        t	        |d      | _        y )Nattr   r_   r-   )rB   rC   r   r   r   r   rR   s      r/   rC   z$TFLxmertCrossAttentionLayer.__init__  s0    "6"$V%8 7X Nr.   c                    | j                  |||||      }|r|d   }| j                  |d   ||      }|r|f}	|	S |f}	|	S NrX   r   r   )r   r   )
rS   r   
ctx_tensorctx_att_maskr   rY   r_   r   r   r   s
             r/   r`   z TFLxmertCrossAttentionLayer.call  sj     ,
LBS^fg$QiO00LS[0\9J#_5 RbPcr.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr   r   )rb   rc   rd   re   r   r<   rf   r   rg   s     r/   rf   z!TFLxmertCrossAttentionLayer.build  s    ::
4%1txx}}- %t$%4+T2>t4499: 2%%++D12 2 ?% %2 2r   )FFrj   rk   rm   s   @r/   r   r     s    O  	2r.   r   c                  .     e Zd Z fdZddZddZ xZS )TFLxmertLayerc                    t        |   di | t        |d      | _        t	        |d      | _        t        |d      | _        y )N	attentionr   intermediater_   r-   )rB   rC   r   r   r   r   r   transformer_outputrR   s      r/   rC   zTFLxmertLayer.__init__  s?    "6"3FM0nM"0h"Gr.   c                    | j                  ||||      }|d   }| j                  |      }| j                  |||      }|f|dd  z   }	|	S )NrX   r   r   )r   r   r   )
rS   r   r   r   rY   attention_outputsr   intermediate_outputlayer_outputr   s
             r/   r`   zTFLxmertLayer.call
  sk     NN=.J[fnNo,Q/"//0@A../BDT_g.h/$5ab$99r.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY w)NTr   r   r   )	rb   rc   rd   re   r   r<   rf   r   r   rg   s     r/   rf   zTFLxmertLayer.build  s   ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-.4-t4@t66;;< 4''--d34 4 A+ +. .4 4$   D%%D1?D=%D.1D:=Eri   rj   rk   rm   s   @r/   r   r     s    H4r.   r   c                  L     e Zd Z fdZ	 ddZ	 ddZddZ	 ddZddZ xZ	S )	TFLxmertXLayerc                    t        |   d	i | t        |d      | _        t	        |d      | _        t	        |d      | _        t        |d      | _        t        |d      | _
        t        |d      | _        t        |d      | _        y )
Nvisual_attentionr   lang_self_attvisn_self_att
lang_interlang_output
visn_intervisn_outputr-   )rB   rC   r   r  r   r  r  r   r	  r   r
  r  r  rR   s      r/   rC   zTFLxmertXLayer.__init__"  s    "6" ;FI[ \ 8_U7_U /vLI)&}E.vLI)&}Er.   c                
   t        j                  |      }t        j                  |      }t        j                  |      }	t        j                  |      }
| j                  ||	|||      }| j                  |
||||      }||fS )N)r   rY   )rd   identityr  )rS   
lang_inputlang_attention_maskrZ   visn_attention_maskr   rY   lang_attention_lang_inputvisn_attention_lang_inputlang_attention_visn_inputvisn_attention_visn_inputlang_att_outputvisn_att_outputs                r/   	cross_attzTFLxmertXLayer.cross_att0  s     %'KK
$;!$&KK
$;!$&KK
$;!$&KK
$;!//%%/ 0 
 //%%/ 0 
 //r.   c                n    d}| j                  ||||      }| j                  ||||      }|d   |d   fS )NFrX   r   )r  r  )	rS   r  r  rZ   r  rY   r   r  r  s	            r/   self_attzTFLxmertXLayer.self_attQ  sW     ",,Z9LN_jr,s,,Z9LN_jr,sq!?1#555r.   c                    | j                  |      }| j                  |      }| j                  |||      }| j                  |||      }||fS rj   )r	  r  r
  r  )rS   r  rZ   rY   lang_inter_outputvisn_inter_outputr
  r  s           r/   	output_fczTFLxmertXLayer.output_fc_  sW     OOJ7 OOJ7 &&'8*hO&&'8*hOK''r.   c                    |}|}| j                  ||||||      \  }}|dd  }	| j                  |d   ||d   ||      \  }}| j                  |||      \  }
}|r|
||	d   fS |
|fS r   )r  r  r  )rS   
lang_featsr  
visn_featsr  r   rY   r  r  r   r
  r  s               r/   r`   zTFLxmertXLayer.calli  s     %$+/>> ,: ,
( *!"-+/==AA ,9 ,
( $(>>/?]e>#f [AR[/!*<=rYdfqXrrr.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   (xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   AxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)	NTr  r  r  r	  r
  r  r  )rb   rc   rd   re   r  r<   rf   r  r  r	  r
  r  r  rg   s     r/   rf   zTFLxmertXLayer.build  sF   ::
4+T2>t4499: 2%%++D124$/;t11667 /""((./4$/;t11667 /""((./4t,8t334 ,%%d+,4-9t//445 -  &&t,-4t,8t334 ,%%d+,4-9t//445 -  &&t,- - :#2 2/ // /, ,- -, ,- -sT   J%J?J'J43KK'KJJ$'J14J>K
KK"ri   rj   )
r(   r)   r*   rC   r  r  r  r`   rf   rl   rm   s   @r/   r  r  !  s4    F* 0N 6(" s@-r.   r  c                  <     e Zd Z fdZ	 	 	 	 	 	 	 ddZddZ xZS )TFLxmertEncoderc                   t        |   di | t        |d      | _        |j                  | _        |j                  | _        |j                  | _	        t        | j
                        D cg c]  }t        |d|        c}| _        t        | j                        D cg c]  }t        |d|        c}| _        t        | j                        D cg c]  }t        |d|        c}| _        || _        y c c}w c c}w c c}w )Nr9   r   zlayer_._zx_layers_._zr_layers_._r-   )rB   rC   r7   r9   l_layersnum_l_layersx_layersnum_x_layersr_layersnum_r_layersr   r   layerr  rQ   )rS   rQ   rT   irU   s       r/   rC   zTFLxmertEncoder.__init__  s    "6"3FK #OO"OO"OO KPPTPaPaJbcQmF8A3@c
QVW[WhWhQijA{1#5FGjPUVZVgVgPhi1vk!4EFi djis   -D #DD
c           	     ^   d}d}	|s| j                   j                  rdnd }
|s| j                   j                  rdnd }|s| j                   j                  rdnd }| j                  ||g|      }| j                  D ]%  } |||||      }|d   }|	|fz   }	|||d   fz   }' | j                  D ]%  } |||||      }|d   }||fz   }|
|
|d   fz   }
' | j
                  D ]0  } |||||||      }|d d \  }}||fz   }|	|fz   }	|(||d   fz   }2 ||r|
nd f}|	|r|nd f}|||r|fS d fS )Nr-   rX   r   r   rW   )rQ   r   r9   r,  r*  r(  )rS   r   r  visual_feats
visual_posvisual_attention_maskr   rY   r$   r#   r&   r%   r'   layer_module	l_outputs	v_outputs	x_outputsvisual_encoder_outputslang_encoder_outputss                      r/   r`   zTFLxmertEncoder.call  s     "!#"3t{{7T7TBZ^$59V9Vb\`):dkk>[>[2ae ||\:$>|R !JJ 	LL$Z1DFWbjkI"1J%;zm%K"".&9Yq\O&K#	L !MM 
	HL$%!!	I %Q<L#7</#I  ,$51$G!
	H !MM 	VL$#%!!I (1!}$J#7</#I %;zm%K"'3+CyQR|o+U(	V  !!2"

 ##4$ 
 # (9$
 	
 @D
 	
r.   c                P   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       J| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = t        | dd       J| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   oxY w)NTr9   r,  r(  r*  )
rb   rc   rd   re   r9   r<   rf   r,  r(  r*  rS   rh   r,  s      r/   rf   zTFLxmertEncoder.build  sp   ::
4D)5t||001 )""4()4$'3 &]]5::. &KK%& && 4T*6 &]]5::. &KK%& && 4T*6 &]]5::. &KK%& && 7) )& && && &s0   E6*FFF6F F	F	F%	)NNNNNNFrj   rk   rm   s   @r/   r$  r$    s+    &  "D
L&r.   r$  c                  d     e Zd ZeZ fdZd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 dd       Z
ddZ xZS )	TFLxmertMainLayerc                   t        |   di | || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        |j                  | _
        |j                  | _        |j                  | _        t        |d      | _        t!        |d      | _        t%        |d      | _        || _        y )Nrz   r   encoderpoolerr-   )rB   rC   rQ   r&  r'  r(  r)  r*  r+  rG   r   output_hidden_statesuse_return_dictreturn_dictro   rz   r$  r=  TFLxmertPoolerr>  rR   s      r/   rC   zTFLxmertMainLayer.__init__  s    "6""OO"OO"OO!'!9!9!'!9!9$*$?$?!!11,V,G&vI>$V(;r.   c                    | j                   S rj   )rz   rS   s    r/   get_input_embeddingsz&TFLxmertMainLayer.get_input_embeddings%  s    r.   c                `    || j                   _        t        |      d   | j                   _        y Nr   )rz   rv   r   r}   rS   r   s     r/   set_input_embeddingsz&TFLxmertMainLayer.set_input_embeddings(  s$    !&%/%6q%9"r.   c                    t         rj   )NotImplementedError)rS   heads_to_prunes     r/   _prune_headszTFLxmertMainLayer._prune_heads,  s    !!r.   c           
        ||t        d      |t        |      }n|t        |      d d }nt        d      ||t        d      |t        j                  |d      }|t        j                  |d      }| j	                  ||||      }t        j
                  ||d   dd|d   f      }t        j                  ||j                        }t        j                  d|j                        }t        j                  d	|j                        }t        j                  t        j                  ||      |      }|t        j
                  ||d   dd|d   f      }t        j                  t        j                  |d
      d
      }t        j                  ||j                        }t        j                  t        j                  ||      |      }nd }| j                  |||||||      }|d d \  }}|d   }|d   }d}|r|d   }|d   }|d   }|||f}|	r||fnd}|d   }|d   }| j                  |      }|
s|||f|z   |z   S t        ||||	r|nd |	r|nd |rnd |rnd |r      S d       S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedszGvisual_feats and visual_pos cannot be `None` in LXMERT's `call` method.r   r   r         ?g     r   rW   r-   )r!   r   r    r#   r$   r%   r&   r'   )r   r   rd   r   rz   r   r   r   constantmultiplysubtractr   r=  r>  r   )rS   r   r/  r0  r   r1  r   r   r   r?  rA  rY   rh   embedding_outputextended_attention_maskone_cstten_thousand_cstextended_visual_attention_maskencoder_outputsr6  r7  r$   r#   all_attentionsr%   r&   r'   r   visual_outputr
  r!   s                                  r/   r`   zTFLxmertMainLayer.call/  s     ]%>cdd"$Y/K&$]3CR8KTUU!5fgg!WW[!4N!WW[!4N  ??9nmU]^ #%**^k!naQRT_`aTb=c"d #%''*AIYI_I_"`++c)9)?)?@;;x7G7M7MN"$++bkk'CZ.[]m"n ,-/ZZ8MP[\]P^`acdfqrsftOu-v*-/^^BNNK`gh<ipq-r*-/WW5S[k[q[q-r*-/[[G%CDFV.* .2* ,,#*
 8Gr7J4 45a8!5a!8"6q"9 6q 9'6q'9$#!(N K_/1EFdf,R0,R0K0>NQ___"'''=Q#9W[9M!5SW7H 3d3D/$AR%=	
 		
 Y]	
 		
r.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY w)NTrz   r=  r>  )	rb   rc   rd   re   rz   r<   rf   r=  r>  rg   s     r/   rf   zTFLxmertMainLayer.build  s   ::
4t,8t334 ,%%d+,4D)5t||001 )""4()44(4t{{//0 (!!$'( ( 5, ,) )( (r  NNNNNNNNNNFrj   )r(   r)   r*   r   config_classrC   rE  rI  rM  r   r`   rf   rl   rm   s   @r/   r;  r;    s[    L :"  "!j
 j
X(r.   r;  c                  8    e Zd ZdZeZdZed        Zed        Z	y)TFLxmertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    lxmertc                   d}d}t        j                  g dg dgt         j                        }t         j                  j	                  ||| j
                  j                  f      }t         j                  j	                  ||df      }|||dS )n
        Dummy inputs to build the network.

        Returns:
            tf.Tensor with dummy inputs
        rW   
   r	         rW   r	      r   rh  r   r/  r0  )rd   rP  int32randomuniformrQ   rM   )rS   r   num_visual_featuresr   r/  r0  s         r/   dummy_inputsz$TFLxmertPreTrainedModel.dummy_inputs  s     
 KKI 6bhhG	yy((*6I4;;KfKf)ghYY&&
4G'KL
 #($
 	
r.   c           
        t        j                  dt         j                  d      t        j                  dt         j                  d      t        j                  d d | j                  j                  ft         j
                  d      t        j                  dt         j
                  d      t        j                  dt         j                  d      t        j                  dt         j                  d	      d
S )N)NNr   r   r   r/  )NNrh  r0  r1  r   )r   r   r/  r0  r1  r   )rd   
TensorSpecrj  rQ   rM   float32rD  s    r/   input_signaturez'TFLxmertPreTrainedModel.input_signature  s     |RXXKP mmL"((IYZMM4t{{7R7R*SUWU_U_ftu--,W%']]<Pg%h mmL"((IYZ
 	
r.   N)
r(   r)   r*   r+   r   r]  base_model_prefixpropertyrn  rr  r-   r.   r/   r_  r_    s:    
  L 
 
& 
 
r.   r_  a
  

    The LXMERT model was proposed in [LXMERT: Learning Cross-Modality Encoder Representations from
    Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. It's a vision and language transformer
    model, pre-trained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual
    genome, using a combination of masked language modeling, region of interest feature regression, cross entropy loss
    for question answering attribute prediction, and object tag prediction.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`LxmertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
            [`PreTrainedTokenizer.encode`] for details.

            [What are input IDs?](../glossary#input-ids)
        visual_feats (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
            This input represents visual features. They ROI pooled object features from bounding boxes using a
            faster-RCNN model)

            These are currently not provided by the transformers library.
        visual_pos (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
            This input represents spacial features corresponding to their relative (via index) visual features. The
            pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
            1.

            These are currently not provided by the transformers library.
        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        visual_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            MMask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
z`The bare Lxmert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZe ee       eee	e
      	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     ZddZ xZS )TFLxmertModelc                P    t        |   |g|i | t        |d      | _        y )Nr`  r   )rB   rC   r;  r`  )rS   rQ   r   rT   rU   s       r/   rC   zTFLxmertModel.__init__D  s(    3&3F3'X>r.   )
checkpointoutput_typer]  c                <    | j                  |||||||||	|
|      }|S rj   )r`  )rS   r   r/  r0  r   r1  r   r   r   r?  rA  rY   r   s                r/   r`   zTFLxmertModel.callH  s:    * ++! 
 r.   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr`  )rb   rc   rd   re   r`  r<   rf   rg   s     r/   rf   zTFLxmertModel.buildm  si    ::
44(4t{{//0 (!!$'( ( 5( (   A11A:r\  )r   TFModelInputType | Noner/  r   r0  r   r   np.ndarray | tf.Tensor | Noner1  r~  r   r~  r   r~  r   Optional[bool]r?  r  rA  r  rY   boolreturnz!Union[Tuple, TFLxmertModelOutput]rj   )r(   r)   r*   rC   r   r   LXMERT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr`   rf   rl   rm   s   @r/   rv  rv  ?  s    
? *+BC&'$ .2)-'+8<?C8<7;,0/3&** ' %	
 6  = 6 5 * - $  
+ D <(r.   rv  c                  ,     e Zd Z fdZd ZddZ xZS )rB  c                    t        |   di | t        j                  j	                  |j
                  t        |j                        dd      | _        || _	        y )Ntanhr   )r;   
activationr<   r-   )
rB   rC   r   rD   rE   rF   r   rG   r   rQ   rR   s      r/   rC   zTFLxmertPooler.__init__w  sT    "6"\\''.v/G/GH	 ( 

 r.   c                :    |d d df   }| j                  |      }|S rG  )r   )rS   r   first_token_tensorr!   s       r/   r`   zTFLxmertPooler.call  s(     +1a40

#56r.   c                (   | j                   ry d| _         t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   y xY wr   r   rg   s     r/   rf   zTFLxmertPooler.build  r   r   rj   rk   rm   s   @r/   rB  rB  v  s    Hr.   rB  c                  0     e Zd Zd fdZddZddZ xZS )TFLxmertPredictionHeadTransformc                   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        |j                  t              rt        |j                        | _        n|j                  | _        t        j                  j                  |j                  d      | _        || _        y )Nr   )unitsr;   r<   rq   r>   r-   )rB   rC   r   rD   rE   rF   r   rG   r   r   r   r   r
   transform_act_fnrH   rI   rq   rQ   rR   s      r/   rC   z(TFLxmertPredictionHeadTransform.__init__  s    "6"\\''$$.v/G/GH ( 

 f''-$5f6G6G$HD!$*$5$5D!88AVAV]h8ir.   c                p    | j                  |      }| j                  |      }| j                  |      }|S )Nr   )r   r  rq   r   s     r/   r`   z$TFLxmertPredictionHeadTransform.call  s8    

-
8--m<m<r.   c                "   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY wr   r   rg   s     r/   rf   z%TFLxmertPredictionHeadTransform.build  r   r   )rQ   r   r   	tf.Tensorr  r  rj   rk   rm   s   @r/   r  r    s    "	Lr.   r  c                  P     e Zd Zd fdZd	dZd
dZddZddZddZddZ	 xZ
S )TFLxmertLMPredictionHeadc                    t        |   di | || _        |j                  | _        t	        |d      | _        || _        y )N	transformr   r-   )rB   rC   rQ   rF   r  r  input_embeddingsrS   rQ   r  rT   rU   s       r/   rC   z!TFLxmertLMPredictionHead.__init__  s@    "6"!--8kR !1r.   c                X   | j                  | j                  j                  fddd      | _        | j                  ry d| _        t        | dd       Nt        j                  | j                  j                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NzerosTbias)rw   rx   	trainabler<   r  )r|   rQ   r}   r  rb   rc   rd   re   r  r<   rf   rg   s     r/   rf   zTFLxmertLMPredictionHead.build  s    OO4;;+A+A*CQXdhouOv	::
4d+7t~~223 +$$T*+ + 8+ +s   :B  B)c                    | j                   S rj   )r  rD  s    r/   get_output_embeddingsz.TFLxmertLMPredictionHead.get_output_embeddings  s    $$$r.   c                `    || j                   _        t        |      d   | j                   _        y rG  )r  rv   r   r}   rH  s     r/   set_output_embeddingsz.TFLxmertLMPredictionHead.set_output_embeddings  s(    ',$+5e+<Q+?(r.   c                    d| j                   iS )Nr  )r  rD  s    r/   get_biasz!TFLxmertLMPredictionHead.get_bias  s    		""r.   c                X    |d   | _         t        |d         d   | j                  _        y )Nr  r   )r  r   rQ   r}   rH  s     r/   set_biasz!TFLxmertLMPredictionHead.set_bias  s'    &M	!+E&M!:1!=r.   c                   | j                  |      }t        |      d   }t        j                  |d| j                  g      }t        j
                  || j                  j                  d      }t        j                  |d|| j                  j                  g      }t        j                  j                  || j                        }|S )Nr   r   r   )tensorrw   T)abr   )r   r  )r  r   rd   r   rF   r   r  rv   rQ   r}   nnbias_addr  )rS   r   
seq_lengths      r/   r`   zTFLxmertLMPredictionHead.call  s    ]C.q1


-DDTDT?UV		MT5J5J5Q5Q_cd

-JPTP[P[PfPf?gh]Kr.   rQ   r   r  keras.layers.Layerrj   )r  r  )r   ztf.Variable)r  zDict[str, tf.Variable]r  )r(   r)   r*   rC   rf   r  r  r  r  r`   rl   rm   s   @r/   r  r    s'    
1+%@#>r.   r  c                  0     e Zd Zd fdZddZddZ xZS )TFLxmertMLMHeadc                J    t        |   di | t        ||d      | _        y )Npredictionsr   r-   )rB   rC   r  r  r  s       r/   rC   zTFLxmertMLMHead.__init__  s&    "6"3F<LS`ar.   c                *    | j                  |      }|S )Nr  )r  )rS   sequence_outputprediction_scoress      r/   r`   zTFLxmertMLMHead.call  s     ,,?,K  r.   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr  )rb   rc   rd   re   r  r<   rf   rg   s     r/   rf   zTFLxmertMLMHead.build  sm    ::
4-9t//445 -  &&t,- - :- -r|  r  )r  r  r  r  rj   rk   rm   s   @r/   r  r    s    b
!
-r.   r  c                  ,     e Zd Z fdZd ZddZ xZS )TFLxmertPreTrainingHeadsc                    t        |   di | t        ||d      | _        t        j
                  j                  dt        |j                        d      | _	        || _
        y )Nr  r   rW   seq_relationshipr:   r-   )rB   rC   r  r  r   rD   rE   r   rG   r  rQ   r  s       r/   rC   z!TFLxmertPreTrainingHeads.__init__  s_    "6"3F<LS`a % 2 2.v/G/GH# !3 !

 r.   c                N    | j                  |      }| j                  |      }||fS rj   )r  r  )rS   r  r!   r  seq_relationship_scores        r/   r`   zTFLxmertPreTrainingHeads.call	  s0     ,,_=!%!6!6}!E "888r.   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr  r  )
rb   rc   rd   re   r  r<   rf   r  rQ   rF   rg   s     r/   rf   zTFLxmertPreTrainingHeads.build  s    ::
4-9t//445 -  &&t,-4+T2>t4499: S%%++T49P9P,QRS S ?- -S Ss   C"%3C."C+.C7rj   rk   rm   s   @r/   r  r    s    	9
	Sr.   r  c                  ,     e Zd Z fdZd ZddZ xZS )TFLxmertVisualAnswerHeadc                   t        |   di | |j                  }t        j                  j                  |dz  t        |j                        d      | _        t        d      | _
        t        j                  j                  |j                  d      | _        t        j                  j                  |t        |j                        d      | _        || _        y )	NrW   zlogit_fc_._0r:   geluzlogit_fc_._2r>   zlogit_fc_._3r-   )rB   rC   rF   r   rD   rE   r   rG   r   r
   r  rH   rI   
layer_normdense_1hid_dim)rS   rQ   
num_labelsrT   r  rU   s        r/   rC   z!TFLxmertVisualAnswerHead.__init__  s    "6"$$\\''aK.v/G/GH ( 


 ,F3,,99&BWBW^l9m||)).v/G/GH * 

 r.   c                    | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rj   )r   r  r  r  r   s     r/   r`   zTFLxmertVisualAnswerHead.call,  s@    

=166]3r.   c                   | j                   ry d| _         t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       \t        j                  | j                  j
                        5  | j                  j                  d | j                  dz  g       d d d        t        | dd       ^t        j                  | j                  j
                        5  | j                  j                  d d | j                  dz  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr   r  rW   r  )
rb   rc   rd   re   r   r<   rf   r  r  r  rg   s     r/   rf   zTFLxmertVisualAnswerHead.build4  s1   ::
4$'3tzz/ =

  $dll!;<=4t,8t334 @%%tT\\A-=&>?@4D)5t||001 C""D$q0@#ABC C 6= =@ @C Cs$   )E2+E,E)EE&)E2rj   rk   rm   s   @r/   r  r    s    "Cr.   r  c                  ,     e Zd Z fdZd ZddZ xZS )TFLxmertVisualObjHeadc           
        t        |   di | t        |d      | _        i }|j                  rd|j
                  d|d<   |j                  rd|j                  d|d<   |j                  rd|j                  d|d<   || _
        | j                  D ci c]K  }|t        j                  j                  | j                  |   d	   t        |j                        d
|       M c}| _        || _        y c c}w )Nr  r   r   )rw   numobjattr)r   i   featr  zdecoder_dict.r:   r-   )rB   rC   r  r  visual_obj_lossnum_object_labelsvisual_attr_lossnum_attr_labelsvisual_feat_lossrM   visual_lossesr   rD   rE   r   rG   decoder_dictrQ   )rS   rQ   rT   r  r   rU   s        r/   rC   zTFLxmertVisualObjHead.__init__D  s   "6"8kR !!-26;S;S#TM% "".3F<R<R$SM&!"".8AWAW$XM&!* ))
  ##""3'.#263K3K#L$SE* $  
 
s   AC4c                ~    | j                  |      }i }| j                  D ]  } | j                  |   |      ||<    |S rj   )r  r  r  )rS   r   r_   r   s       r/   r`   zTFLxmertVisualObjHead.call^  sL    }5%% 	@C0$++C0?F3K	@r.   c                
   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       p| j                  j                         D ]R  }t        j                  |j
                        5  |j                  d d | j                  j                  g       d d d        T y y # 1 sw Y   xY w# 1 sw Y   lxY w)NTr  r  )rb   rc   rd   re   r  r<   rf   r  valuesrQ   rF   r9  s      r/   rf   zTFLxmertVisualObjHead.builde  s    ::
4d+7t~~223 +$$T*+4.:**113 G]]5::. GKKtT[[-D-D EFG GG ;+ +G Gs   C-8)C9-C69D	rj   rk   rm   s   @r/   r  r  C  s    4
Gr.   r  z4Lxmert Model with a `language modeling` head on top.c                       e Zd Z fdZed        Zd Zd Ze e	e
       eee      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	d                     Zd
dZ xZS )TFLxmertForPreTrainingc                   t        |   |g|i | || _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _        t        |d      | _
        t        || j                  j                  d      | _        | j                  rt        |d      | _        | j                  rt!        || j                  d      | _        t$        j&                  j)                  dd      t$        j&                  j+                  d	
      t$        j&                  j+                  d	
      d| _        i }|j.                  rd|j0                  dd|d<   |j2                  rd|j4                  dd|d<   |j6                  rd|j8                  f|j8                  dd|d<   || _        y )Nr`  r   clsobj_predict_headanswer_headrO  
huber_loss)deltar<   T)from_logits)l2visn_cecer  r  )rw   r  r2   r  r  r   r  r  )rB   rC   rQ   num_qa_labelsvisual_loss_normalizertask_mask_lmtask_obj_predicttask_matchedtask_qar;  r`  r  rz   r  r  r  r  r  r   lossesHuberSparseCategoricalCrossentropy	loss_fctsr  r  r  r  r  rM   r  )rS   rQ   r   rT   r  rU   s        r/   rC   zTFLxmertForPreTraining.__init__t  s   3&3F3#11&,&C&C# #// & 7 7"//~~ (X> ,FDKK4J4JQVW  $9&GY$ZD!<<7@R@RYfgD ,,$$3\$B||AAdAS,,<<<N
 !!//!$M% 
 ""--!%M&!
 ""f445--%M&!
 +r.   c                   d}d}t        j                  g dg dgt         j                        }t         j                  j	                  ||| j
                  j                  f      }t         j                  j	                  ||df      }| j
                  j                  ri }| j
                  j                  rG| j
                  j                  r1t        j                  ||g      t        j                  ||g      fd<   | j
                  j                  r\| j
                  j                  rFt        j                  ||| j
                  j                  g      t        j                  ||g      fd<   | j
                  j                  rG| j
                  j                  r1t        j                  ||g      t        j                  ||g      fd	<   i |||d
| j
                  j                  rdiS i S )rb  rW   rc  rd  rg  r   rh  r  r  r  ri  
obj_labels)rd   rP  rj  rk  rl  rQ   rM   r  r  onesr  r  )rS   r   rm  r   r/  r0  r  s          r/   rn  z#TFLxmertForPreTraining.dummy_inputs  s    
 KKI 6bhhG	yy((*6I4;;KfKf)ghYY&&
4G'KL
;;''J;;''DKK,H,H%89:%89:"Jv ;;''DKK,H,H%8$++:U:UVW%89:"Jv ;;&&4;;+G+G%89:%89:!Ju

& ,(
 .2[[-I-Ij)
 	
 PR
 	
r.   c                .    | j                   j                  S rj   )r  r  rD  s    r/   get_lm_headz"TFLxmertForPreTraining.get_lm_head  s    xx###r.   c                    t        j                  dt               | j                  dz   | j                  j                  z   dz   | j                  j
                  j                  z   S )NzMThe method get_prefix_bias_name is deprecated. Please use `get_bias` instead./)warningswarnFutureWarningr<   r  r  rD  s    r/   get_prefix_bias_namez+TFLxmertForPreTraining.get_prefix_bias_name  sG    egtuyy3.4txx7K7K7P7PPPr.   )ry  r]  c                   | j                  |||||||||||      }|d   |d   |d   }}}| j                  ||      \  }}| j                  r| j                  |      }n|d   d   }||
|	|dnt	        j
                  d      }d}|l| j                  r` | j                  d   t	        j                  |dg      t	        j                  |d| j                  j                  g            }||z  }||fz  }|
X| j                  rL | j                  d   t	        j                  |
dg      t	        j                  |ddg            }||z  }||fz  }|	.| j                  r!d}| j                  |      }| j                  j                         D ]  \  }}|	|   \  }} |d	   }!|d
   }"|d   }#| j                   }$| j                  |"   }%||   }& |%t	        j                  ||#      t	        j                  |&d|!g            }'|'j"                  dkD  rt	        j$                  |'      }'t	        j$                  |'t	        j&                  t	        j                  | dg      |'j(                        z        |$z  }'||'z  }||'fz  } ||z  }|b| j                  rV | j                  d   t	        j                  |dg      t	        j                  |d| j*                  g            }(||(z  }||(fz  }|s|||f|dd z   })||f|)z   S |)S t-        |||||j.                  |j0                  |j2                  |j4                  |j6                  	      S )ag  
        masked_lm_labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        obj_labels (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
            each key is named after each one of the visual losses and each element of the tuple is of the shape
            `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and
            the label score respectively
        matched_label (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the whether or not the text input matches the image (classification) loss. Input
            should be a sequence pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

            - 0 indicates that the sentence does not match the image,
            - 1 indicates that the sentence does match the image.
        ans (`tf.Tensor` of shape `(batch_size)`, *optional*, defaults to `None`):
            a one hot representation hof the correct answer *optional*

        Returns:
        r   r   rW   Ng        r-   r  r   r  r2   rw   r	   )	r2   r3   r4   r5   r#   r$   r%   r&   r'   )r`  r  r  r  rd   rP  r  r  r   rQ   r}   r  r  r  r  itemsr  ndimreduce_meanr   r   r  r1   r#   r$   r%   r&   r'   )*rS   r   r/  r0  r   r1  r   r   masked_lm_labelsr  matched_labelansr   r?  rA  rY   lxmert_outputr
  rZ  r!   lang_prediction_scoresr4   answer_score
total_lossr  masked_lm_lossmatched_losstotal_visn_lossvisn_prediction_scores_dictr   key_infolabel	mask_conf
output_dimloss_fct_namelabel_shaperv   visn_loss_fctvisn_prediction_scores	visn_lossanswer_lossr_   s*                                             r/   r`   zTFLxmertForPreTraining.call  s   T ! 
 !!! %2]
 <@88KQ^;_8 8<<++M:L(+A.L !(]-BzGY^a^i S! 	
 'D,=,=1T^^D1

+bT2

1B8N8N3OPN .(J~''F$):):/4>>$/

=2$/

3b!W=L ,&J|o%F!d&;&;!O*.*?*?*N'!%!3!3!9!9!; 'X#-c? y%e_
 ( 0&w/44 $} =)DS)I&)JJuk2JJ5J7GH	
 >>A% "y 9INN9rwwrzz)VXUY?Z\e\k\k7l+lmpvv	9,9,&#'$ /)J?t||.$...

3%rzz,TEWEW@X'YK +%J{n$F &( ab!	"F
 0:/EZMF*Q6Q+4%=%1#0#G#G!.!C!C - A A+==%2%K%K

 
	
r.   c                `   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   ~xY w# 1 sw Y   y xY w)NTr`  r  r  r  )
rb   rc   rd   re   r`  r<   rf   r  r  r  rg   s     r/   rf   zTFLxmertForPreTraining.builde  sR   ::
44(4t{{//0 (!!$'(4%1txx}}- %t$%4+T2>t4499: 2%%++D124-9t//445 -  &&t,- - :( (% %2 2- -s0   E?%F?FF$?F	FF!$F-)NNNNNNNNNNNNNNF) r   r}  r/  r   r0  r   r   r   r1  r   r   r   r   r   r  r   r  z-Dict[str, Tuple[tf.Tensor, tf.Tensor]] | Noner  r   r  r   r   bool | Noner?  r  rA  r  rY   r  r  z/Tuple[tf.Tensor] | TFLxmertForPreTrainingOutputrj   )r(   r)   r*   rC   rt  rn  r  r  r   r   r  r   r1   r  r`   rf   rl   rm   s   @r/   r  r  r  sD   1+f &
 &
P$Q *+BC+GVef .2)-'++/26+/*.-1DH*. $)-,0#'!I
*I
 'I
 %	I

 )I
  0I
 )I
 (I
 +I
 BI
 (I
 I
 'I
 *I
 !I
  !I
" 
9#I
 g D I
V-r.   r  )Fr+   
__future__r   r  dataclassesr   typingr   r   r   r   numpynp
tensorflowrd   activations_tfr
   modeling_tf_utilsr   r   r   r   r   r   r   tf_utilsr   r   utilsr   r   r   r   r   r   configuration_lxmertr   
get_loggerr(   loggerr  r  r   r1   rD   Layerr7   ro   r   r   r   r   r   r   r   r  r$  r;  r_  LXMERT_START_DOCSTRINGr  rv  rB  r  r  r  r  r  r  r  r-   r.   r/   <module>r'     s  "  "  ! / /   /   G  / 
		H	%3   *=+ *= *=Z ,=; ,= ,=^4Q5<<#5#5 4QnC ++ C LY=** Y=xH5<<-- H8LU\\'' L>Lell00 L<2!3!3 242%,,"4"4 2B4ELL&& 4<@-U\\'' @-Fj&ell(( j&Z V(** V( V(r&
/ &
R* X< ~ f0(+ 0(	0(fHU\\'' H8"Lell&8&8 "LL-u||11 -b-ell(( -(Su||11 S:&Cu||11 &CR,GELL.. ,G^ PRhiA-4 A- jA-r.   