
    sg۰                       d Z ddlmZ ddlZddlZddlmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZmZmZmZ ddlmZmZ dd	lmZmZmZmZmZmZmZm Z  dd
l!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(  e&jR                  e*      Z+dZ,dZ-dZ.d&dZ/	 	 	 	 	 	 	 	 d'dZ0	 	 	 	 	 	 	 	 d(dZ1d)d*dZ2d+d,dZ3 G d dejh                  jj                        Z6 G d dejh                  jj                        Z7e G d dejh                  jj                               Z8 G d de      Z9dZ:dZ; ed e:       G d! d"e9             Z< ed#e:       G d$ d%e9e             Z=y)-zTF 2.0 XGLM model.    )annotationsN)AnyOptionalTupleUnion   )get_tf_activation)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings)+TFBaseModelOutputWithPastAndCrossAttentions#TFCausalLMOutputWithCrossAttentions)TFCausalLanguageModelingLossTFModelInputTypeTFPreTrainedModelTFSharedEmbeddingsget_initializerkeraskeras_serializableunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)logging   )
XGLMConfigzfacebook/xglm-564Mr   g    חc           
        |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        | z        }t        j                  t        j                  | t        j
                        d      t        j                  |d      z  }t        j                  t        j                  t        j                  |      t        j                  |      gd      | df      }|dz  dk(  r.t        j                  |t        j                  | df      gd      }|t        j                  t        j                  |t        |      d   f      t        j                  dt        |      d   f      t        j                  t        |      d   |z
  dz
  t        |      d   f      gd      }||z  }t        j                  |d	      S )
N   i'  r   dtypeaxisr   embed_positionsname)mathlogtfexprangefloat32expand_dimsreshapeconcatsincoszerosonesr   constant)num_positionsembedding_dimpadding_idxhalf_dimemb_padding_masks         \/var/www/html/venv/lib/python3.12/site-packages/transformers/models/xglm/modeling_tf_xglm.pycreate_sinusoidal_positionsr=   <   sz   !H
((5/X\
*C
&&("**5<
=C
..-rzzB
Kbnn]`ghNi
iC
**RYYsRVVC[9B]TVDW
XCqAiibhhq'9:;!D		joa&89:!Z_Q/01C+k9A=z#q?QRS
 
 	};;s!233    c                    t        j                  | |k7  dd      }t        j                  t        j                  |d      |j                        |z   |z  }t        j                  |t         j
                        |z   S )z
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.
    r   r   r"   r    )r*   wherecastcumsumr!   int64)	input_idspast_key_values_lengthr8   maskincremental_indicess        r<   #_create_position_ids_from_input_idsrH   S   sb     88I,a3D77299T#:$**MPffjnn77&bhh7+EEr>   c                    t        |       dd }|d   }t        j                  |dz   ||z   dz   t        j                        }t        j                  t        j
                  |d      |      |z   S )z
    Args:
    We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
        inputs_embeds: tf.Tensor
    Returns: tf.Tensor
    Nr$   r   r    r   r"   )r   r*   r,   rC   broadcast_tor.   )inputs_embedsrE   r8   input_shapesequence_lengthposition_idss         r<   '_create_position_ids_from_inputs_embedsrO   `   sg     ]+CR0K!!nO88K!O_{-JQ-NVXV^V^_L??2>>,Q?MPfffr>   c           	        | d   }| d   }t        j                  ||f      t        z  }t        j                  t	        |      d         }t        j
                  |t        j                  |dz   t	        |      d   df      k  d|      }|dkD  r.t        j                  t        j                  ||f      |gd      }t        j                  |ddddddf   |dddf      S )zB
    Make causal mask used for bi-directional self-attention.
    r   r   r$           r"   N)
r*   r4   LARGE_NEGATIVEr,   r   r@   r/   r0   r3   tile)input_ids_shaperE   bsztgt_lenrF   	mask_conds         r<   _make_causal_maskrX   r   s     !
Ca G77GW%&7DD)"-.I88I

9q=:d;KB;OQR:S TTVY[_`D!yy"((G-C#DEtLSUV774dAq()CAq>::r>   c                    t        |       d   }||n|}t        j                  d      }t        j                  | |j                        } t        j
                  | ddddddf   dd|df      }||z
  t        z  S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    r   N      ?r    )r   r*   r5   rA   r!   rS   rR   )rF   rV   src_lenone_cstexpanded_masks        r<   _expand_maskr^      sx     q!G ,g'Gkk#G774w}}-DGGDD$!12Q7A4FGMm#~55r>   c                  |     e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 	 	 d fdZddZ	 	 	 	 	 d		 	 	 	 	 	 	 	 	 	 	 	 	 d
dZddZ xZS )TFXGLMAttentionz6Multi-headed attention from "Attention Is All You Needc                z   t        |   d
i | || _        || _        t        j
                  j                  |      | _        ||z  | _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _
        || _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d	      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      k_proj)use_biasr'   q_projv_projout_proj )super__init__	embed_dim	num_headsr   layersDropoutdropouthead_dim
ValueErrorscaling
is_decoderDenserb   rd   re   rf   )selfrj   rk   rn   rr   biaskwargs	__class__s          r<   ri   zTFXGLMAttention.__init__   s    	"6"""||++G4!Y.MMI%$..8MdnnM]$YKr3  }}d*$ll((T(Qll((T(Qll((T(Q**9t**Ur>   c           	         t        j                  t        j                  |||| j                  | j                  f      d      S )Nr   r   r   r   )r*   	transposer/   rk   ro   )rt   tensorseq_lenrU   s       r<   _shapezTFXGLMAttention._shape   s0    ||BJJvWdnndmm/\]_kllr>   c           
     	   |du}t        |      \  }}	}
| j                  |      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |      d|      }| j                  | j                  |      d|      }n|}| j                  | j	                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j	                  |      d|      }| j                  | j                  |      d|      }| j                  r||f}|| j                  z  d| j                  f}t        j                  | j                  ||	|      |      }t        j                  ||      }t        j                  ||      }t        |      d   }t        j                  ||d      }t        j                  j                  t        |      || j                  z  |	|gd	|| j                  z  |	|f d
t        |              |t        j                  j                  t        |      |d|	|gd|d|	|f d
t        |              t        j                  ||j                         }t        j                  ||| j                  |	|f      |z   }t        j                  ||| j                  z  |	|f      }t#        |d      }|t        j                  j                  t        |      | j                  gd| j                   d
t        |              t        j                  |d      t        j                  ||| j                  |	|f      z  }t        j                  ||| j                  z  |	|f      }| j%                  ||      }t        j                  ||      }t        j                  j                  t        |      || j                  z  |	| j                  gd|| j                  |	| j                  f d
t        |              t        j&                  t        j                  ||| j                  |	| j                  f      d      }t        j                  |||	|
f      }| j)                  |      }t        j                  ||| j                  |	|f      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   r$   r   r"   T)transpose_bz$Attention weights should be of size z	, but is messagez!Attention mask should be of size r    z/Head mask for a single layer should be of size )r   r$   r   r   trainingz `attn_output` should be of size ry   )r   rd   rq   r}   rb   re   r*   r0   rr   rk   ro   r/   matmul	debuggingassert_equalrA   r!   r   rn   rz   rf   )rt   hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskr   is_cross_attentionrU   rV   rj   query_states
key_statesvalue_states
proj_shaper[   attn_weights
attn_probsattn_outputs                      r<   callzTFXGLMAttention.call   s    .T9",]";Wi {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BKJ99nQ&7%FQOL T[[%?SIJ;;t{{='A2sKL?? ),7NDNN*B>
zz$++lGS"I:VZZ
J7
zz,
;Z(+yyztL
!!|$4>>!7G46dnn8LgW^7_6` a|,-/	 	" 	
 %LL%%>*a'*7a'8R7S T">235	 &   WW^<;M;MNN::lS$..'SZ4[\_mmL::lS4>>5I7T[4\]L%l<&LL%%?+ Et~~EW X"?346	 &  ::o}E

sDNNGWEI L ::lS4>>5I7T[4\]L\\,\B
ii
L9
!!{#4>>!7DMM:2CRVR_R_3`2a b{+,.	 	" 	
 llJJ{S$..'4==$QRT`
 jjsGY.GHmmK0"$**\CQXZa;b"cL.88r>   c                   | j                   ry d| _         t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   AxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTrb   rd   re   rf   )builtgetattrr*   
name_scoperb   r'   buildrj   rd   re   rf   rt   rL   s     r<   r   zTFXGLMAttention.build*  s   ::
44(4t{{//0 @!!4t~~">?@44(4t{{//0 @!!4t~~">?@44(4t{{//0 @!!4t~~">?@4T*6t}}112 B##T4$@AB B 7@ @@ @@ @B Bs0   )F32)G )G )G3F= G	GG!)rQ   FT)
rj   intrk   r   rn   floatrr   boolru   r   )r{   	tf.Tensorr|   r   rU   r   )NNNNF)r   r   r   tf.Tensor | Noner   zTuple[Tuple[tf.Tensor]] | Noner   r   r   r   r   Optional[bool]returnz"Tuple[tf.Tensor, tf.Tensor | None]N)	__name__
__module____qualname____doc__ri   r}   r   r   __classcell__rw   s   @r<   r`   r`      s    @  VV V 	V
 V V8m .29=+/,0#(t9 t9 +t9 7	t9
 )t9 *t9 !t9 
,t9lBr>   r`   c                  b     e Zd Zd fdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFXGLMDecoderLayerc                   t        |   di | |j                  | _        t	        | j                  |j
                  |j                  dd      | _        t        j                  j                  |j                        | _        t        |j                        | _        t        j                  j                  |j                        | _        |j                   rYt	        | j                  |j
                  |j                  dd      | _        t        j                  j%                  dd      | _        t        j                  j%                  dd      | _        t        j                  j+                  |j,                  d	
      | _        t        j                  j+                  | j                  d
      | _        t        j                  j%                  dd      | _        || _        y )NT	self_attn)rj   rk   rn   rr   r'   encoder_attnh㈵>encoder_attn_layer_normepsilonr'   self_attn_layer_normfc1r&   fc2final_layer_normrg   )rh   ri   d_modelrj   r`   attention_headsattention_dropoutr   r   rl   rm   rn   r	   activation_functionactivation_fnactivation_dropoutadd_cross_attentionr   LayerNormalizationr   r   rs   ffn_dimr   r   r   config)rt   r   rv   rw   s      r<   ri   zTFXGLMDecoderLayer.__init__=  sp   "6"(nn,,,,
 ||++FNN;.v/I/IJ"',,"6"6v7P7P"Q%% /.. 0000#!D ,1<<+J+J#< ,K ,D( %*LL$C$CDWm$C$n!<<%%fnn5%A<<%%dnn5%A % ? ?Se ? fr>   c	                8   |}	| j                  |      }||dd nd}
| j                  ||
||      \  }}}| j                  ||      }|	|z   }d}d}|S|}	| j                  |      }||dd nd}| j	                  |||||      \  }}}| j                  ||      }|	|z   }||z   }|}	| j                  |      }| j                  | j                  |            }| j                  ||      }| j                  |      }| j                  ||      }|	|z   }||||fS )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            encoder_hidden_states (`tf.Tensor`):
                cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
            encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(decoder_attention_heads,)*
            cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
                *(decoder_attention_heads,)*
            past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
        Nr   )r   r   r   r   r   )r   r   r   r   r   )
r   r   rn   r   r   r   r   r   r   r   )rt   r   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   r   residualself_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_values                   r<   r   zTFXGLMDecoderLayer.call^  s   4 !11-@ :H9S>"1#5Y] >Bnn'3)+	 ?M ?
;(*; ]XF =0 (,$! ,$H 88GM @N?Yrs(;_c%NRN_N_+!65 :8 O` OKM-/K !LLLJM$}4M !24P P !--m<**488M+BC///Q/]XF =0 	
 	
r>   c                b   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   sxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   [xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)	NTr   r   r   r   r   r   r   )r   r   r*   r   r   r'   r   r   rj   r   r   r   r   r   r   r   r   s     r<   r   zTFXGLMDecoderLayer.build  s   ::
4d+7t~~223 +$$T*+4/6Bt88==> N))//tT^^0LMN4%1txx}}- =dDNN;<=4%1txx}}- BdDKK,?,?@AB4+T2>t4499: J%%++T4,HIJ4.:t00556 .!!''-.42D9Et;;@@A Q,,22D$3OPQ Q F#+ +N N= =B BJ J. .Q QsT   K%)K%)K233K?$)LL%)L%K"%K/2K<?L	LL"%L.)r   r   rv   r   r   None)NNNNNNF)r   r   r   r   r   r   r   r   r   r   r   r   r   zTuple[tf.Tensor] | Noner   r   r   z4Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]r   )r   r   r   ri   r   r   r   r   s   @r<   r   r   <  s    H ,02637,07;26#(N
 N
 )N
  0	N

 !1N
 *N
 %5N
 0N
 !N
 
>N
`Qr>   r   c                       e Zd ZeZ	 d	 	 	 	 	 	 	 d	 fdZd
dZddZ	 	 	 	 	 	 	 	 ddZdddZ	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       ZddZ xZS )TFXGLMMainLayerc                   t        |   |i | || _        |j                  | _        |j
                  | _        |j                  rt        j                  |j                        nd| _        ||| _        n2t        |j                  |j                  | j                  d      | _        d| _        t!        |j
                  | j                  z   |j                  |j                        | _        t$        j&                  j)                  |j*                        | _        t-        |j.                        D cg c]  }t1        |d|        c}| _        |j2                  | _        t$        j&                  j5                  dd	      | _        y c c}w )
NrZ   embed_tokensr&   r   )r6   r7   r8   zlayers.r   
layer_normr   )rh   ri   r   pad_token_idr8   max_position_embeddingsmax_target_positionsscale_embeddingr(   sqrtr   embed_scaler   r   
vocab_sizeoffsetr=   _embed_positions_weightsr   rl   rm   rn   r,   
num_layersr   	layerdropr   r   )rt   r   r   inputsrv   irw   s         r<   ri   zTFXGLMMainLayer.__init__  s<    	&+F+!..$*$B$B!8>8N8N499V^^4TW# ,D 2!!6>>43C3C.!D (C 884;;F ..++)
% ||++FNN;OTU[UfUfOgh!)&}Eh)),,99$\9Z is   2F	c                    | j                   S r   r   rt   s    r<   get_input_embeddingsz$TFXGLMMainLayer.get_input_embeddings  s       r>   c                    || _         y r   r   )rt   values     r<   set_input_embeddingsz$TFXGLMMainLayer.set_input_embeddings  s
    !r>   c                    t        ||      t        j                  |d   dkD  fdfd      |S t        ||d         }|z   S )Nr$   r   c                      S r   rg   combined_attention_masks   r<   <lambda>zATFXGLMMainLayer._prepare_decoder_attention_mask.<locals>.<lambda>  s    )@ r>   c                 .    t        j                         S r   )r*   	ones_liker   s   r<   r   zATFXGLMMainLayer._prepare_decoder_attention_mask.<locals>.<lambda>  s    ",,WnJo r>   rV   )rX   r*   condr^   )rt   r   rL   rE   expand_attention_maskr   s        @r<   _prepare_decoder_attention_maskz/TFXGLMMainLayer._prepare_decoder_attention_mask  s`     #4KAW"X"$''Oa!@Bo#
 !** ,^[QS_ U$'>>>r>   c                h    || j                   z  }t        j                  | j                  |d      }|S )Nr   r"   )r   r*   gatherr   )rt   rN   	positionss      r<   r%   zTFXGLMMainLayer.embed_positions  s/    #IId;;\PQR	r>   c                   ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
||n| j                   j                  }||	t        d      |1t        j                  |      }t        j                  |d|d   f      }n&|	t        j                  |	      d d }nt        d      ||d   d   j                  d   nd}|1t        j                  t        j                  ||d   |z         d      }t        j                  |dt        |      d   g      }|	>t        || j                  j                         | j                  |      | j                  z  }	| j!                  |||      }||t#        ||d         }| j%                  |      }t        j&                  |	t        j(                        |z   }| j+                  ||	      }|rd
nd }|rd
nd }|r|d
nd }|
rd
nd }d|fd|ffD ]r  \  }}|	t        j,                  j/                  t        |      d   t1        | j2                        d| dt1        | j2                         dt        |      d    d       t t5        | j2                        D ]z  \  }}|r||fz  }t7        j8                  dd      }|r|| j:                  k  r6|||   nd } ||||||||   nd |||   nd |      \  }}}} |
r|| fz  }|sl||fz  }|u||fz  }| | j=                  |      }|r||fz  }|
r|nd }!|st?        d ||!|||fD              S tA        ||!|||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer$   z5You have to specify either input_ids or inputs_embedsr   r   r"   r   r    r   rg   	head_maskcross_attn_head_maskzThe z should be specified for z layers, but it is for .r   r   )r   r   r   r   r   r   c              3  $   K   | ]  }|| 
 y wr   rg   ).0vs     r<   	<genexpr>z'TFXGLMMainLayer.call.<locals>.<genexpr>}  s      = s   )last_hidden_statepast_key_valuesr   
attentionscross_attentions)!r   output_attentionsoutput_hidden_states	use_cacheuse_return_dictrp   r*   shaper/   r.   r,   r   r   r   r   r   r   r^   r%   rA   r-   rn   r   r   lenrl   	enumeraterandomuniformr   r   tupler   )"rt   rD   r   rN   r   r   r   r   r   rK   r  r  r  return_dictr   rv   rL   rE   r   r   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cacheattn_mask_name	attn_maskidxdecoder_layerdropout_probabilityr   layer_self_attnlayer_cross_attnr   
next_caches"                                     r<   r   zTFXGLMMainLayer.call  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>cdd"((9-K

9r;r?.CDI&((=1#26KTUUCRC^!3A!6!<!<Q!?de>>/RCY1YZabL zz,Z5Mb5Q0RS *9d6G6G6R6RS --i84;K;KKM==nk[qr !,1G1S%12HR]^`Ra%b" ((6	RZZ@9L]XF #7BD0d&7<Q<]rdh#,R$ ,7	*BE[]qDr)s 		%NI$))y)!,$~..GDKKHXGY Z&y1!45Q8	 * 		 #,DKK"8 	@C#!m%55!"(..A"604>>A5D5P_S1VZNR_-&;'=3<3H3dI]Ii,@,Eos-SOM?,<>O "'8&::" ?"44(4(-=,??(9	@< 6  -!11+4'$
 '5FXlm  
 ;+&+%1
 	
r>   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   xY w# 1 sw Y   nxY w# 1 sw Y   axY w)NTr   r   rl   )r   r   r*   r   r   r'   r   r   r   r   rl   )rt   rL   layers      r<   r   zTFXGLMMainLayer.build  s!   ::
4t,8t334 I%%tT4;;3F3F&GHI4.:t00556 .!!''-.44(4 &]]5::. &KK%& && 5I I. .& &s$   3D9<EE9EEE	r   )r   r   r   Optional[TFSharedEmbeddings]rv   r   r   r   )r   r   )r   r   r   r   )r   r   rL   tf.TensorShaperE   r   r   r   )rN   np.ndarray | tf.Tensor | Noner   r   NNNNNNNNNNNNNF rD   TFModelInputType | Noner   r  rN   r  r   r  r   r  r   r  r   r  r   4Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]]rK   r  r  r   r  r   r  r   r  r   r   r   rv   r   r   zDUnion[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]])r   r   r   r   config_classri   r   r   r   r%   r   r   r   r   r   s   @r<   r   r     sk   L PT[ [0L[hk[	[:!"?(? $? !$	?
 
?"
  .28<6:?C@D37>BPT7;$(,0/3&*#(A
*A
 6A
 4	A

  =A
 !>A
 1A
 <A
 NA
 5A
 "A
 *A
 -A
 $A
 !A
  !A
" 
N#A
 A
F&r>   r   c                      e Zd ZeZdZy)TFXGLMPreTrainedModelmodelN)r   r   r   r   r!  base_model_prefixrg   r>   r<   r#  r#    s    Lr>   r#  au	  
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`XGLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of
            the decoder.
        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
            Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
            selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`tf.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`tf.Tensor` of shape `(num_layers, attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.num_layers`)
            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
z^The bare XGLM Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZdZ	 d	 	 	 	 	 	 	 	 	 d fdZe ee       ee	e
e      	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	d                     ZddZ xZS )
TFXGLMModelz
    Transformer decoder consisting of *config.num_layers* layers. Each layer is a [`TFXGLMDecoderLayer`]

    Args:
        config: XGLMConfig
        embed_tokens: [TFSharedEmbeddings]: output embedding
    c                R    t        |   |g|i | t        ||d      | _        y )Nr$  r   r'   )rh   ri   r   r$  rt   r   r   r   rv   rw   s        r<   ri   zTFXGLMModel.__init__  s,     	3&3F3$V,WU
r>   
checkpointoutput_typer!  c                B    | j                  ||||||||	|
||||      }|S )N)rD   r   r   r   r   r   r   rK   r  r  r  r  r   )r$  )rt   rD   r   rN   r   r   r   r   r   rK   r  r  r  r  r   rv   outputss                    r<   r   zTFXGLMModel.call%  sE    2 **)"7#9!5+'/!5#  
  r>   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr$  )r   r   r*   r   r$  r'   r   r   s     r<   r   zTFXGLMModel.buildP  sg    ::
4$'3tzz/ '

  &' ' 4' 's   A11A:r   
r   r   r   r  r   r   rv   r   r   r   r  r  )r   r   r   r   ri   r   r   XGLM_INPUTS_DOCSTRINGr
   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r   r   r   s   @r<   r'  r'    sM   
 PTV V0LV^aVmpV	V *+@A&?$ .28<6:?C@D37>BPT7;$(,0/3&*#("*" 6" 4	"
  =" !>" 1" <" N" 5" "" *" -" $" !"  !"" 
N#" B "H'r>   r'  z
    The XGLM Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                  *    e Zd ZdZddgZdgZ	 d	 	 	 	 	 	 	 	 	 d fdZd Zd ZddZ	e
 ee       eee       eeee	      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
                            ZddZd Z xZS )TFXGLMForCausalLMr$  zmodel.embed_positions.weightslm_head.weightc                    t        |   |g|i | t        ||d      | _        t        j
                  j                  |j                  dt        |j                        d      | _
        || _        y )Nr$  r)  Flm_head)rc   kernel_initializerr'   )rh   ri   r   r$  r   rl   rs   r   r   init_stdr9  r   r*  s        r<   ri   zTFXGLMForCausalLM.__init__j  sk     	3&3F3$V,WU
||)).v?	 * 
 r>   c                    | j                   S r   r9  r   s    r<   get_output_embeddingsz'TFXGLMForCausalLM.get_output_embeddingsx  s    ||r>   c                    || _         y r   r=  )rt   new_embeddingss     r<   set_output_embeddingsz'TFXGLMForCausalLM.set_output_embeddings{  s	    %r>   c                "   |rt        j                  |d d df   d      }|j                  dd       }|j                  dd       }|C|At         j                  j	                  |dd      }|rt        j                  |d d df   d      }|||||dS )Nr$   rN   r   T)r#   	exclusive)rD   r   rN   r   r  )r*   r.   getr(   rB   )rt   r   r   r  rv   rN   r   s          r<   prepare_inputs_for_generationz/TFXGLMForCausalLM.prepare_inputs_for_generation~  s    ^^F1b5M26Fzz.$7$4d;%,*>77>>.rT>RL!~~l1b5.A2F  ,(."
 	
r>   )r-  r!  r+  c                "   | j                  |||||||||	|||||      }|d   }| j                  |      }d}|
t        j                  |
ddddf   t        j                  |
j
                  d   dft        j                  | j                  j                  |
j                              gd      }
| j                  |
|      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        labels (`np.ndarray` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        )rD   r   rN   r   r   r   r   r   rK   r  r  r  r  r   r   Nr   r$   r"   )losslogitsr   r   r   r   )r$  r9  r*   r0   fillr  rA   r   r   r!   hf_compute_lossr   r   r   r   r   )rt   rD   r   rN   r   r   r   r   r   rK   labelsr  r  r  r  r   rv   r/  r   	lm_logitsrG  outputs                         r<   r   zTFXGLMForCausalLM.call  s4   D **)%"7#9!5+'/!5#  
   
LL/	YY12a!(<bggdkkF^F^`f`l`l>m noF ''	:D\GABK/F)-)9TGf$EvE2#33!//))$55
 	
r>   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr$  r9  )
r   r   r*   r   r$  r'   r   r9  r   hidden_sizer   s     r<   r   zTFXGLMForCausalLM.build  s    ::
4$'3tzz/ '

  &'4D)5t||001 J""D$0G0G#HIJ J 6' 'J Js   C"%3C."C+.C7c                    |dk(  r|dfS |fS )Nr7  zmodel.embed_tokens.weightrg   )rt   	tf_weights     r<   tf_to_pt_weight_renamez(TFXGLMForCausalLM.tf_to_pt_weight_rename  s    ((999<r>   r   r1  )NN)NNNNNNNNNNNNNNF)"rD   r  r   r  rN   r  r   r  r   r  r   r  r   r  r   r   rK   r  rK  r  r  r   r  r   r  r   r  r   r   r   rv   r   r   z<Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]])r   r   r   r%  _keys_to_ignore_on_load_missing_keys_to_ignore_on_saveri   r>  rA  rE  r   r   r2  r   r   r4  r
   r3  r   r   rR  r   r   s   @r<   r6  r6  Y  s     ('#
 	)
 PT 0L^amp	&
* *+@A+N]lm&7$ .28<6:?C@D37>BPT7;04$(,0/3&*#(!A
*A
 6A
 4	A

  =A
 !>A
 1A
 <A
 NA
 5A
 .A
 "A
 *A
 -A
 $A
  !!A
" #A
$ 
F%A
 n B A
F	J r>   r6  )r6   r   r7   r   r8   Optional[int]r   r   )rD   r   rE   r   r8   rU  r   r   )rK   r   rE   r   r8   rU  r   r   )r   )rT   r  rE   r   r   )rF   r   rV   rU  )>r   
__future__r   r(   r  typingr   r   r   r   numpynp
tensorflowr*   activations_tfr	   
file_utilsr
   r   r   r   modeling_tf_outputsr   r   modeling_tf_utilsr   r   r   r   r   r   r   r   tf_utilsr   r   r   utilsr   configuration_xglmr   
get_loggerr   loggerr3  r4  rR   r=   rH   rO   rX   r^   rl   Layerr`   r   r   r#  XGLM_START_DOCSTRINGr2  r'  r6  rg   r>   r<   <module>rf     s    "   . .   /  t	 	 	 S R  * 
		H	%*  4.
F
F25
FDQ
F
Fgg69gHUgg$;$
6gBell(( gBTJQ++ JQZ M&ell(( M& M&` -  
' RF R dA'' A'	A'H  M -/K M M r>   