
    sg                   x   d Z ddlmZ ddlZddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e#jN                  e(      Z)dZ*dZ+dZ,d8dZ-d9d:dZ.d;d<dZ/ G d dej`                  jb                        Z2 G d dej`                  jf                        Z4 G d dej`                  jf                        Z5 G d dej`                  jf                        Z6 G d dej`                  jf                        Z7 G d dej`                  jf                        Z8 G d  d!e      Z9e G d" d#e             Z:e G d$ d%e             Z;e G d& d'e             Z<d(Z=d)Z>e G d* d+ej`                  jf                               Z?e G d, d-ej`                  jf                               Z@e G d. d/ej`                  jf                               ZA e!d0e=       G d1 d2e9             ZB G d3 d4ej`                  jf                        ZC e!d5e=       G d6 d7e9             ZDy)=zTF 2.0 LED model.    )annotationsN)	dataclass)ListOptionalTupleUnion   )get_tf_activation)+TFBaseModelOutputWithPastAndCrossAttentions)TFModelInputTypeTFPreTrainedModelget_initializerkeraskeras_serializableunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )	LEDConfigzallenai/led-base-16384r   g    חc           
        t        j                  || j                        }t        j                  || j                        }t        j                  t	        |       d   dft        j
                  || j                              }t        j                  || d d d df   gd      }t        j                  |dk(  t        j                  t	        |      t        j
                  || j                              |      }t         j                  j                  |t        j                  d| j                              }t        j                  |g      5  t        j                  |      }d d d        |S # 1 sw Y   |S xY w)Nr   r   idtype)tfcastr    fillr   convert_to_tensorconcatwhere	debuggingassert_greater_equalconstantcontrol_dependenciesidentity)	input_idspad_token_iddecoder_start_token_idstart_tokensshifted_input_idsassert_gte0s         Z/var/www/html/venv/lib/python3.12/site-packages/transformers/models/led/modeling_tf_led.pyshift_tokens_rightr3   <   s8   77<9LWW%;Y__M77	I	q	!1%r';';<RT]TcTc'dL 		<1crc61B"CRHT!

,-r/C/CLR[RaRa/bc ,,334Er{{ST\e\k\kGlmK 
	 	 +	/ ;KK(9:; ; s   E..E8c           	        | d   }| d   }t        j                  ||f      t        z  }t        j                  t	        |      d         }t        j
                  |t        j                  |dz   t	        |      d   df      k  d|      }|dkD  r.t        j                  t        j                  ||f      |gd      }t        j                  |ddddddf   |dddf      S )zB
    Make causal mask used for bi-directional self-attention.
    r   r   r           axisN)
r!   onesLARGE_NEGATIVEranger   r&   reshaper%   zerostile)input_ids_shapepast_key_values_lengthbsztgt_lenmask	mask_conds         r2   _make_causal_maskrD   U   s     !
Ca G77GW%&7DD)"-.I88I

9q=:d;KB;OQR:S TTVY[_`D!yy"((G-C#DEtLSUV774dAq()CAq>::    c                    t        |       d   }||n|}t        j                  d      }t        j                  | |j                        } t        j
                  | ddddddf   dd|df      }||z
  t        z  S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    r   Ng      ?r   )r   r!   r)   r"   r    r=   r9   )rB   rA   src_lenone_cstexpanded_masks        r2   _expand_maskrJ   g   sx     q!G ,g'Gkk#G774w}}-DGGDD$!12Q7A4FGMm#~55rE   c                  2     e Zd ZdZd fdZdd fdZ xZS )TFLEDLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    c                (    t        |   ||fi | y N)super__init__)selfnum_embeddingsembedding_dimkwargs	__class__s       r2   rP   z(TFLEDLearnedPositionalEmbedding.__init__y   s    A&ArE   c                    |d   }t        j                  |dd      }||z  }t        |   t        j                  |t         j
                              S )z/Input is expected to be of size [bsz x seqlen].r   r:   )deltanamer   )r!   r:   rO   callr"   int32)rQ   input_shaper?   seq_lenposition_idsrU   s        r2   rY   z$TFLEDLearnedPositionalEmbedding.call|   sG    a.xxqw?..w|BGGLABBrE   )rR   intrS   r^   r   )r[   tf.TensorShaper?   r^   __name__
__module____qualname____doc__rP   rY   __classcell__rU   s   @r2   rL   rL   t   s    BC CrE   rL   c                       e Zd Z fdZddZ	 ddZd Zed        Zd Z	ed        Z
ed        Zed	        Zed
        Zd Zd Zd Zd Z xZS )TFLEDEncoderSelfAttentionc                   t        |   di | || _        |j                  |j                  z  dk7  r%t        d|j                   d|j                         |j                  | _        t        |j                  |j                  z        | _        |j                  | _	        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j                  | j                  t        |j                        d      | _        t        j                  j                  | j                  t        |j                        d	      | _        t        j                  j                  | j                  t        |j                        d
      | _        t        j                  j+                  |j,                        | _        t        j                  j+                  |j,                        | _        || _        |j4                  | j2                     }|dz  dk(  sJ d| j2                   d|        |dkD  sJ d| j2                   d|        |dz  | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads (query)kernel_initializerrX   keyvaluequery_global
key_globalvalue_global   z`attention_window` for layer z  has to be an even value. Given z has to be positive. Given  )rO   rP   confighidden_sizenum_attention_heads
ValueError	num_headsr^   head_dim	embed_dimr   layersDenser   initializer_rangerk   rm   rn   ro   rp   rq   Dropoutattention_probs_dropout_probdropoutglobal_dropoutlayer_idattention_windowone_sided_attn_window_size)rQ   rt   r   rT   r   rU   s        r2   rP   z"TFLEDEncoderSelfAttention.__init__   s   "6" : ::a?#F$6$6#7 8 4457 
  33F..1K1KKL++\\''NN.v/G/GH ( 


 <<%%NN.v/G/GH & 

 \\''NN.v/G/GH ( 

 "LL..NN.v/G/GH / 

  ,,,,NN.v/G/GH - 

 "LL..NN.v/G/GH / 

 ||++F,O,OP#ll2263V3VW !224==A q A%	m*4==/9YZjYkl	m% q 	h*4==/9TUeTfg	h  +;a*?'rE   c                L   | j                   st        j                  d      5  | j                  j	                  | j
                  j                  f       d d d        t        j                  d      5  | j                  j	                  | j
                  j                  f       d d d        t        j                  d      5  | j                  j	                  | j
                  j                  f       d d d        | j                   ry d| _         t        | dd       dt        j                  | j                  j                        5  | j                  j	                  d d | j
                  j                  g       d d d        t        | dd       dt        j                  | j                  j                        5  | j                  j	                  d d | j
                  j                  g       d d d        t        | dd       dt        j                  | j                  j                        5  | j                  j	                  d d | j
                  j                  g       d d d        t        | dd       dt        j                  | j                  j                        5  | j                  j	                  d d | j
                  j                  g       d d d        t        | dd       dt        j                  | j                  j                        5  | j                  j	                  d d | j
                  j                  g       d d d        t        | dd       et        j                  | j                  j                        5  | j                  j	                  d d | j
                  j                  g       d d d        y y # 1 sw Y   bxY w# 1 sw Y   !xY w# 1 sw Y   xY w# 1 sw Y   hxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   <xY w# 1 sw Y   xY w# 1 sw Y   y xY w)Nro   rp   rq   Trk   rm   rn   )builtr!   
name_scopero   buildrt   ru   rp   rq   getattrrk   rX   rm   rn   rQ   r[   s     r2   r   zTFLEDEncoderSelfAttention.build   s=   zz~. D!!'')@)@(BCD|, B%%t{{'>'>&@AB~. D!!'')@)@(BCD ::
4$'3tzz/ H

  $dkk.E.E!FGH4%1txx}}- FdDKK,C,CDEF4$'3tzz/ H

  $dkk.E.E!FGH4.:t00556 O!!''tT[[5L5L(MNO4t,8t334 M%%tT4;;3J3J&KLM4.:t00556 O!!''tT[[5L5L(MNO O ;3D DB BD DH HF FH HO OM MO Osk   1N301O >1O3O23O'#3O43P3P63P3N= O
OO$'O14O>PPP#c                
   |\  }}}}}}| j                  |      }	| j                  |      }
| j                  |      }t        |      \  }}}t        j
                  j                  || j                  d| j                   d|        |	t        j                  j                  t	        j                  | j                  |	j                              z  }	t	        j                  |	||| j                  | j                  f      }	t	        j                  |
||| j                  | j                  f      }
| j                  |	|
| j                         }|dk7  }t	        j                  ||	j                        t"        z  }| j                  t	        j$                  t        |            || j                         }||z  }t        j
                  j                  t        |      ||| j                  | j                   dz  dz   gd| d	| d	| j                   d	| j                   dz  dz    d
t        |       
       | j'                  |      \  }}}}|r| j)                  ||	|
||||      }t+        |d      }|rCt	        j,                  |ddddddf   dd| j                  | j                   dz  |z   dz   f      }n?t	        j,                  |ddddddf   dd| j                  | j                   dz  dz   f      }t	        j.                  |t	        j0                  t        |      |j                        |      }|ht        j
                  j                  t        |      | j                  gd| j                   dt        |              t	        j                  |d      |z  }| j3                  ||      }t	        j                  |||| j                  | j                  f      }|r| j5                  |||||      }n| j7                  ||| j                         }t        j
                  j                  t        |      ||| j                  | j                  gd       t	        j                  ||||f      }|r| j9                  |||||||||	      \  }}n#t	        j0                  || j                  ||f      }|rCt	        j,                  |ddddddf   dd| j                  | j                   dz  |z   dz   f      }n?t	        j,                  |ddddddf   dd| j                  | j                   dz  dz   f      }t	        j.                  |t	        j0                  t        |      |j                        |      }|||f}|S )a  
        LongformerSelfAttention expects *len(hidden_states)* to be multiple of *attention_window*. Padding to
        *attention_window* happens in LongformerModel.forward to avoid redoing the padding on each layer.

        The *attention_mask* is changed in [`LongformerModel.forward`] from 0, 1, 2 to:

            - -10000: no attention
            - 0: local attention
            - +10000: global attention
        z&hidden_states should have embed_dim = z
, but has messager   r   rr   r   zattn_probs should be of size (z, z), but is of size )attn_scoresquery_vectorskey_vectorsmax_num_global_attn_indicesis_index_global_attn_nonzero"is_local_index_global_attn_nonzero%is_local_index_no_global_attn_nonzeror   r6   N/Head mask for a single layer should be of size 	, but is )r   r   r   r   training)value_vectors
attn_probsr   r   r   zUnexpected size)	attn_outputhidden_statesr   layer_head_maskr   r   r   is_index_maskedr   )rk   rm   rn   r   r!   r'   assert_equalrz   mathsqrtr"   ry   r    r;   rx    _sliding_chunks_query_key_matmulr   r9   r8   _get_global_attn_indices"_concat_with_global_key_attn_probsr   r=   r&   r<   r   (_compute_attn_output_with_global_indices'_sliding_chunks_matmul_attn_probs_value'_compute_global_attn_output_from_hidden)rQ   inputsr   r   attention_maskr   r   is_index_global_attnis_global_attnr   r   r   
batch_sizer\   rz   r   #remove_from_windowed_attention_mask
float_maskdiagonal_maskr   r   r   r   r   masked_indexr   global_attn_probsmasked_global_attn_indexoutputss                                r2   rY   zTFLEDEncoderSelfAttention.call   s   . 	
  

=1hh}-

=1)3M)B&
GY
!!NN<T^^<LJW`Vab 	" 	
 	bggdmm=CVCV&WXX

=:wX\XeXe2fgjjz7DNNTXTaTa.bc ;;;(G(G

 /=.A+WW@H[H[\_mm
 ==GGJ~./++
 	}$
!!{#$..$2Q2QTU2UXY2YZ0Bwir$..IY Z33a7!;<<NzZeOfNgi	 	" 	
 ))*>?	
'(.1 AA'+',G-I3U6[ B K $Kb9
 771dD 01At~~t'F'F'JMh'hkl'lmL
 771dD 01At~~t'F'F'JQ'NOL XXHHZ-Z5E5EF

 &LL%%?+ Et~~EW X"?346	 &  O]CjPJ \\*x\@


=:wX\XeXe2fg GG+%,G-I3U H K FFM4+J+JK 	!!{#j'4>>4==%Yct 	" 	
 jjz7I.NO -1-Y-Y'+,G /3U-I6[ /! .Z 
.*K* !#*dnnFacj)k l ')ww$Q4%56At~~t'F'F'JMh'hkl'lm($
 (*ww$Q4%56At~~t'F'F'JQ'NO($ XX$HHZ 89AQAQR

 
,=>rE   c           
        t        |      \  }}}}t        j                  j                  ||dz  z  dd|dz   d|        t        j                  j                  t        |      t        |      dt        |       dt        |              ||z  dz
  }t        j                  t        j
                  |d	      ||z  ||f      }t        j                  t        j
                  |d	      ||z  ||f      }| j                  ||      }	| j                  ||      }
t        j                  |	|
j                  
      }	t        j                  d|	|
      }t        j                  ddgddgddgddgg      }| j                  ||      }t        j                  |ddddd|d|dz   f   |dddd|dd|dz   f   gd      }t        j                  t        j                  ||z  d||f|j                  
      |dddd|dz    d|dz   df   gd      }t        j                  t        j                  |d|gddg      ddddd|d|f   t        j                  ||z  d||f|j                  
      gd      }t        j                  t        j                   |dz   t        j"                  
      dddddf   ||z  d||f      dk  }t        j$                  |||      }t        j                  ||gd      }t        j
                  t        j                  ||||d|z  dz   f      d	      }| j'                  ||      }|S )a  
        Matrix multiplication of query and key tensors using with a sliding window attention pattern. This
        implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer) with an
        overlap of size window_overlap
        rr   r   z&Sequence length should be multiple of z. Given r   z7Shape of query and key should be equal, but got query: z
 and key: r   r   rr   r   r	   r   zbcxd,bcyd->bcxyNr   r6   r	   )shiftr7   )r   r!   r'   r   r;   	transpose_chunkr"   r    einsumr$    _pad_and_transpose_last_two_dimsr%   r<   rollr=   r:   int64r&   _mask_invalid_locations)rQ   rk   rm   window_overlapr   r\   rx   ry   chunks_countchunked_querychunked_keychunked_attention_scorespaddings!diagonal_chunked_attention_scoresdiagonal_attn_scores_up_triangdiagonal_attn_scores_low_triang diagonal_attn_scores_first_chunkfirst_chunk_maskdiagonal_attention_scoress                      r2   r   z:TFLEDEncoderSelfAttention._sliding_chunks_query_key_matmul  s    4>e3D0
GY
!!~)*<^a=O<PPXY`Xab 	" 	

 	!!usOI*UZJ[I\ ]sO$&	 	" 	
 .014 

LL-)#Wh7
 jjc<8:	;QSZ\d:efE>:kk#~6 [5F5FG#%99->{#[  ''!Q!Q!Q!Q(HI,0,Q,QRjlt,u) *,1!QI]>\]K]I]2]^1!RS./K_^^_M_K_2_` *
& +-)))+QO;AA 2!Q.1:L8MPR8RTbefTfTh2hi 	+
' ,.995n-Q Q.8	:
 )+QO;AA
 ,
( GG):4D$;NOi'NNK 	 	 +-((,++
' %'II,.LMTV%
!
 %'LLJJ)Y^1Ca1GH %
! %)$@$@AZ\j$k!((rE   c           	        t        j                  t         j                  j                  t        j                  ||dz   f      dd      dg      }t        j
                  dt        |       d   |z
  gdt        |       d   |z
  dz
  gg      }t        j                  ||      }|t        j                  |ddg      z   }t        j                  |d d d d d d f   t        |       d   dddf      }t        d       t        j                  |       z  }t        j                  t         j                  j                  |d      ||       } | S )Nr   shaper   r   r6   r	   inf)r!   reverselinalg	band_partr8   r$   r   padr=   float	ones_liker&   r   greater)input_tensorr   mask_2d_upperpaddingmask_2dmask_4d
inf_tensors          r2   r   z1TFLEDEncoderSelfAttention._mask_invalid_locations  s4    

II~~PQ?Q.R SUWYZ[
 &&L)!,~=>J|D\]^D_bpDpstDt@uv

 &&0 BJJwaV<< '''$4"23j6Nq6QSTVWYZ5[\ El]R\\,%??
 xx ;ZVrE   c           	        t        |      \  }}}}t        j                  j                  ||dz  z  dd       t        j                  j                  t        |      dd t        |      dd d       t        j                  j                  t        |      d   d|z  dz   d	       ||z  dz
  }t        j                  t        j
                  |d
      ||z  ||z  |d|z  dz   f      }	t        j                  t        j
                  |d
      ||z  ||f      }t        j                  ddg||gddgg      }
t        j                  ||
d      }d|z  |z  }t        |      d   |z  |z
  |z  }t        j                  j                  t        j                  |||z  df      ||      }t        j                  |||z  |dz   d|z  |f      }t        j                  j                  t        |      ||z  |dz   d|z  |gd       | j                  |	      }	t        j                  d|	|      }t        j
                  t        j                  |||||f      d
      }|S )z
        Same as _sliding_chunks_query_key_matmul but for attn_probs and value tensors. Returned tensor will be of the
        same shape as `attn_probs`
        rr   r   z0Seq_len has to be multiple of 2 * window_overlapr   Nr	   z:value and attn_probs must have same dims (except head_dim)r   z4attn_probs last dim has to be 2 * window_overlap + 1r   r   constant_valuesz!Chunked value has the wrong shapezbcwd,bcdh->bcwh)r   r!   r'   r   r;   r   r$   r   signalframe_pad_and_diagonalizer   )rQ   r   rn   r   r   r\   rx   ry   r   chunked_attn_probsr   padded_value
frame_sizeframe_hop_sizechunked_valuecontexts                   r2   r   zATFLEDEncoderSelfAttention._sliding_chunks_matmul_attn_probs_value2  s    4>e3D0
GY
!!~)*A7i 	" 	
 	!!z"2A&ubq!P 	" 	

 	!!z"1%"J 	" 	
 .014  ZZLL\2Y&>)N"Q&	
 

LL-)#Wh7
 ''!Q..1QTUWXSY(Z[vveXrB '(2
$\215@:MR^^		JJ|j9&<b%AB

 

)#\A%5q>7I8T

 	!!}%)#\A%5q>7I8T7 	" 	
 "667IJ))-/A=Q,,JJwY JK

 rE   c                    t        j                  | |      } t        |       \  }}}}t        j                  | ||||f      } | S )z)pads rows and then flips rows and columns)r!   r   r   r;   )hidden_states_paddedr   r   
chunk_size
seq_length
hidden_dims         r2   r   z:TFLEDEncoderSelfAttention._pad_and_transpose_last_two_dims{  sR      "vv ( 
 :DDX9Y6
J
J!zz*>ZYceo@pq##rE   c                F   t        |       \  }}}}t        j                  ddgddgddgd|dz   gg      }t        j                  | |      } t        j                  | ||df      } | ddddd| f   } t        j                  | |||||z   f      } | ddddddddf   } | S )aY  
        shift every row 1 step right, converting columns into diagonals.

        Example:

        ```python
        chunked_hidden_states: [
            0.4983,
            2.6918,
            -0.0071,
            1.0492,
            -1.8348,
            0.7672,
            0.2986,
            0.0285,
            -0.7584,
            0.4206,
            -0.0405,
            0.1599,
            2.0514,
            -1.1600,
            0.5372,
            0.2629,
        ]
        window_overlap = num_rows = 4
        ```

                     (pad & diagonalize) => [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
                       0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000 0.0000, 0.0000, -0.7584, 0.4206,
                       -0.0405, 0.1599, 0.0000 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ]
        r   r   r   N)r   r!   r$   r   r;   )chunked_hidden_statestotal_num_heads
num_chunksr   r   r   s         r2   r   z.TFLEDEncoderSelfAttention._pad_and_diagonalize  s    B CMMbBc?^Z''!Q!Q!Q!^VWEWAX(YZ "!8!
 !#

!OZ#D!
 !6q"N?""!
 !#

!j..::UV!
 !6aAssl C$$rE   c           
        t        |       \  }}}d|d|z  z  z  dz
  }||z  }d|z  }t        j                  | |||z  f      } t        j                  j	                  | ||      }t        j
                  j                  t        |      |||gd|||g dt        |       d       t        j                  |||d|z  |f      }|S )zBconvert into overlapping chunks. Chunk size = 2w, overlap size = wrr   r   z^Make sure chunking is correctly applied. `Chunked hidden states should have output  dimension z
, but got .r   )r   r!   r;   r   r   r'   r   )	r   r   r   r   r   num_output_chunksr   r   r   s	            r2   r   z TFLEDEncoderSelfAttention._chunk  s     .8-F*
J
N0B!CDqH (*4'


=:zJ?V2WX !#		z> Z
!!,-*J7->?@
:VkKlJmmnp	 	" 	
 !#

!*A,>
K!

 %$rE   c                   t         j                  j                  | d      }t        j                  |t        j                  d      j
                        }t        j                  |      }t        j                  |       }t        j                  |      t        j                  |d      k  }t        j                  |      }t        j                  t         j                  j                  |            }||||fS )z<compute global attn indices required throughout forward passr   r6   r   r   )r!   r   count_nonzeror"   r)   r    
reduce_maxr&   r:   expand_dimslogical_not)r   num_global_attn_indicesr   r   is_local_index_global_attnr   r   s          r2   r   z2TFLEDEncoderSelfAttention._get_global_attn_indices  s     #%''"7"78LST"7"U"$''*AUVI]I]"^ ')mm4K&L# (*xx0D'E$ &(XX.I%JR^^#"N
 &
"
 .0XX6P-Q* 139L9LMg9h0i- ((.1	
 	
rE   c                T   t        |      d   }t        j                  ||      }	t        j                  ||	||| j                  | j
                  f      }
t        j                  d||
      }t        j                  |d      }t        |      d   ft        t        |      dd        z   }t        j                  |      dz  }t        j                  ||j                        }t        j                  |||      }t        j                  |d      }t        j                  ||fd	
      }|S )Nr   r   zblhd,bshd->blhs)r   r	   r   rr        r   )r   rr   r	   r   r   r6   )r   r!   	gather_nd
scatter_ndrx   ry   r   r   tupler8   r"   r    tensor_scatter_nd_updater%   )rQ   r   r   r   r   r   r   r   r   global_key_vectorskey_vectors_only_globalattn_probs_from_global_key attn_probs_from_global_key_trans
mask_shaperB   s                  r2   r   z<TFLEDEncoderSelfAttention._concat_with_global_key_attn_probs  s/     ,Q/
  \\+7ST #%--.+		#
 &(YY/@-Qh%i" ,.<<8RT`+a( !FGJLu78=P
 

 wwz"X-wwt#C#I#IJ ,.+F+F,1,
( &(\\2RT`%a" ii!;[ IPRSrE   c                `   t        |      d   }|d d d d d d d |f   }t        j                  ||      }t        j                  ||||| j                  | j
                  f      }	t        j                  d||	      }
|d d d d d d |d f   }| j                  ||| j                        }|
|z   S )Nr   r   zblhs,bshd->blhd)	r   r!   r   r   rx   ry   r   r   r   )rQ   r   r   r   r   r   r   attn_probs_only_globalglobal_value_vectorsvalue_vectors_only_globalattn_output_only_globalattn_probs_without_globalattn_output_without_globals                r2   r   zBTFLEDEncoderSelfAttention._compute_attn_output_with_global_indices-  s      
+A.
 ",Aq!5Q6Q5Q,Q!R  "||M;WX %'MM. +		%
! #%)),=?UWp"q %/q!Q8S8T/T$U! &*%Q%Q%}d6U6U&
" ')CCCrE   c
           
     v	   t        |      d d \  }
}t        j                  ||      }t        j                  |||
|| j                  f      }| j                  |      }| j                  |      }| j                  |      }|t        j                  j                  t        j                  | j                  |j                              z  }| j                  ||
      }| j                  ||
      }| j                  ||
      }t        j                  ||d      }t        j                  j!                  t        |      |
| j"                  z  ||gd|
| j"                  z  ||f dt        |       d	       t        j$                  ||
| j"                  ||f      }t        j&                  |d
      }t        |      d   ft)        t        |      dd        z   }t        j*                  |      dz  }t        j                  ||j                        }t        j,                  |||      }t        j&                  |d
      }t        j.                  |d d d d d d f   dt        |      d   ddf      }t        j0                  |d|      }t        j$                  ||
| j"                  z  ||f      }t3        |d      }|t        j                  j!                  t        |      | j"                  gd| j"                   dt        |       	       t        j$                  |d      t        j$                  ||
| j"                  ||f      z  }t        j$                  ||
| j"                  z  ||f      }| j5                  ||	      }t        j                  ||      }t        j                  j!                  t        |      |
| j"                  z  || j                  gd|
| j"                  z  || j                  f dt        |       d	       t        j$                  ||
| j"                  || j                  f      }t        j                  t        j&                  |d
      |      }t        j$                  |t        |      d   df      }t        j,                  |||      }t        j$                  ||
| j"                  ||f      }||fS )Nrr   r   r   Ttranspose_bz7global_attn_scores have the wrong size. Size should be r   r   r   r   r   r   r   r   r   r6   r   r   r   r   r   r   z=global_attn_output tensor has the wrong size. Size should be )r   r!   r   r   rz   ro   rp   rq   r   r   r"   ry   r    reshape_and_transposematmulr'   r   rx   r;   r   r   r8   r   r=   r&   r   r   )rQ   r   r   r   r   r   r   r   r   r   r   r\   global_attn_hidden_states global_query_vectors_only_globalr  r  global_attn_scoresglobal_attn_scores_transr  global_attn_mask	attn_maskglobal_attn_probs_floatr   global_attn_outputnonzero_global_attn_outputs                            r2   r   zATFLEDEncoderSelfAttention._compute_global_attn_output_from_hiddenV  s    )7;
G %'LL@\$]!$&MM.%:DNNK%
! ,0+<+<=V+W(!__];#00? 	)BGGLLGGDMM)I)O)OP-
 	
( ,0+E+EFfhr+s(!778JJW#99:NPZ[  YY'GI[imn
!!)*$..(*EwO$..02MwWX Y12316	 	" 	
  ZZ)DgN
 $&<<0BL#Q  !FGJLu/05P
 

 77:.977#3;S;Y;YZ $&#>#>$1$
 
  \\*BLQ GGOAtT1,<=:N`CabcCdfgij?kl	XXi;MNZZ$..(*EwO
 #11C""M &LL%%?+ Et~~EW X"?346	 &  ')jj-&PSUS]S]'*dnnFacj)kT '# ')jj'*t~~*EGbdk)l'#
 !//0GRZ/[  YY'8:NO
!!)*$..(*Et}}U$..02Mt}}]^ _12316	 	" 	
  ZZ)DdmmT
 &(\\LL+\:.&
" &(ZZ&:;A>C&
" 1157Q
 JJ
DNN<WY`a
 ---rE   c                    t        j                  t        j                  t        j                  ||d| j                  | j                  f      d      || j                  z  d| j                  f      S )Nr   r   )r!   r;   r   rx   ry   )rQ   vectorr   s      r2   r  z/TFLEDEncoderSelfAttention.reshape_and_transpose  sZ    zzLL

6JDNNDMM#RS $..("dmm<
 	
rE   rN   F)rb   rc   rd   rP   r   rY   r   staticmethodr   r   r   r   r   r   r   r   r   r  rf   rg   s   @r2   ri   ri      s    9@vOD }~r)h  :GR $ $ 1% 1%f % %: 
 
<4l'DRG.R
rE   ri   c                  .     e Zd Z fdZddZddZ xZS )TFLEDEncoderAttentionc                    t        |   di | t        ||d      | _        t        j
                  j                  |j                  dd      | _        || _	        y )Nlongformer_self_attn)r   rX   Toutputuse_biasrX   rs   )
rO   rP   ri   r#  r   r{   r|   d_modeloutput_densert   rQ   rt   r   rT   rU   s       r2   rP   zTFLEDEncoderAttention.__init__  sO    "6"$=fx^t$u!!LL..v~~S[.\rE   c                    |\  }}}}}}| j                  ||||||g|      }	| j                  |	d   |      }
|
f|	dd  z   }|S )Nr   r   r   )r#  r(  )rQ   r   r   r   r   r   r   r   r   self_outputsattention_outputr   s               r2   rY   zTFLEDEncoderAttention.call  s     	
  00NO_Nbdrs 1 

  ,,\!_x,P#%QR(88rE   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr#  r(  )
r   r   r!   r   r#  rX   r   r(  rt   r'  r   s     r2   r   zTFLEDEncoderAttention.build  s    ::
4/6Bt88==> 6))//564.:t00556 K!!''tT[[5H5H(IJK K ;6 6K Ks   C"%3C."C+.C7r  rN   rb   rc   rd   rP   rY   r   rf   rg   s   @r2   r!  r!    s    (	KrE   r!  c                  x     e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 	 	 d fdZddZ	 	 	 	 	 d		 	 	 	 	 	 	 	 	 	 	 d
dZddZ xZS )TFLEDDecoderAttentionz6Multi-headed attention from "Attention Is All You Needc                P   t        |   di | || _        || _        t        j
                  j                  |      | _        ||z  | _        | j                  |z  | j                  k(  sJ d       | j                  dz  | _	        || _
        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        t        j
                  j                  ||d      | _        y )	Nz(embed_dim must be divisible by num_headsg      k_projr%  q_projv_projout_projrs   )rO   rP   rz   rx   r   r{   r~   r   ry   scaling
is_decoderr|   r2  r3  r4  r5  )rQ   rz   rx   r   r7  biasrT   rU   s          r2   rP   zTFLEDDecoderAttention.__init__  s     	"6"""||++G4!Y.}}y(DNN:f<ff:}}d*$ll((T(Qll((T(Qll((T(Q**9t**UrE   c           	         t        j                  t        j                  |||| j                  | j                  f      d      S )Nr   )r!   r   r;   rx   ry   )rQ   tensorr\   r@   s       r2   _shapezTFLEDDecoderAttention._shape+  s0    ||BJJvWdnndmm/\]_kllrE   c           
     	   |du}t        |      \  }}	}
| j                  |      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |      d|      }| j                  | j                  |      d|      }n|}| j                  | j	                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j	                  |      d|      }| j                  | j                  |      d|      }| j                  r||f}|| j                  z  d| j                  f}t        j                  | j                  ||	|      |      }t        j                  ||      }t        j                  ||      }t        |      d   }t        j                  ||d      }t        j                  j                  t        |      || j                  z  |	|gd	|| j                  z  |	|f d
t        |              |t        j                  j                  t        |      |d|	|gd|d|	|f d
t        |              t        j                  ||| j                  |	|f      t        j                  ||j                         z   }t        j                  ||| j                  z  |	|f      }t#        |d      }|t        j                  j                  t        |      | j                  gd| j                   d
t        |              t        j                  |d      t        j                  ||| j                  |	|f      z  }t        j                  ||| j                  z  |	|f      }| j%                  ||      }t        j                  ||      }t        j                  j                  t        |      || j                  z  |	| j                  gd|| j                  |	| j                  f d
t        |              t        j&                  t        j                  ||| j                  |	| j                  f      d      }t        j                  |||	|
f      }| j)                  |      }t        j                  ||| j                  |	|f      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   r   rr   r6   Tr  z$Attention weights should be of size r   r   z!Attention mask should be of size r   r   r  r   z `attn_output` should be of size r   )r   r3  r6  r;  r2  r4  r!   r%   r7  rx   ry   r;   r  r'   r   r"   r    r   r   r   r5  )rQ   r   key_value_statespast_key_valuer   r   r   is_cross_attentionr@   rA   rz   query_states
key_statesvalue_states
proj_shaperG   attn_weightsr   r   s                      r2   rY   zTFLEDDecoderAttention.call.  s    .T9",]";Wi {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BKJ99nQ&7%FQOL T[[%?SIJ;;t{{='A2sKL?? ),7NDNN*B>
zz$++lGS"I:VZZ
J7
zz,
;Z(+yyztL
!!|$4>>!7G46dnn8LgW^7_6` a|,-/	 	" 	
 %LL%%>*a'*7a'8R7S T">235	 &  ::lS$..'SZ4[\_a_f_fl&8&8` L ::lS4>>5I7T[4\]L%l<&LL%%?+ Et~~EW X"?346	 &  ::o}E

sDNNGWEI L ::lS4>>5I7T[4\]L\\,\B
ii
L9
!!{#4>>!7DMM:2CRVR_R_3`2a b{+,.	 	" 	
 llJJ{S$..'4==$QRT`
 jjsGY.GHmmK0"$**\CQXZa;b"cL.88rE   c                   | j                   ry d| _         t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   AxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr2  r3  r4  r5  )r   r   r!   r   r2  rX   r   rz   r3  r4  r5  r   s     r2   r   zTFLEDDecoderAttention.build  s   ::
44(4t{{//0 @!!4t~~">?@44(4t{{//0 @!!4t~~">?@44(4t{{//0 @!!4t~~">?@4T*6t}}112 B##T4$@AB B 7@ @@ @@ @B Bs0   )F32)G )G )G3F= G	GG!)r5   FT)
rz   r^   rx   r^   r   r   r7  boolr8  rF  )r:  	tf.Tensorr\   r^   r@   r^   )NNNNF)r   rG  r=  tf.Tensor | Noner>  Tuple[Tuple[tf.Tensor]] | Noner   rH  r   rH  returnz"Tuple[tf.Tensor, tf.Tensor | None]rN   )	rb   rc   rd   re   rP   r;  rY   r   rf   rg   s   @r2   r0  r0    s    @  VV V 	V
 V V0m .29=+/,0v9 v9 +v9 7	v9
 )v9 *v9 
,v9pBrE   r0  c                  J     e Zd Zd fdZ	 d	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFLEDEncoderLayerc                   t        |   d	i | |j                  | _        t	        ||d      | _        t        j                  j                  dd      | _	        t        j                  j                  |j                        | _        t        |j                        | _        t        j                  j                  |j                        | _        t        j                  j!                  |j"                  d      | _        t        j                  j!                  | j                  d      | _        t        j                  j                  dd      | _        || _        y )
N	self_attnrX   h㈵>self_attn_layer_normepsilonrX   fc1fc2final_layer_normrs   )rO   rP   r'  rz   r!  rN  r   r{   LayerNormalizationrQ  r~   r   r
   activation_functionactivation_fnactivation_dropoutr|   encoder_ffn_dimrT  rU  rV  rt   r)  s       r2   rP   zTFLEDEncoderLayer.__init__  s    "6".vxkR$)LL$C$CDWm$C$n!||++FNN;.v/I/IJ"',,"6"6v7P7P"Q<<%%f&<&<5%I<<%%dnn5%A % ? ?Se ? frE   c           
        |}| j                  ||||||g|      }	|	d   }t        j                  j                  t	        |      t	        |      dt	        |       dt	        |              | j                  ||      }||z   }| j                  |      }|}| j                  | j                  |            }| j                  ||      }| j                  |      }| j                  ||      }||z   }| j                  |      }|f|	dd z   S )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.
        r   r   z&Self attn modified the shape of query  to r   r   N)rN  r!   r'   r   r   r   rQ  rY  rT  rZ  rU  rV  )
rQ   r   r   r   r   r   r   r   residuallayer_outputss
             r2   rY   zTFLEDEncoderLayer.call  s3   $ !NO_Nbdrs ' 

 &a(
!!}%x <Z=Q<RRVWaboWpVqr 	" 	
 ]XF =011-@ **488M+BC///Q/]XF =0--m<-"333rE   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   xY w# 1 sw Y   XxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTrN  rQ  rT  rU  rV  )r   r   r!   r   rN  rX   r   rQ  rz   rT  rU  rt   r[  rV  r   s     r2   r   zTFLEDEncoderLayer.build  s   ::
4d+7t~~223 +$$T*+4/6Bt88==> N))//tT^^0LMN4%1txx}}- =dDNN;<=4%1txx}}- JdDKK,G,GHIJ4+T2>t4499: J%%++T4,HIJ J ?+ +N N= =J JJ Js<   H%)H$)H133H=$)I	H!$H.1H:=I	I)rt   r   r   r^   r  )r   rG  r   rG  r   rG  r   rG  r   rG  r   rF  rN   r.  rg   s   @r2   rL  rL    sS    * +4 +4 "+4 #	+4
 #+4 (+4 +4ZJrE   rL  c                  Z     e Zd Zd fdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFLEDDecoderLayerc                   t        |   di | |j                  | _        t	        | j                  |j
                  |j                  dd      | _        t        j                  j                  |j                        | _        t        |j                        | _        t        j                  j                  |j                        | _        t        j                  j!                  dd      | _        t	        | j                  |j
                  |j                  dd      | _        t        j                  j!                  dd	      | _        t        j                  j)                  |j*                  d
      | _        t        j                  j)                  | j                  d      | _        t        j                  j!                  dd      | _        || _        y )NrN  T)rz   rx   r   rX   r7  rP  rQ  rR  encoder_attn)r   rX   r7  encoder_attn_layer_normrT  rO  rU  rV  rs   )rO   rP   r'  rz   r0  decoder_attention_headsattention_dropoutrN  r   r{   r~   r   r
   rX  rY  rZ  rW  rQ  rd  re  r|   decoder_ffn_dimrT  rU  rV  rt   rQ   rt   rT   rU   s      r2   rP   zTFLEDDecoderLayer.__init__	  sa   "6".nn44,,
 ||++FNN;.v/I/IJ"',,"6"6v7P7P"Q$)LL$C$CDWm$C$n!1NN**,,
 (-||'F'FtZs'F't$<<%%f&<&<5%I<<%%dnn5%A % ? ?Se ? frE   c	                8   |}	||dd nd}
| j                  ||
||      \  }}}| j                  ||      }|	|z   }| j                  |      }d}d}|S|}	||dd nd}| j                  |||||      \  }}}| j                  ||      }|	|z   }| j	                  |      }||z   }|}	| j                  | j                  |            }| j                  ||      }| j                  |      }| j                  ||      }|	|z   }| j                  |      }||||fS )a  
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
            attention_mask (`tf.Tensor`): attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            encoder_hidden_states (`tf.Tensor`):
                cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
            encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
                *(config.encoder_attention_heads,)*.
            encoder_layer_head_mask (`tf.Tensor`): mask for encoder attention heads in a given layer of
                size *(config.encoder_attention_heads,)*.
            past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
        Nrr   )r   r>  r   r   r   r   )r   r=  r   r   r>  )
rN  r   rQ  rd  re  rY  rT  rZ  rU  rV  )rQ   r   r   encoder_hidden_statesencoder_attention_maskr   encoder_layer_head_maskr>  r   r^  self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_values                   r2   rY   zTFLEDDecoderLayer.call%  s   4 ! :H9S>"1#5Y] >Bnn'3)+	 ?M ?
;(*; ]XF =011-@ (,$! ,$H @N?Yrs(;_c%NRN_N_+!65 78 O` OKM-/K !LLLJM$}4M 88GM !24P P !**488M+BC///Q/]XF =0--m< 	
 	
rE   c                b   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   sxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   rxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)	NTrN  rQ  rd  re  rT  rU  rV  )r   r   r!   r   rN  rX   r   rQ  rz   rd  re  rT  rU  rt   rh  rV  r   s     r2   r   zTFLEDDecoderLayer.buildu  s   ::
4d+7t~~223 +$$T*+4/6Bt88==> N))//tT^^0LMN4.:t00556 .!!''-.42D9Et;;@@A Q,,22D$3OPQ4%1txx}}- =dDNN;<=4%1txx}}- JdDKK,G,GHIJ4+T2>t4499: J%%++T4,HIJ J ?#+ +N N. .Q Q= =J JJ JsT   K%)K%K2&)K?)L43L%)L%K"%K/2K<?L	LL"%L.rt   r   )NNNNNNF)r   rH  rk  rH  rl  rH  r   rH  rm  rH  r>  zTuple[tf.Tensor] | NonerJ  z?Tuple[tf.Tensor, tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]rN   r.  rg   s   @r2   rb  rb    sz    > ,02637,04826N
 )N
  0	N

 !1N
 *N
 "2N
 0N
 
IN
`JrE   rb  c                  0     e Zd ZeZdZe fd       Z xZS )TFLEDPreTrainedModelledc                n    t         |   }t        j                  dt        j                  d      |d<   |S )N)NNglobal_attention_maskrO  )rO   input_signaturer!   
TensorSpecrZ   )rQ   sigrU   s     r2   r{  z$TFLEDPreTrainedModel.input_signature  s/    g%')}}\288Ri'j#$
rE   )	rb   rc   rd   r   config_classbase_model_prefixpropertyr{  rf   rg   s   @r2   rw  rw    s     L rE   rw  c                  J    e Zd ZU dZdZded<   dZded<   dZded<   dZded<   y)	TFLEDEncoderBaseModelOutputaI  
    Base class for Longformer's outputs, with potential hidden states, local and global attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x +
            attention_window + 1)`, where `x` is the number of tokens with global attention mask.

            Local attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token in the sequence to every token with
            global attention (first `x` values) and to every token in the attention window (remaining `attention_window
            + 1` values). Note that the first `x` values refer to tokens with fixed positions in the text, but the
            remaining `attention_window + 1` values refer to tokens with relative positions: the attention weight of a
            token to itself is located at index `x + attention_window / 2` and the `attention_window / 2` preceding
            (succeeding) values are the attention weights to the `attention_window / 2` preceding (succeeding) tokens.
            If the attention window contains a token with global attention, the attention weight at the corresponding
            index is set to 0; the value should be accessed from the first `x` attention weights. If a token has global
            attention, the attention weights to all other tokens in `attentions` is set to 0, the values should be
            accessed from `global_attentions`.
        global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x`
            is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    NrG  last_hidden_stateTuple[tf.Tensor, ...] | Noner   
attentionsglobal_attentions)	rb   rc   rd   re   r  __annotations__r   r  r  rs   rE   r2   r  r    s7    !F $(y'26M/6/3J,36:3:rE   r  c                      e Zd ZU dZdZded<   dZded<   dZded<   dZded	<   dZ	ded
<   dZ
ded<   dZded<   dZded<   dZded<   y)TFLEDSeq2SeqModelOutputa  
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x`
            is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    NrG  r  List[tf.Tensor] | Nonepast_key_valuesr  decoder_hidden_statesdecoder_attentionscross_attentionsrH  encoder_last_hidden_staterk  encoder_attentionsencoder_global_attentions)rb   rc   rd   re   r  r  r  r  r  r  r  rk  r  r  rs   rE   r2   r  r    st    5n $(y'.2O+2:>7>7;4;592926/6:>7>7;4;>B;BrE   r  c                      e Zd ZU dZdZded<   dZded<   dZded<   dZd	ed
<   dZ	d	ed<   dZ
d	ed<   dZded<   dZd	ed<   dZd	ed<   dZd	ed<   y)TFLEDSeq2SeqLMOutputap  
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
            sequence_length, embed_size_per_head)`).

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        encoder_global_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, x)`, where `x`
            is the number of tokens with global attention mask.

            Global attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads. Those are the attention weights from every token with global attention to every token
            in the sequence.
    NrH  lossrG  logitsr  r  r  r  r  r  r  rk  r  r  )rb   rc   rd   re   r  r  r  r  r  r  r  r  rk  r  r  rs   rE   r2   r  r    s~    3j "D
!FI.2O+2:>7>7;4;592926/6:>7>7;4;>B;BrE   r  at	  
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Args:
        config ([`LEDConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`tf.Tensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`LedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            LED uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
        decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
        head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tf.Tensor`, *optional*):
            hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
            of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
            config will be used instead.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
c                       e Zd ZeZ	 dd	 fdZd Zd Ze	 	 	 	 	 	 	 	 	 d
d       Z	e
j                  d        Zd ZddZ xZS )TFLEDEncoderc           	        t        |   di | || _        t        j                  j                  |j                        | _        |j                  dkD  rt        j                  d       d| _
        |j                  | _        t        |j                  t              rO|j                  dz  dk(  sJ d       |j                  dkD  sJ d       |j                  g|j                   z  |_        nLt#        |j                        |j                   k(  s*J d|j                    dt#        |j                                |j                  | _        || _        t'        |j(                  |j*                  d	
      | _        t/        |j0                        D cg c]  }t3        ||d| 
       c}| _        t        j                  j5                  dd      | _        |j*                  | _        y c c}w )Nr   0Layerdrop is currently disabled in TFLED models.r5   rr   z1`config.attention_window` has to be an even valuez,`config.attention_window` has to be positivezQ`len(config.attention_window)` should equal `config.num_hidden_layers`. Expected z, given embed_positionsrO  layers.rP  layernorm_embeddingrR  rs   )rO   rP   rt   r   r{   r~   r   encoder_layerdroploggerwarning	layerdropr-   padding_idx
isinstancer   r^   num_hidden_layerslenembed_tokensrL   max_encoder_position_embeddingsr'  r  r:   encoder_layersrL  rW  r  rz   rQ   rt   r  rT   irU   s        r2   rP   zTFLEDEncoder.__init__  s   "6"||++FNN;##a'NNMN!..f--s3**Q.!3h5hh3**Q.^0^^.'-'>'>&?&BZBZ&ZF#v../63K3KK "445Xc&BYBY>Z=[]K
 !' 7 7(>22NN" 

 RWW]WlWlQmnA(71#Gn#(<<#B#B4Vk#B#l  os   G%c                    | j                   S rN   r  rQ   s    r2   get_embed_tokenszTFLEDEncoder.get_embed_tokens  s       rE   c                    || _         y rN   r  rQ   r  s     r2   set_embed_tokenszTFLEDEncoder.set_embed_tokens  
    (rE   c
                   ||t        d      |=t        |      }
t        || j                  j                         | j                  |      }n|t        |      dd }
nt        d      |t        j                  |
d      }|'|t        j                  |dz   |j                        z  }| j                  |||| j                        \  }}}}t        |      }
t
        j                  j                  t        j                  |t
        j                        d      }t
        j                  j                  t        j                  |t
        j                        d      }t
        j                  j                  |      }| j!                  |
      }||z   }| j#                  |      }| j%                  ||	      }|#t'        |      ddd	d	ddf   }|ddddddf   }|rd
nd}|rd
ndx}}|gt
        j(                  j+                  t        |      d	   t-        | j.                        dt-        | j.                         dt        |      d	    d       t1        | j.                        D ]  \  }}|r| j3                  ||      }||fz   }t5        j6                  d	d      }|	r|| j8                  k  rH ||||||   nd|||      }|d	   }|se|t        j:                  |d   d      fz   }|t        j:                  |d   d      fz   } | j3                  ||      }|r0|d	kD  r)t=        |D cg c]  }|ddddd| ddf    c}      n|}|r||fz   }|st=        d |||fD              S t?        ||||      S c c}w )aW  
        Args:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`tf.Tensor` of shape `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   r   )r,   r   inputs_embedsr-   r   r   rs   &The head_mask should be specified for  layers, but it is for r   r   )r   r   r   r   r   r   r   rr   )r   r   r	   rr   c              3  &   K   | ]	  }||  y wrN   rs   .0vs     r2   	<genexpr>z$TFLEDEncoder.call.<locals>.<genexpr>  s     eqWXWdes   )r  r   r  r  ) rw   r   r   r  	input_dimr!   r#   r"   r    _pad_to_window_sizer  r   lessint8r   
reduce_anyr  r  r   rJ   r'   r   r  r{   	enumeratecompute_hidden_statesrandomuniformr  r   r   r  )rQ   r,   r  r   rz  	head_maskoutput_attentionsoutput_hidden_statesreturn_dictr   r[   padding_lenr   r   r   	embed_posr   encoder_statesall_attentionsall_global_attentionsidxencoder_layerhidden_states_to_adddropout_probabilityr_  states                             r2   rY   zTFLEDEncoder.call  s   d  ]%>cdd"$Y/K*9d6G6G6Q6QR --i8M&$]3CR8KTUU!WW[!4N !,+bgg7Lq7PYgYmYm.nnN@D@X@X)'))	 AY A
=Y !0'',,rww~rww'GK!wwrww~rww/OQRS++,@A((5	%	100?]XF %).9!Q1*EN+Aq$,<=N37HdR.  LL%%9%a(DKK <S=M<N O"9-a014	 &  #,DKK"8 	pC#'+'A'A-Q\']$!/3G2I!I"(..A"604>>A)+-2;2G	#T /%9-M *!,M !/2<<a@PR^3_2a!a )>m\]N^`lAm@o(o%3	p: 22=+N  ? .QuQ=[L=!34QR#   +}.>>Ne]NN$Seee*+(%3	
 	
 Rs   ,M<c                (    |dkD  r|d d d | f   S |S )Nr   rs   )rQ   r   r  s      r2   r  z"TFLEDEncoder.compute_hidden_states  s#    2=/}Q+-.T}TrE   c                v   t        | j                  t              r| j                  nt        | j                        }|dz  dk(  s
J d|        |t	        |      n
t	        |      }|dd \  }}|||z  z
  |z  }	|	dkD  r!t
        j                  d| d||	z    d|        t        j                  ddgd|	gg      }
|t        j                  ||
|      }|G|	dkD  rBt        j                  ||	f|      }| j                  |      }t        j                  ||gd	
      }t        j                  ||
d      }|	|||fS )zaA helper function to pad tokens and mask to work with implementation of Longformer selfattention.rr   r   z2`attention_window` should be an even value. Given Nz(Input ids are automatically padded from r]  z0 to be a multiple of `config.attention_window`: r   r   r6   F)r  r   r^   maxr   r  warning_oncer!   r$   r   r#   r  r%   )rQ   r,   r   r  r-   r   r[   r   r\   r  r   input_ids_paddinginputs_embeds_paddings                r2   r  z TFLEDEncoder._pad_to_window_size  sz    &00E0Es%KD!!QTUYUjUjQk 	  !#q(q,^_o^p*qq(/8/Dj+*UbJc)"1o
G''4D*DDHXX?:7)4R]H]G^ _..>-?A
 ''!Q![1A(BC y(LQI$Q$&GGZ,E|$T!(,(9(9:K(L% "		=:O*PWY Z%P 	
 	
rE   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Zt        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   xY w# 1 sw Y   nxY w# 1 sw Y   axY wNTr  r  r{   )
r   r   r!   r   r  rX   r   r  rz   r{   rQ   r[   layers      r2   r   zTFLEDEncoder.build  s!   ::
4*D1=t33889 1$$**4014.5At77<<= M((..dDNN/KLM44(4 &]]5::. &KK%& && 51 1M M& &s$   D/%)D;E/D8;EE	rN   rt   r   r  z Optional[keras.layers.Embedding])	NNNNNNNNF)rb   rc   rd   r   r~  rP   r  r  r   rY   r!   functionr  r  r   rf   rg   s   @r2   r  r    su    L(<!)  "![
 [
z [[U U+
Z&rE   r  c                  b     e Zd ZeZ	 dd fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dd       ZddZ	 xZ
S )TFLEDDecoderc                $   t        |   d
i | || _        |j                  | _        || _        |j                  dkD  rt        j                  d       d| _	        t        |j                  |j                  d      | _        t        |j                        D cg c]  }t!        |d|        c}| _        t$        j"                  j'                  dd	      | _        t$        j"                  j+                  |j,                        | _        y c c}w )Nr   r  r5   r  rO  r  rP  r  rR  rs   )rO   rP   rt   r-   r  r  decoder_layerdropr  r  r  rL   max_decoder_position_embeddingsr'  r  r:   decoder_layersrb  r{   r   rW  r  r~   r   r  s        r2   rP   zTFLEDDecoder.__init__  s    "6"!..(##a'NNMN>22NN" 

 OTTZTiTiNjk(smDk#(<<#B#B4Vk#B#l ||++FNN; ls   Dc                    || _         y rN   r  r  s     r2   r  zTFLEDDecoder.set_embed_tokens  r  rE   c           
        ||t        d      |t        |      }n|t        |      dd }nt        d      |t        |d   d         d   nd}| j                  ||      }|1t        || j                  j
                         | j	                  |      }|}|d   dkD  rt        ||      }n.t        t        j                  |d   |d   |z   f      |d   	      }||d   dkD  r|t        ||d   	      z   }||t        ||d   	      }| j                  ||z         }| j                  ||
      }d}d}d}d}|gt        j                  j                  t        |      d   t        | j                        dt        | j                         dt        |      d    d       t!        | j                        D ]w  \  }}|r||fz  }t#        j$                  dd      }|r|| j&                  k  r6|||   nd} ||||||||   nd|||   nd|      \  }}}}|	r||fz  }|
sl||fz  }||fz  }y |r||fz  }nd}|
r|nd}|
r|nd}|	r|nd}|st)        d |||||fD              S t+        |||||      S )aM  
        Args:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            encoder_head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
                on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
                Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
                decoding. If `past_key_values` are used, the user can optionally input only the last
                `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
                `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
                inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   rr   r   )r?   )rA   r   rs   r  r  r   r   )r   rk  rl  r   rm  r>  c              3  $   K   | ]  }|| 
 y wrN   rs   r  s     r2   r  z$TFLEDDecoder.call.<locals>.<genexpr>  s      = s   )r  r  r   r  r  )rw   r   r  r   r  r  rD   rJ   r!   r8   r  r   r'   r   r  r{   r  r  r  r  r   r   )rQ   r,   r  r   rk  rl  r  encoder_head_maskr  	use_cacher  r  r  r   r[   r?   	positionsr   combined_attention_maskall_hidden_statesall_self_attnsall_cross_attentionspresent_key_valuesr  decoder_layerr  r>  layer_self_attnlayer_cross_attnrp  s                                 r2   rY   zTFLEDDecoder.call  sW   N  ]%>stt"$Y/K&$]3CR8KdeeIXIdOA,>q,A!B1!Ejk ((6LM	 *9d6G6G6Q6QR --i8M% r?Q&7\r&s#&2QQ:P)PQR\ghj\k'# %+b/A*=&=^epqset@u&u# ,1G1S%12HR]^`Ra%b"001JK]XF !  LL%%9%a(DKK <S=M<N O"9-a014	 &  #,DKK"8 	<C#!m%55!"(..A"604>>A5D5P_S1VZNR_6&;'=2;2G	#TBSB_(9#(>ei-SOM?,<>O "'8&::" ?"44$)9(;;$5	<8  -!11 $+<$7H3d3</$ ');=NP^`tu   ?"/ 2/)!5 rE   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       K| j                  D ];  }t        j                  |j
                        5  |j                  d        d d d        = y y # 1 sw Y   xY w# 1 sw Y   nxY w# 1 sw Y   axY wr  )r   r   r!   r   r  rX   r   r  rt   r'  r{   r  s      r2   r   zTFLEDDecoder.build  s'   ::
4*D1=t33889 1$$**4014.5At77<<= R((..dDKK<O<O/PQR44(4 &]]5::. &KK%& && 51 1R R& &s$   D9%3EE9EEE	rN   r  )NNNNNNNNNNNNF)rb   rc   rd   r   r~  rP   r  r   rY   r   rf   rg   s   @r2   r  r    s\    L<$)  "#!p pd&rE   r  c                  n     e Zd ZeZd fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 dd       Z	d	dZ
 xZS )
TFLEDMainLayerc                   t        |   di | || _        t        j                  j                  |j                  |j                  t        j                  j                  | j                  j                        d      | _        d| j                  _        t        || j                  d      | _        t        || j                  d      | _        y )N)stddevz
led.shared)r  
output_dimembeddings_initializerrX   encoderrO  decoderrs   )rO   rP   rt   r   r{   	Embedding
vocab_sizer'  initializersTruncatedNormalinit_stdsharedload_weight_prefixr  r  r  r  ri  s      r2   rP   zTFLEDMainLayer.__init__  s    "6"ll,,''~~#(#5#5#E#ET[[MaMa#E#b	 - 
 *6&#FDKKiH#FDKKiHrE   c                    | j                   S rN   )r   r  s    r2   get_input_embeddingsz#TFLEDMainLayer.get_input_embeddings  s    {{rE   c                ~    || _         | j                   | j                  _        | j                   | j                  _        y rN   )r   r  r  r  )rQ   new_embeddingss     r2   set_input_embeddingsz#TFLEDMainLayer.set_input_embeddings  s)    $$(KK!$(KK!rE   c                L   ||d}|| j                  |||||
||||	      }nl|rHt        |t              s8t        |d   t        |      dkD  r|d   nd t        |      dkD  r|d   nd       }n"|s t        |t              s|j                         }| j                  |||d   ||||	||||||      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )	NF)	r,   r   rz  r  r  r  r  r  r   r   r   rr   )r  r   r  )r   rk  rl  r  r  r  r  r  r  r  r  r   	r  r  r  r  r  r  rk  r  r  )r  r  r  r  r   to_tupler  r  r  r  r   r  r  r  )rQ   r,   r   decoder_input_idsdecoder_attention_maskr  decoder_head_maskencoder_outputsrz  r  r  decoder_inputs_embedsr  r  r  r  r   rT   decoder_outputss                      r2   rY   zTFLEDMainLayer.call  s`   * $)>)FI""ll#-&;#+"3%9'! + 
O O=X!Y9"1!"4474H14Loa0RV14_1E1I?1-tO Z%G-668O,,1"1!"4#1''+//!5# ' 
  "_44&-??+;;"1"?"?.99,==&5&G&G"1"?"?.99&5&G&G

 
	
rE   c                   | j                   ry d| _         t        j                  | j                  j                  dz   | j                  j
                  z   dz         5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY w)NT/r  r  )
r   r!   r   r   r  rX   r   r   r  r  r   s     r2   r   zTFLEDMainLayer.build	  s   ::
 ]]4;;99C?$++BRBRRUXXY 	$KKd#	$4D)5t||001 )""4()4D)5t||001 )""4() ) 6	$ 	$) )) )s$   D55EE5D>E
Eru  NNNNNNNNNNNNNNNF)r  z3Optional[Union[Tuple, TFLEDEncoderBaseModelOutput]]rN   )rb   rc   rd   r   r~  rP   r  r  r   rY   r   rf   rg   s   @r2   r  r    sr    LI0
  #OS""!#K
 MK
 K
Z)rE   r  zQThe bare LED Model outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZd Zd Ze eej                  d             e
eee      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
d                     Zd ZddZ xZS )
TFLEDModelc                P    t        |   |g|i | t        |d      | _        y )Nrx  rO  )rO   rP   r  rx  rQ   rt   r   rT   rU   s       r2   rP   zTFLEDModel.__init__+	  s(    3&3F3!&u5rE   c                .    | j                   j                  S rN   rx  r  r  s    r2   get_encoderzTFLEDModel.get_encoder0	      xxrE   c                .    | j                   j                  S rN   rx  r  r  s    r2   get_decoderzTFLEDModel.get_decoder3	  r  rE   zbatch_size, sequence_length)
checkpointoutput_typer~  c                H    | j                  |||||||||	|
||||||      }|S )N)r,   r   r
  r  r  rz  r  r  r  r  r  r  r  r  r  r   )rx  )rQ   r,   r   r
  r  r  r  r  rz  r  r  r  r  r  r  r  r   rT   r   s                      r2   rY   zTFLEDModel.call6	  sN    6 (()/#9+"7/+'"7/!5#!  
& rE   c                Z   | j                   j                  r"t        j                  |j                        d   nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }t        |j                  |||||j                   |||	      S )Nr   r  )rt   r  r!   r   r  r  r$   r  r  r  r  rk  r  r  r  r  r  	rQ   r$  pkvdec_hs	dec_attnscross_attnsenc_hs	enc_attnsenc_g_attnss	            r2   serving_outputzTFLEDModel.serving_outputf	  s>   59[[5J5Jbhhv--.q1PTGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	GK{{GdGdb**6+B+BCjnGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	PTP[P[PmPmb**6+K+KLsw&$66"(((&,&F&F"((&1

 
	
rE   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTrx  )r   r   r!   r   rx  rX   r   r   s     r2   r   zTFLEDModel.build{	  se    ::
4%1txx}}- %t$% % 2% %s   A11A:r  )"r,   TFModelInputType | Noner   rH  r
  rH  r  rH  r  rH  r  rH  r  rH  rz  rH  r  rI  r  rH  r  rH  r  bool | Noner  r-  r  r-  r  r-  r   rF  rJ  z*Tuple[tf.Tensor] | TFLEDSeq2SeqModelOutputrN   )rb   rc   rd   rP   r  r  r   r   LED_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr  _CONFIG_FOR_DOCrY   r*  r   rf   rg   s   @r2   r  r  &	  s<   
6
   *+?+F+FGd+ef&+$ .2+/.237&*.2,026:>*.26!%)-,0#'#'*' )' ,	'
 !1' $' ,' *'  0' 8' ('  0' ' '' *'  !!'" #'& 
4'' g 'R
*%rE   r  c                  (     e Zd ZdZ fdZd Z xZS )	BiasLayerz
    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
    so all weights have to be registered in a layer.
    c                \    t        |   dd|i| | j                  ||||      | _        y )NrX   rX   r   initializer	trainablers   )rO   rP   
add_weightr8  )rQ   r   r6  r7  rX   rT   rU   s         r2   rP   zBiasLayer.__init__	  s3    -d-f- OOU_hOi	rE   c                     || j                   z   S rN   )r8  )rQ   xs     r2   rY   zBiasLayer.call	  s    499}rE   ra   rg   s   @r2   r3  r3  	  s    
jrE   r3  zKThe LED Model with a language modeling head. Can be used for summarization.c                  &    e Zd ZddgZ fdZd Zd Zd Zd Zd Z	d	 Z
e ee       eee
      	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd Z	 	 	 	 	 	 ddZddZd ZddZ xZS )TFLEDForConditionalGenerationzled.encoder.embed_tokens.weightzled.decoder.embed_tokens.weightc                    t        |   |g|i | t        |d      | _        |j                  | _        t        dd|j                  gdd      | _        d| _        y )Nrx  rO  final_logits_biasr   r<   Fr5  )	rO   rP   r  rx  r  r3  r  
bias_layersupports_xla_generationr  s       r2   rP   z&TFLEDForConditionalGeneration.__init__	  sa    3&3F3!&u5))#$Q0A0A,BPWch

 (-$rE   c                .    | j                   j                  S rN   r  r  s    r2   r  z)TFLEDForConditionalGeneration.get_decoder	  r  rE   c                .    | j                   j                  S rN   r  r  s    r2   r  z)TFLEDForConditionalGeneration.get_encoder	  r  rE   c                2    d| j                   j                  iS )Nr>  )r?  r8  r  s    r2   get_biasz&TFLEDForConditionalGeneration.get_bias	  s    #T__%9%9::rE   c                    |d   j                   d   }t        dd|gdd      | _        | j                  j                  j	                  |d          y )Nr>  r   r   r<   Fr5  )r   r3  r?  r8  assign)rQ   rn   r  s      r2   set_biasz&TFLEDForConditionalGeneration.set_bias	  sR    ./55b9
#$Q
O\a
 	##E*=$>?rE   c                "    | j                         S rN   )r  r  s    r2   get_output_embeddingsz3TFLEDForConditionalGeneration.get_output_embeddings	  s    ((**rE   c                &    | j                  |       y rN   )r  )rQ   rn   s     r2   set_output_embeddingsz3TFLEDForConditionalGeneration.set_output_embeddings	  s    !!%(rE   )r  r~  c                r   |;d}|7|5t        || j                  j                  | j                  j                        }| j	                  |||||||||	|
||||||      }t        j                  |d   | j                  j                  j                  d      }| j                  |      }|dn| j                  ||      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  
      S )	a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, TFLEDForConditionalGeneration
        >>> import tensorflow as tf

        >>> mname = "allenai/led-base-16384"
        >>> tokenizer = AutoTokenizer.from_pretrained(mname)
        >>> TXT = "My friends are <mask> but they eat too many carbs."
        >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
        >>> batch = tokenizer([TXT], return_tensors="tf")
        >>> logits = model(inputs=batch.input_ids).logits
        >>> probs = tf.nn.softmax(logits[0])
        >>> # probs[5] is associated with the mask token
        ```NF)r   r
  r  r  rz  r  r  r  r  r  r  r  r  r  r   r   Tr  r   )
r  r  r  r  r  r  r  rk  r  r  )r3   rt   r-   r.   rx  r!   r  r   weightsr?  hf_compute_lossr  r  r  r  r  r  rk  r  r  )rQ   r,   r   r
  r  r  r  r  rz  r  r  r  r  r  r  r  labelsr   r   	lm_logitsmasked_lm_lossr$  s                         r2   rY   z"TFLEDForConditionalGeneration.call	  s]   T I (-B-J$6DKK44dkk6X6X%! (()/#9+"7/+'"7/!5#!  
$ IIgaj$((//*A*AtT	OOI.	!'T5I5I&R[5\\GABK/F3A3M^%.YSYY##33")"?"?&99$55&-&G&G")"?"?&99&-&G&G
 	
rE   c                Z   | j                   j                  r"t        j                  |j                        d   nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j
                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }| j                   j                  rt        j                  |j                        nd }t        |j                  |||||j                   |||	      S )Nr   )	r  r  r  r  r  r  rk  r  r  )rt   r  r!   r   r  r  r$   r  r  r  r  rk  r  r  r  r  r  r"  s	            r2   r*  z,TFLEDForConditionalGeneration.serving_output
  s<   59[[5J5Jbhhv--.q1PTGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	GK{{GdGdb**6+B+BCjnGK{{GgGg%%f&B&BCmqGK{{GdGdB(()B)BCjn	PTP[P[PmPmb**6+K+KLsw#=="(((&,&F&F"((&1

 
	
rE   c           	     2    ||d d dd f   }d |||||||dS )Nr   )r,   r  r  r
  r   r  r  r  rs   )	rQ   r
  r  r   r  r  r  r  rT   s	            r2   prepare_inputs_for_generationz;TFLEDForConditionalGeneration.prepare_inputs_for_generation/
  s?     & 1!RS& 9 ..!2,"!2"	
 		
rE   c                l    t        || j                  j                  | j                  j                        S rN   )r3   rt   r-   r.   )rQ   rO  s     r2   %prepare_decoder_input_ids_from_labelszCTFLEDForConditionalGeneration.prepare_decoder_input_ids_from_labelsI
  s%    !&$++*B*BDKKDfDfggrE   c           	     *   t         j                  j                  dt         j                  j                  j                        }| j
                  j                  rt        j                  |d      }t        j                  || j
                  j                        }t        j                  t        j                  |dt        |      d   f      |      }t        j                  ||      } |||      S  |t        j                  j                  |      |      }t        j                  || j
                  j                  k7  |j                         }||z  }	t        j"                  |	      t        j"                  |      z  }
t        j                  |
d      S )z(CrossEntropyLoss that ignores pad tokensT)from_logits	reduction)r   r   rr   r   )r   )r   lossesSparseCategoricalCrossentropy	ReductionNONErt   tf_legacy_lossr!   r;   	not_equalr-   boolean_maskr   nnrelur"   r    
reduce_sum)rQ   rO  r  loss_fnmelted_labelsactive_lossreduced_logitsunmasked_loss	loss_maskmasked_lossreduced_masked_losss              r2   rN  z-TFLEDForConditionalGeneration.hf_compute_lossL
  s(   ,,<<Y^YeYeYoYoYtYt<u;;%%JJvu5M,,}dkk6N6NOK__RZZZPVEWXYEZ@[-\^ijN__]K@F6>22  

6 2F;GGFdkk&>&>>mFYFYZ	#i/ mmK82==;SSzz-t44rE   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTrx  r?  )r   r   r!   r   rx  rX   r   r?  r   s     r2   r   z#TFLEDForConditionalGeneration.build^
  s    ::
4%1txx}}- %t$%4t,8t334 ,%%d+, , 9% %, ,s   C%CCC )NNNNNNNNNNNNNNNNF)$r,   r,  r   np.ndarray | tf.Tensor | Noner
  rm  r  rm  r  rm  r  rm  r  z"TFLEDEncoderBaseModelOutput | Nonerz  rm  r  z1Tuple[Tuple[Union[np.ndarray, tf.Tensor]]] | Noner  rm  r  rm  r  r-  r  r-  r  r-  r  r-  rO  rH  r   rF  rJ  z'Tuple[tf.Tensor] | TFLEDSeq2SeqLMOutput)NNNNNN)rO  rG  rN   )rb   rc   rd   "_keys_to_ignore_on_load_unexpectedrP   r  r  rD  rG  rI  rK  r   r   r.  r   r  r1  rY   r*  rT  rV  rN  r   rf   rg   s   @r2   r<  r<  	  s    	+**&

-  ;@+) *+?@+?o^ .28<;?@D37;?>B?CMQ7;?C!%)-,0#'#'%R
*R
 6R
 9	R

 !>R
 1R
 9R
 <R
  =R
 KR
 5R
  =R
 R
 'R
 *R
  !!R
" !#R
$ %R
& 
1'R
 _ A R
h
0 
4h5$	,rE   r<  )r,   rG  r-   r^   r.   r^   r_   )r>   r`   r?   r^   rN   )rB   rG  rA   zOptional[int])Ere   
__future__r   r  dataclassesr   typingr   r   r   r   numpynp
tensorflowr!   activations_tfr
   modeling_tf_outputsr   modeling_tf_utilsr   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   r   r   configuration_ledr   
get_loggerrb   r  r0  r1  r9   r3   rD   rJ   r{   r  rL   Layerri   r!  r0  rL  rb  rw  r  r  r  LED_START_DOCSTRINGr.  r  r  r  r  r3  r<  rs   rE   r2   <module>r~     sz    "  ! / /   / N  S R  ) 
		H	%.  2;$
6Cell&<&< C$`
 2 2 `
F$KELL.. $KNeBELL.. eBPMJ** MJ`EJ** EJP,  ';+ '; ';T @Ck @C @CF ?C; ?C ?CD' RA H J&5<<%% J& J&Z _&5<<%% _& _&D v)U\\'' v) v)r WW%% W%	W%v"" " QM,$8 M,	M,rE   