
"""Flax LongT5 model."""

import copy
from typing import Any, Callable, List, Optional, Tuple

import flax.linen as nn
import jax
import jax.numpy as jnp
import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen import partitioning as nn_partitioning
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax.random import PRNGKey

from ...modeling_flax_outputs import (
    FlaxBaseModelOutput,
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    FlaxCausalLMOutputWithCrossAttentions,
    FlaxSeq2SeqLMOutput,
    FlaxSeq2SeqModelOutput,
)
from ...modeling_flax_utils import (
    ACT2FN,
    FlaxPreTrainedModel,
    append_call_sample_docstring,
    append_replace_return_docstrings,
    overwrite_call_docstring,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_longt5 import LongT5Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/long-t5-local-base"
_CONFIG_FOR_DOC = "LongT5Config"

remat = nn_partitioning.remat
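

# Illustrative usage sketch (not part of the original file): a minimal forward pass
# through the public `transformers` API, assuming the "google/long-t5-local-base"
# checkpoint is available locally or via the hub. Kept inside a function so that
# importing this module has no side effects.
def _example_usage():
    from transformers import AutoTokenizer, FlaxLongT5Model

    tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
    model = FlaxLongT5Model.from_pretrained("google/long-t5-local-base")
    inputs = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="np")
    outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=inputs["input_ids"])
    return outputs.last_hidden_state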


def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = jnp.zeros_like(input_ids)
    shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1])
    shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id)

    shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
    return shifted_input_ids


def _pad_to_multiple(x: jnp.ndarray, block_len: int, axis: int, pad_value: int = 0) -> jnp.ndarray:
    """Pad an array so that a sequence length will be a multiple of `block_len`"""
    pad_len = -x.shape[axis] % block_len
    pad = [(0, 0)] * x.ndim
    pad[axis] = (0, pad_len)
    x = jnp.pad(x, pad_width=pad, mode="constant", constant_values=pad_value)
    return x


def _split_into_blocks(x: jnp.ndarray, block_len: int, axis: int) -> jnp.ndarray:
    """Split an input array into blocks of a given `block_len` along the given `axis`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    """
    # pad tensor to a multiple of block_len
    if x.shape[axis] % block_len != 0:
        x = _pad_to_multiple(x, block_len, axis, pad_value=0)
    num_blocks = x.shape[axis] // block_len
    output_shape = x.shape[:axis] + (num_blocks, block_len) + x.shape[(axis + 1) :]
    return x.reshape(output_shape)


def _concatenate_3_blocks(x: jnp.ndarray, block_axis: int, sequence_axis: int, pad_value: int = 0) -> jnp.ndarray:
    """Concatenate three consecutive blocks for each input block for local attention.

    For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
    """
    num_blocks = x.shape[block_axis]

    pad = [(0, 0)] * x.ndim
    pad[block_axis] = (1, 1)
    # [batch_size, num_blocks, block_len] -> [batch_size, num_blocks + 2, block_len]
    x = jnp.pad(x, pad_width=pad, mode="constant", constant_values=pad_value)

    blocks_list: List[jnp.ndarray] = []
    for i in range(3):
        # We use indexing approach here:
        # https://numpy.org/doc/stable/user/basics.indexing.html#dealing-with-variable-numbers-of-indices-within-programs
        indices = [slice(0, None)] * x.ndim
        indices[block_axis] = slice(i, i + num_blocks)
        indices = tuple(indices)
        blocks_list.append(x[indices])
    # [batch_size, num_blocks, 3 * block_len, ...]
    return jnp.concatenate(blocks_list, axis=sequence_axis)


def _make_3block_relative_position_ids(block_len: int) -> jnp.ndarray:
    """Makes 3-blocked relative position ids for local attention."""
    position_ids = jnp.arange(3 * block_len, dtype=jnp.int32)
    center_position_ids = position_ids[block_len:-block_len]
    # [block_len, 3 * block_len]
    relative_position_ids = position_ids[None, :] - center_position_ids[:, None]
    return relative_position_ids


def _mask_local_attention_mask(local_attention_mask: np.ndarray, block_len: int) -> jnp.ndarray:
    """Mask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius``."""
    relative_position_ids = _make_3block_relative_position_ids(block_len)
    locality_mask = jnp.abs(relative_position_ids) < block_len
    locality_mask = locality_mask[None, None, :, :]
    return jnp.logical_and(local_attention_mask, locality_mask)


def _get_local_attention_mask(attention_mask: np.ndarray, block_len: int) -> jnp.ndarray:
    """Prepare attention mask to be applied for a local attention."""
    # [batch_size, num_blocks, block_len]
    _blocked_attention_mask = _split_into_blocks(attention_mask, block_len, axis=1)
    # [batch_size, num_blocks, 3 * block_len]
    _3blocked_attention_mask = _concatenate_3_blocks(_blocked_attention_mask, block_axis=1, sequence_axis=2)

    _blocked_attention_mask = _blocked_attention_mask[..., None]
    _3blocked_attention_mask = _3blocked_attention_mask[..., None, :]
    # [batch_size, num_blocks, block_len, 3 * block_len]
    local_attention_mask = jnp.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
    local_attention_mask = _mask_local_attention_mask(local_attention_mask, block_len)
    # [batch_size, 1, num_blocks, block_len, 3 * block_len]
    return local_attention_mask[:, None, ...]
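

# Illustrative sketch (not part of the original file): how the blocking helpers above
# compose. With a toy mask of shape (batch=2, seq_len=10) and block_len=4, the sequence
# is first padded to 12 tokens and split into 3 blocks; each block then attends to
# itself plus its two neighbours (3 * block_len key positions).
def _example_local_attention_mask_shapes():
    mask = jnp.ones((2, 10))
    blocked = _split_into_blocks(mask, block_len=4, axis=1)
    assert blocked.shape == (2, 3, 4)
    local_mask = _get_local_attention_mask(mask, block_len=4)
    # (batch_size, 1, num_blocks, block_len, 3 * block_len)
    assert local_mask.shape == (2, 1, 3, 4, 12)
    return local_mask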


def _make_global_fixed_block_ids(attention_mask: np.ndarray, global_block_size: int) -> Tuple[jnp.ndarray, np.ndarray]:
    """Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformer implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    """
    batch_size, seq_len = attention_mask.shape[:2]

    def handle_orphan_tokens(block_ids: np.ndarray) -> jnp.ndarray:
        block_ends = (jnp.arange(seq_len) % global_block_size) == global_block_size - 1
        true_block_ends = jnp.logical_and(block_ends, block_ids >= 0)
        full_blocks = true_block_ends.sum(-1)[..., None]
        block_ids = jnp.minimum(block_ids, full_blocks - 1)
        return block_ids

    fixed_block_mask = jnp.ones_like(attention_mask) / global_block_size
    fixed_block_mask = jnp.cumsum(fixed_block_mask, axis=1) - fixed_block_mask
    mask = jnp.where(attention_mask != 0.0, 1.0, -1000.0)
    global_block_ids = jnp.maximum(
        jnp.floor(mask + fixed_block_mask - 1.0), jnp.array(-1.0, dtype=attention_mask.dtype)
    )
    # set padding tokens to -1
    global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1)
    # [batch_size, seq_len]
    global_block_ids = handle_orphan_tokens(global_block_ids)
    num_globals = seq_len // global_block_size

    # [batch_size, seq_len // global_block_size]
    if num_globals > 0:
        _sequence_block_ids_max = jnp.repeat(global_block_ids.max(axis=-1)[:, None], repeats=num_globals, axis=1)
    else:
        _sequence_block_ids_max = jnp.zeros((batch_size, 0), dtype=global_block_ids.dtype)
    global_segment_ids = jnp.cumsum(jnp.ones((batch_size, num_globals)), axis=-1) - 1
    global_segment_ids = jnp.where(global_segment_ids <= _sequence_block_ids_max, 1, 0)
    return global_block_ids, global_segment_ids


def _make_side_relative_position_ids(attention_mask: np.ndarray, global_block_size: int) -> np.ndarray:
    """Create the relative position tensor for local -> global attention."""
    block_ids, global_segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size)
    global_seq_len = global_segment_ids.shape[-1]
    global_positions = jnp.arange(global_seq_len)
    side_relative_position = global_positions - block_ids[..., None]
    return side_relative_position


def _create_global_aggregates(hidden_states: np.ndarray, block_ids: np.ndarray, global_seq_len: int) -> np.ndarray:
    """Compute individual block aggregates by summing over individual blocks."""
    # (batch..., seq_len, global_seq_len)
    one_hot_block_ids = jax.nn.one_hot(block_ids, global_seq_len)
    return jnp.einsum("...nd,...ng->...gd", hidden_states, one_hot_block_ids)


class FlaxLongT5LayerNorm(nn.Module):
    hidden_size: int
    dtype: jnp.dtype = jnp.float32
    eps: float = 1e-6
    weight_init: Callable[..., np.ndarray] = jax.nn.initializers.ones

    def setup(self):
        self.weight = self.param("weight", self.weight_init, (self.hidden_size,))

    def __call__(self, hidden_states):
        """
        Construct a layernorm module in the LongT5 style; No bias and no subtraction of mean.
        """
        # layer norm should always be calculated in float32
        variance = jnp.power(hidden_states.astype("f4"), 2).mean(axis=-1, keepdims=True)
        hidden_states = hidden_states / jnp.sqrt(variance + self.eps)

        return self.weight * hidden_states


class FlaxLongT5DenseActDense(nn.Module):
    config: LongT5Config
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        wi_init_std = self.config.initializer_factor * (self.config.d_model**-0.5)
        wo_init_std = self.config.initializer_factor * (self.config.d_ff**-0.5)

        self.wi = nn.Dense(
            self.config.d_ff,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(wi_init_std),
            dtype=self.dtype,
        )
        self.wo = nn.Dense(
            self.config.d_model,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(wo_init_std),
            dtype=self.dtype,
        )
        self.dropout = nn.Dropout(self.config.dropout_rate)
        self.act = ACT2FN[self.config.dense_act_fn]

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class FlaxLongT5DenseGatedActDense(nn.Module):
    config: LongT5Config
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        wi_init_std = self.config.initializer_factor * (self.config.d_model**-0.5)
        wo_init_std = self.config.initializer_factor * (self.config.d_ff**-0.5)

        self.wi_0 = nn.Dense(
            self.config.d_ff,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(wi_init_std),
            dtype=self.dtype,
        )
        self.wi_1 = nn.Dense(
            self.config.d_ff,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(wi_init_std),
            dtype=self.dtype,
        )
        self.wo = nn.Dense(
            self.config.d_model,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(wo_init_std),
            dtype=self.dtype,
        )
        self.dropout = nn.Dropout(self.config.dropout_rate)
        self.act = ACT2FN[self.config.dense_act_fn]

    def __call__(self, hidden_states, deterministic):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class FlaxLongT5LayerFF(nn.Module):
    config: LongT5Config
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        if self.config.is_gated_act:
            self.DenseReluDense = FlaxLongT5DenseGatedActDense(self.config, dtype=self.dtype)
        else:
            self.DenseReluDense = FlaxLongT5DenseActDense(self.config, dtype=self.dtype)

        self.layer_norm = FlaxLongT5LayerNorm(
            self.config.d_model, eps=self.config.layer_norm_epsilon, dtype=self.dtype
        )
        self.dropout = nn.Dropout(self.config.dropout_rate)

    def __call__(self, hidden_states, deterministic=True):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states, deterministic=deterministic)
        hidden_states = hidden_states + self.dropout(forwarded_states, deterministic=deterministic)
        return hidden_states
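

# Illustrative sketch (not part of the original file): exercising the feed-forward block
# above with a tiny, hypothetical configuration. The `LongT5Config` values here are
# chosen only to keep the toy shapes small.
def _example_layer_ff():
    config = LongT5Config(d_model=8, d_ff=16, num_layers=1, num_heads=2, d_kv=4)
    module = FlaxLongT5LayerFF(config)
    hidden_states = jnp.ones((1, 5, 8))
    params = module.init(jax.random.PRNGKey(0), hidden_states)
    # Output keeps the input shape: (batch, seq_len, d_model) == (1, 5, 8)
    return module.apply(params, hidden_states)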


class FlaxLongT5Attention(nn.Module):
    config: LongT5Config
    has_relative_attention_bias: bool = False
    causal: bool = False
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.relative_attention_num_buckets = self.config.relative_attention_num_buckets
        self.relative_attention_max_distance = self.config.relative_attention_max_distance
        self.d_model = self.config.d_model
        self.key_value_proj_dim = self.config.d_kv
        self.n_heads = self.config.num_heads
        self.dropout = self.config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        q_init_std = self.config.initializer_factor * ((self.inner_dim * self.key_value_proj_dim) ** -0.5)
        kv_init_std = self.config.initializer_factor * (self.inner_dim**-0.5)
        o_init_std = self.config.initializer_factor * (self.inner_dim**-0.5)

        self.q = nn.Dense(
            self.inner_dim,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(q_init_std),
            dtype=self.dtype,
        )
        self.k = nn.Dense(
            self.inner_dim,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(kv_init_std),
            dtype=self.dtype,
        )
        self.v = nn.Dense(
            self.inner_dim,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(kv_init_std),
            dtype=self.dtype,
        )
        self.o = nn.Dense(
            self.d_model,
            use_bias=False,
            kernel_init=jax.nn.initializers.normal(o_init_std),
            dtype=self.dtype,
        )

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embed(
                self.relative_attention_num_buckets,
                self.n_heads,
                embedding_init=jax.nn.initializers.normal(kv_init_std),
                dtype=self.dtype,
            )

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0) * num_buckets
            relative_position = jnp.abs(relative_position)
        else:
            relative_position = -jnp.clip(relative_position, a_max=0)
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        relative_position_if_large = max_exact + (
            jnp.log(relative_position / max_exact) / jnp.log(max_distance / max_exact) * (num_buckets - max_exact)
        )
        relative_position_if_large = jnp.clip(relative_position_if_large, a_max=num_buckets - 1)

        relative_buckets += jnp.where(is_small, relative_position, relative_position_if_large)

        return relative_buckets.astype("i4")

    def compute_bias(self, query_length, key_length):
        """Compute binned relative position bias"""
        context_position = jnp.arange(query_length, dtype="i4")[:, None]
        memory_position = jnp.arange(key_length, dtype="i4")[None, :]

        relative_position = memory_position - context_position
        relative_position_bucket = self._relative_position_bucket(
            relative_position,
            bidirectional=(not self.causal),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )

        values = self.relative_attention_bias(relative_position_bucket)
        values = values.transpose((2, 0, 1))[None, :, :, :]
        return values

    def _split_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.n_heads, self.key_value_proj_dim))

    def _merge_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.inner_dim,))

    @nn.compact
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slightly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        """
        # detect if we're initializing by absence of existing cache data.
        is_initialized = self.has_variable("cache", "cached_key")
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
            # update key, value caches with our new 1d spatial slices
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = jax.lax.dynamic_update_slice(cached_key.value, key, indices)
            value = jax.lax.dynamic_update_slice(cached_value.value, value, indices)
            cached_key.value = key
            cached_value.value = value
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors
            # causal mask for cached decoder self-attention: our single query position should only attend to those key
            # positions that have already been generated and cached, not the remaining zero elements.
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )
            attention_mask = combine_masks(pad_mask, attention_mask)
        return key, value, attention_mask

    def _create_position_bias(
        self, key_states, query_states, attention_mask, init_cache, seq_length, causal_attention_mask_shift
    ):
        cache_is_filled = self.causal and self.has_variable("cache", "cached_key") and (not init_cache)
        key_length = key_states.shape[1]
        query_length = key_length if cache_is_filled else query_states.shape[1]

        if self.has_relative_attention_bias:
            position_bias = self.compute_bias(query_length, key_length)
        elif attention_mask is not None:
            position_bias = jnp.zeros_like(attention_mask)
        else:
            position_bias = jnp.zeros((1, self.n_heads, query_length, key_length), dtype=self.dtype)

        # if key and values are already calculated, only the last query position bias should be taken
        if cache_is_filled:
            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
            position_bias = jax.lax.dynamic_slice(
                position_bias,
                (0, 0, causal_attention_mask_shift, 0),
                (1, self.n_heads, seq_length, max_decoder_length),
            )
        return position_bias
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nrf   r(   r&  r'  r)  r   boolrV   r!   )rK   rw   r   Tbiasdropout_rngr   broadcast_dropoutr   rW   ...hqk,...khd->...qhd)r=   r   r   r   r   r)   r   r-  r   r?  r   r   r1  r@  r3  expand_dimsr   r=  finforW   minselectfullr   rI  r   make_rngr   r   r$  r   )r   r   rd   key_value_statesrG  	use_cacheoutput_attentionsr   rC  r   rD  rB  rA  value_statesrE  causal_attention_maskrH  
mask_valuerQ  attn_weightsattn_outputoutputss                         r/   r   zFlaxLongT5Attention.__call__  s|    "/!4!4Ra!8
J vvm,.>.FTVVM*DFFScLd
0@0Htvvm,dffUeNf ((6&&z2
((6 	!3!3B!788 8<7H7HR^7_dhdodoDNN7#M2vw 	$ ;;$4^6$R!   ,7%)^^G%<\%J%P%PQR%S"(+(=(=)6::'9:)% %($4$4%
}7L7R7RSTSU7V'V%! !--X>@U@[@[N +>;PQN' __^(KN ;;D--g|D
7;7Q7QL,84Jn
 %4::.22J WW^^"--s3::4::F--z:AA$**MN   66L.*jRmM ) - > !3--	2K 5#"'**	
 jj!8,U ''4 ff[)./Gr1   T       )NNNFFTF)r   r   r   r"   r   r   rL  r   r)   r   rW   r   staticmethodr  r  r   r$  r   compactr=  rI  r   r   r1   r/   r   r   ?  s    (--FD{{E399",\ !- !-F"hR ZZ* *@6 qr1   r   c                       e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
edd       ZdefdZd	 Zd
 Zdedeej&                     dej&                  fdZ	 	 	 	 	 ddZy)FlaxLongT5LocalAttentionr   Fr   rW   c                 r   | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                  dz   | _	        | j                   j                  | _        | j                  | j
                  z  | _        | j                   j                  | j                  | j
                  z  dz  z  }| j                   j                  | j                  dz  z  }| j                   j                  | j                  dz  z  }t        j                  | j                  dt         j                  j"                  j%                  |      | j&                        | _        t        j                  | j                  dt         j                  j"                  j%                  |      | j&                        | _        t        j                  | j                  dt         j                  j"                  j%                  |      | j&                        | _        t        j                  | j                  dt         j                  j"                  j%                  |      | j&                        | _        | j0                  rYt        j2                  | j                  | j                  t         j                  j"                  j%                  |            | _        y y )Nr!   r   Fr   r   )r   r   r   r   r   r   r   r   local_radiusr3   r   r   r   r   r   r   r   r   r   rW   r   r   r   r   r   r   r   r   s       r/   r   zFlaxLongT5LocalAttention.setup`  s;   .2kk.X.X+/3{{/Z/Z,{{**"&++"2"2{{,, KK44**Q.{{//(?(??[[33I`I`8`ei7ij
kk448LM[[33t~~t7KL
NN++22:>**	
 NN++22;?**	
 NN++22;?**	
 LL++22:>**	
 +++-8833"vv2299+F,D( ,r1   c                    d}|r&|dz  }|| dkD  |z  z  }t        j                  |       } nt        j                  | d       } |dz  }| |k  }|t        j                  | |z        t        j                  ||z        z  ||z
  z  z   }t        j                  ||dz
        }|t        j                  || |      z  }|j                  d      S r   r  r  s           r/   r  z2FlaxLongT5LocalAttention._relative_position_bucket       AK!2Q!6+ EE #(9 :!$*;1!E E  1$	$y0 &/GG%	12SWW\I=U5VVZehqZqr&
" &)XX.HP[^_P_%`"CIIh0AC]^^&&t,,r1   block_lengthc                    t        j                  d|z  d      }|||  }|dddf   |dddf   z
  }| j                  |d| j                  | j                        }| j                  |      }|j                  d      ddddddddf   }|S r  r   r  rV   NTr  r  r)   rX   r  r   r   r   r  r   ro  r  r  r  r  r  s          r/   r  z%FlaxLongT5LocalAttention.compute_bias      **Q%5TB*<F+D!G47G47PP#'#A#A;;==	 $B $
  --.FG!!),T4Aq-@Ar1   c                 p    |j                  |j                  d d | j                  | j                  fz         S r  r  r  s     r/   r   z%FlaxLongT5LocalAttention._split_heads  r!  r1   c                 V    |j                  |j                  d   d| j                        S Nr   r(   r#  r  s     r/   r$  z%FlaxLongT5LocalAttention._merge_heads  &    $$]%8%8%;RPPr1   r3   rd   r&   c                     | j                   r| j                  |      }|S |t        j                  |      }|S t        j                  dd| j
                  |d|z  f| j                        }|S Nr!   r   rV   r   r  r)   r*   r   r   rW   r   r3   rd   rG  s       r/   rI  z.FlaxLongT5LocalAttention._create_position_bias  s    ++ --i8M  'NN>:M   IIq!T\\9a)m&T\`\f\fgMr1   Nc           
      N   |j                   dd \  }}| j                  |      }	|| j                  |      n| j                  |      }
|| j                  |      n| j                  |      }| j	                  |	      }	| j	                  |
      }
| j	                  |      }t        |	| j                  d      }	t        |
| j                  d      }
t        || j                  d      }t        |
dd      }
t        |dd      }|	t        j                  |	j                   d         z  }	|t        || j                        }t        j                  j                  |dkD  t        j                  |j                   d      j                  | j                         t        j                  |j                   d	      j                  | j                               }|3| j#                  | j                  |      }|||j%                  dd      z   }d}|s | j&                  dkD  r| j)                  d
      }t+        |	|
||| j&                  d|| j                         }t        j,                  d||      }| j/                  |      }|ddd|ddf   }| j1                  |      }||f}|r||fz   }|S )rK  Nrf   r!   rK   rg   r(   r   rw       _r   TrO  rS  )r=   r   r   r   r   rF   r3   rT   r)   r   rk   r   r1  rW  rX  r   rW   rI  swapaxesr   rY  r   r   r$  r   )r   r   rd   rZ  rG  r\  r   r   rD  rB  rA  r]  rQ  r`  ra  rb  s                   r/   r   z!FlaxLongT5LocalAttention.__call__  s    "/!4!4Ra!8
J vvm,.>.FTVVM*DFFScLd
0@0Htvvm,dffUeNf ((6&&z2
((6 *,QO'
DNNK
),QO +:!STU
,\aWXY 	!3!3B!788%6~t~~VN !WW^^"--s3::4::F--u5<<TZZHN   66t~~~VM) -0G0G10M M !3--	2K 5#"'**	
 jj!8,U ''4!![j[!"34 ff[)./Gr1   rc  NNNFT)r   r   r   r"   r   r   rL  r)   r   rW   r   rf  r  r   r  r   r$  r   rz   r{   rI  r   r   r1   r/   ri  ri  [  s    (--{{E399"-^ !- !-F "hQ	s 	HRZZDX 	]_]g]g 	 Ur1   ri  c                   0   e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
edd       ZdefdZd	ej                   d
ej                   dej                   fdZd Zd Zded	eej                      dej                   fdZ	 	 	 	 	 ddZy)"FlaxLongT5TransientGlobalAttentionr   Fr   rW   c                    | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                   j                  | _        | j                  dz   | _	        | j                   j                  | _
        | j                   j                  | _        | j                  | j
                  z  | _        | j                   j                  | j                  | j
                  z  dz  z  }| j                   j                  | j                  dz  z  }| j                   j                  | j                  dz  z  }t        j                   | j                  dt"        j                  j$                  j'                  |      | j(                        | _        t        j                   | j                  dt"        j                  j$                  j'                  |      | j(                        | _        t        j                   | j                  dt"        j                  j$                  j'                  |      | j(                        | _        t        j                   | j                  dt"        j                  j$                  j'                  |      | j(                        | _        | j2                  rXt        j4                  | j                  | j                  t"        j                  j$                  j'                  |            | _        | j2                  rXt        j4                  | j                  | j                  t"        j                  j$                  j'                  |            | _        t;        | j                   j                  | j                   j<                  | j(                        | _        y )Nr!   r   Fr   rk  r   ) r   r   r   r   r   r   r   r   rl  r3   rl   r   r   r   r   r   r   r   r   r   rW   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normr   s       r/   r   z(FlaxLongT5TransientGlobalAttention.setup3  s   .2kk.X.X+/3{{/Z/Z,{{**"&++"2"2{{,, KK44**Q.!%!>!>{{//(?(??[[33I`I`8`ei7ij
kk448LM[[33t~~t7KL
NN++22:>**	
 NN++22;?**	
 NN++22;?**	
 LL++22:>**	
 +++-8833"vv2299+F,D( ++24((33"vv2299+F3D/
 (;KKT[[%C%C4::(
$r1   c                    d}|r&|dz  }|| dkD  |z  z  }t        j                  |       } nt        j                  | d       } |dz  }| |k  }|t        j                  | |z        t        j                  ||z        z  ||z
  z  z   }t        j                  ||dz
        }|t        j                  || |      z  }|j                  d      S r   r  r  s           r/   r  z<FlaxLongT5TransientGlobalAttention._relative_position_bucketn  rn  r1   ro  c                    t        j                  d|z  d      }|||  }|dddf   |dddf   z
  }| j                  |d| j                  | j                        }| j                  |      }|j                  d      ddddddddf   }|S rq  rr  rs  s          r/   r  z/FlaxLongT5TransientGlobalAttention.compute_bias  rt  r1   rd   r   r&   c                 V   t        j                  |d   |d d d d d f         d d d df   }t        j                  j	                  |dkD  t        j
                  |j                  d      j                  | j                        t        j
                  |j                  d      j                  | j                              }t        || j                        }| j                  |d| j                  | j                        }| j                  |      }t        j                  |d      }||z   }|S )	Nrh   .r   rw   r  Tr  )r   r   r!   rf   )r)   equalr   r1  rW  rX  r=   r   rW   r   rl   r  r   r   r  r  )r   rd   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r/   compute_side_biasz4FlaxLongT5TransientGlobalAttention.compute_side_bias  s   !iiy(ACUVWY]_`V`Cabcdfjlocop!ggnn!#HH(..4;;DJJGHH(..6==djjI
 "B.RVRhRh!i(,(F(F";;==	 )G )
% 778UV	 MM)\:	1I=""r1   c                 p    |j                  |j                  d d | j                  | j                  fz         S r  r  r  s     r/   r   z/FlaxLongT5TransientGlobalAttention._split_heads  r!  r1   c                 V    |j                  |j                  d   d| j                        S rw  r#  r  s     r/   r$  z/FlaxLongT5TransientGlobalAttention._merge_heads  rx  r1   r3   c                     | j                   r| j                  |      }|S |t        j                  |      }|S t        j                  dd| j
                  |d|z  f| j                        }|S rz  r{  r|  s       r/   rI  z8FlaxLongT5TransientGlobalAttention._create_position_bias  r}  r1   Nc           
         |j                   dd \  }}t        ||nt        j                  ||f      | j                        \  }	}
|
j                   d   }t        ||	|      }| j                  |      }| j                  |      }|| j                  |      n| j                  |      }|| j                  |      n| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }t        || j                  d      }t        || j                  d      }t        || j                  d      }t        |dd      }t        |dd      }dg|j                  dz   z  }|j                   d   |d<   t        j                  |ddddf   |      }t        j                  |ddddf   |      }t        j                   ||fd      }t        j                   ||fd      }|t        j"                  |j                   d         z  }|t%        || j                        }t&        j(                  j+                  |dkD  t        j,                  |j                   d	      j/                  | j0                        t        j,                  |j                   d
      j/                  | j0                              }nd}|| j3                  | j                  |      }|||j5                  dd      z   }|t        j                  ||f      }| j7                  ||
      }t        || j                  d      }t        j4                  |dd      }t        j                   ||fd      }d}|s | j8                  d	kD  r| j;                  d      }t=        ||||| j8                  d|| j0                        }t        j>                  d||      }| jA                  |      }|ddd|ddf   }| jC                  |      }||f}|r||fz   }|S )rK  Nrf   r(   r!   rK   rg   .r   rw   r  rN  r   TrO  rS  )"r=   r   r)   r   rl   r   r  r   r   r   r   rF   r3   rT   r>   tilerP   r   rk   r   r1  rW  rX  r   rW   rI  r  r  r   rY  r   r   r$  r   )r   r   rd   rZ  rG  r\  r   r   rD  rn   r   _global_seq_lenglobal_inputsrB  rA  r]  side_key_statesside_value_statesrepsr^   side_position_biasrQ  r`  ra  rb  s                            r/   r   z+FlaxLongT5TransientGlobalAttention.__call__  s2    "/!4!4Ra!8
J )E,8Nchh
T^G_>`"")
%	%
 -22261-O\44]C vvm,.>.FTVVM*DFFScLd
0@0Htvvm,dffUeNf ((6&&z2
((6 &&/ FF=1 ++O< --.?@ *,QO'
DNNK
),QO +:!STU
,\aWXY so**Q./""1%Q((?1dC<#@$GHH%6q$|%DdK __j/%BK
6G'HqQ 	!3!3B!788%#<^T^^#\ #&77>>$q(-33S9@@L-33U;BB4::N$  $(   66t~~~VM#/ -0D0M0MaQR0S S %!$:z*B!C!%!7!7HZ![!34F]_!`!$.@!Q!GOO]<N,OVXYM !3--	2K 5#"'**	
 jj!8,U ''4!![j[!"34 ff[)./Gr1   rc  r  )r   r   r   r"   r   r   rL  r)   r   rW   r   rf  r  r   r  rz   r{   r  r   r$  r   rI  r   r   r1   r/   r  r  .  s    (--{{E399"9
v !- !-F "#

 #PRPZPZ #_a_i_i #2hQ	s 	HRZZDX 	]_]g]g 	 }r1   r  c                   |    e Zd ZU dZeed<   dZeed<   ej                  Z
ej                  ed<   d Z	 	 	 	 d
defd	Zy)!FlaxLongT5LayerLocalSelfAttentionz$Local self attention used in encoderr   Fr   rW   c                 L   t        | j                  | j                  | j                        | _        t        | j                  j                  | j                  j                  | j                        | _        t        j                  | j                  j                        | _        y Nr   rW   r   )ri  r   r   rW   LocalSelfAttentionr   r   r   r   r   r   r   r   r   s    r/   r   z'FlaxLongT5LayerLocalSelfAttention.setupU  sp    ":KKT5U5U]a]g]g#
 .KKT[[%C%C4::
 zz$++":":;r1   Nkwargsc                     | j                  |      }| j                  |||||      }|| j                  |d   |      z   }|f|dd  z   }	|	S N)rd   rG  r\  r   r   r   r!   )r   r  r   
r   r   rd   rG  r\  r   r  normed_hidden_statesattention_outputrb  s
             r/   r   z*FlaxLongT5LayerLocalSelfAttention.__call__^  sr      $}=22 )'/' 3 
 &5Ea5HXe(ff "%5ab%99r1   NNFTr   r   r   __doc__r"   r   r   rL  r)   r   rW   r   r   r   r   r1   r/   r  r  N  sJ    .(--{{E399"<  r1   r  c                   |    e Zd ZU dZeed<   dZeed<   ej                  Z
ej                  ed<   d Z	 	 	 	 d
defd	Zy)+FlaxLongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderr   Fr   rW   c                 L   t        | j                  | j                  | j                        | _        t        | j                  j                  | j                  j                  | j                        | _        t        j                  | j                  j                        | _        y r  )r  r   r   rW   TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   s    r/   r   z1FlaxLongT5LayerTransientGlobalSelfAttention.setup{  sp    ,NKKT5U5U]a]g]g-
) .KKT[[%C%C4::
 zz$++":":;r1   Nr  c                     | j                  |      }| j                  |||||      }|| j                  |d   |      z   }|f|dd  z   }	|	S r  )r   r  r   r  s
             r/   r   z4FlaxLongT5LayerTransientGlobalSelfAttention.__call__  sr      $}=<< )'/' = 
 &5Ea5HXe(ff "%5ab%99r1   r  r  r   r1   r/   r  r  t  sJ    9(--{{E399"<  r1   r  c                   t    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 	 ddZy)	FlaxLongT5LayerSelfAttentionr   Fr   rW   c                 v   t        | j                  | j                  | j                  j                  | j                        | _        t        | j                  j                  | j                  j                  | j                        | _	        t        j                  | j                  j                        | _        y )Nr   r   rW   r   )r   r   r   r   rW   SelfAttentionr   r   r   r   r   r   r   r   r   s    r/   r   z"FlaxLongT5LayerSelfAttention.setup  s|    0KK(,(H(H;;%%**	
 .KKT[[%C%C4::
 zz$++":":;r1   Nc                     | j                  |      }| j                  ||||||      }|| j                  |d   |      z   }|f|dd  z   }	|	S )Nrd   rG  r\  r   rC  r   r   r!   )r   r  r   )
r   r   rd   rG  r\  r   rC  r  r  rb  s
             r/   r   z%FlaxLongT5LayerSelfAttention.__call__  su      $}=-- )'/'! . 
 &5Ea5HXe(ff "%5ab%99r1   )NNFTFr   r   r   r"   r   r   rL  r)   r   rW   r   r   r   r1   r/   r  r    s@    (--{{E399"
< r1   r  c                   d    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 ddZ	y)FlaxLongT5LayerCrossAttentionr   rW   c                 :   t        | j                  dd| j                        | _        t	        | j                  j
                  | j                  j                  | j                        | _        t        j                  | j                  j                        | _        y )NFr  r   )r   r   rW   EncDecAttentionr   r   r   r   r   r   r   r   r   s    r/   r   z#FlaxLongT5LayerCrossAttention.setup  sl    2KKU5PTPZPZ 
 .KKT[[%C%C4::
 zz$++":":;r1   Nc                     | j                  |      }| j                  |||||      }|| j                  |d   |      z   }|f|dd  z   }	|	S )N)rd   rZ  rG  r\  r   r   r!   )r   r  r   )
r   r   rZ  rd   rG  r\  r   r  r  rb  s
             r/   r   z&FlaxLongT5LayerCrossAttention.__call__  sr      $}=// )-'/ 0 
 &5Ea5HXe(ff "%5ab%99r1   r  r   r   r1   r/   r  r    s2    {{E399"< r1   r  c                   |    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 	 	 	 	 	 ddZy)	FlaxLongT5Blockr   Fr   rW   c                    | j                   j                  | _        | j                  rt        }nc| j                   j                  dk(  rt        }nC| j                   j                  dk(  rt
        }n#t        d| j                   j                   d       || j                   | j                  t        d      | j                        f| _
        d}| j                  rD| xj                  t        | j                   t        d      | j                        fz  c_
        |dz  }| xj                  t        | j                   t        |      | j                        fz  c_
        y )	Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r   )r   namerW   r!   )r  rW   )r   r   r  encoder_attention_typer  r  
ValueErrorr   strrW   layerr  r   )r   attention_layerfeed_forward_indexs      r/   r   zFlaxLongT5Block.setup  s   kk((;;:O[[//7:?O[[//3EEIO;;==>aA 
 ,0,L,LVjj	

 ;;JJ83q6Y]YcYcdffJ!#

(3?Q;RZ^ZdZdegg
r1   Nc                     | j                   d   |||||	|
      }|d   }|dd  }| j                  xr |d u}|r( | j                   d   ||||||	      }|d   }||dd  z   } | j                   d   ||	      }|f}||z   }|S )Nr   r  r!   )rZ  rd   rG  r\  r   r(   r   )r  r   )r   r   rd   rG  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr\  return_dictr   rC  self_attention_outputsattention_outputsdo_cross_attentioncross_attention_outputsrb  s                   r/   r   zFlaxLongT5Block.__call__	  s     "/A)'/'!"
 /q12126![[N-B$-N&3djjm!65;"3+'# 4A6M !24KAB4O O '

2}MR "-- r1   )	NNNNNFTTFr  r   r1   r/   r  r    sN    (--{{E399"h@ "#&*0r1   r  c                   v    e Zd ZU eed<   eed<   ej                  Zej                  ed<   d Z		 	 	 	 	 	 	 	 ddZ
y)FlaxLongT5LayerCollectionr   r   rW   c                 f    t        | j                  | j                  | j                        | _        y )Nr  )r  r   r   rW   r  r   s    r/   r   zFlaxLongT5LayerCollection.setupB  s&    $KKT5U5U]a]g]g

r1   Nc
                 6    | j                  |||||||||		      S )N)rd   rG  r  r  r  r\  r   rC  )r  )
r   r   rd   rG  r  r  r  r\  r   rC  s
             r/   r   z"FlaxLongT5LayerCollection.__call__G  s5     zz)'"7#9*G/'!  

 
	
r1   )NNNNNFTF)r   r   r   r"   r   rL  r)   r   rW   r   r   r   r1   r/   r  r  =  sD    !%%{{E399"
 "#&*
r1   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
class FlaxLongT5BlockCollection(nn.Module):
    config: LongT5Config
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    gradient_checkpointing: bool = False

    def setup(self):
        self.causal = self.config.causal
        if self.gradient_checkpointing:
            FlaxLongT5CheckpointLayer = remat(FlaxLongT5LayerCollection, static_argnums=(6, 7, 8))
            self.blocks = [
                FlaxLongT5CheckpointLayer(
                    self.config,
                    has_relative_attention_bias=(i == 0),
                    dtype=self.dtype,
                    name=str(i),
                )
                for i in range(self.config.num_layers)
            ]
        else:
            self.blocks = [
                FlaxLongT5LayerCollection(
                    self.config,
                    has_relative_attention_bias=(i == 0),
                    dtype=self.dtype,
                    name=str(i),
                )
                for i in range(self.config.num_layers)
            ]

    def __call__(
        self,
        hidden_states=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.causal) else None
        position_bias = None
        encoder_decoder_position_bias = None

        for i, layer_module in enumerate(self.blocks):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                position_bias,
                encoder_hidden_states,
                encoder_attention_mask,
                encoder_decoder_position_bias,
                output_attentions,
                deterministic,
                init_cache,
            )

            hidden_states = layer_outputs[0]

            # We share the position biases between the layers - the first layer stores them.
            position_bias = layer_outputs[1]

            if self.causal and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)
                if self.causal:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[4],)

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )


class FlaxLongT5Stack(nn.Module):
    config: LongT5Config
    embed_tokens: nn.Embed
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    gradient_checkpointing: bool = False

    def setup(self):
        self.causal = self.config.causal

        self.block = FlaxLongT5BlockCollection(
            self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
        )
        self.final_layer_norm = FlaxLongT5LayerNorm(
            self.config.d_model, eps=self.config.layer_norm_epsilon, dtype=self.dtype
        )
        self.dropout = nn.Dropout(self.config.dropout_rate)

    def __call__(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
        init_cache: bool = False,
    ):
        hidden_states = self.embed_tokens(input_ids)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)

        outputs = self.block(
            hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        hidden_states = outputs[0]

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states, deterministic=deterministic)

        # Add the final hidden state to the stack of all hidden states, if requested.
        all_hidden_states = None

        if output_hidden_states:
            all_hidden_states = outputs.hidden_states
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            if output_hidden_states:
                return (
                    hidden_states,
                    all_hidden_states,
                ) + outputs[2:]
            return (hidden_states,) + outputs[1:]

        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
r1   r  a  
    Args:
        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            To learn more about how to prepare `input_ids` for pretraining, take a look at [LONGT5
            Training](./longt5#training).
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
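
# Illustrative sketch (not part of the upstream LongT5 API) of the encoder/decoder split that the
# docstrings above and below document: `encode` runs the encoder once, and its output can be reused
# across several `decode` calls. Assumes `model` is a FlaxLongT5ForConditionalGeneration, whose
# `decode` output carries `.logits`; the helper name and arguments are assumptions for illustration.
def _example_encode_once_decode_twice(model, input_ids, decoder_input_ids_a, decoder_input_ids_b):
    # Run the (potentially very long) input through the encoder a single time.
    encoder_outputs = model.encode(input_ids)
    # Reuse the precomputed encoder states for two different decoder prefixes.
    logits_a = model.decode(decoder_input_ids_a, encoder_outputs).logits
    logits_b = model.decode(decoder_input_ids_b, encoder_outputs).logits
    return logits_a, logits_b
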
LONGT5_DECODE_INPUTS_DOCSTRING = r"""
    Args:
        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            For training, `decoder_input_ids` should be provided.
        encoder_outputs (`tuple(tuple(jnp.ndarray))`):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should modify it to your needs. See diagram 1 in [the
            paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
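
# Illustrative sketch (not part of the upstream LongT5 API) of the fast auto-regressive decoding
# pattern that the `past_key_values` argument documented above enables. Assumes `model` is a
# FlaxLongT5ForConditionalGeneration and the cache was created with `model.init_cache(...)`;
# the helper name and arguments are assumptions for illustration.
def _example_cached_decode_step(model, token_ids, encoder_outputs, past_key_values, decoder_attention_mask):
    outputs = model.decode(
        token_ids,  # only the newest decoder token(s), of shape (batch_size, 1)
        encoder_outputs,
        decoder_attention_mask=decoder_attention_mask,
        past_key_values=past_key_values,
    )
    # The updated cache comes back on the output and must be threaded into the next step.
    return outputs.logits, outputs.past_key_values
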
LONGT5_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To learn more about how to prepare `input_ids` for pretraining, take a look at [LONGT5
            Training](./longt5#training).
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To learn more about how to prepare `decoder_input_ids` for pretraining, take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        encoder_outputs (`tuple(tuple(jnp.ndarray))`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(jnp.ndarray))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
class FlaxLongT5PreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = LongT5Config
    base_model_prefix = "transformer"
    module_class: nn.Module = None

    def __init__(
        self,
        config: LongT5Config,
        input_shape: Tuple[int] = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def enable_gradient_checkpointing(self):
        self._module = self.module_class(
            config=self.config,
            dtype=self.dtype,
            gradient_checkpointing=True,
        )

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # init input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")

        attention_mask = jnp.ones_like(input_ids)
        decoder_input_ids = jnp.ones_like(input_ids)
        decoder_attention_mask = jnp.ones_like(input_ids)

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        random_params = self.module.init(
            rngs,
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
        )["params"]

        if params is not None:
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            return random_params

    @add_start_docstrings_to_model_forward(LONGT5_INPUTS_DOCSTRING)
    def __call__(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        decoder_input_ids: jnp.ndarray = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if decoder_input_ids is None:
            raise ValueError(
                "Make sure to provide both `input_ids` and `decoder_input_ids`. `decoder_input_ids` is not passed"
                " here."
            )

        # prepare encoder inputs
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # prepare decoder inputs
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones_like(decoder_input_ids)

        # Handle any PRNG if needed
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
        )

    def init_cache(self, batch_size, max_length, encoder_outputs):
        r"""
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*,
                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
                cross-attention of the decoder.
        """
        # init input variables to retrieve cache
        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        decoder_attention_mask = jnp.ones_like(decoder_input_ids)

        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, **kwargs):
            decoder_module = module._get_decoder_module()
            return decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                **kwargs,
            )

        init_variables = self.module.init(
            jax.random.PRNGKey(0),
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            init_cache=True,
            method=_decoder_forward,  # we only need to call the decoder to init the cache
        )
        return unfreeze(init_variables["cache"])

    @add_start_docstrings(LONGT5_ENCODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=LongT5Config)
    def encode(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
        >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer(text, return_tensors="np")
        >>> encoder_outputs = model.encode(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _encoder_forward(module, input_ids, attention_mask, **kwargs):
            encode_module = module._get_encoder_module()
            return encode_module(input_ids, attention_mask, **kwargs)

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            method=_encoder_forward,
        )

    @add_start_docstrings(LONGT5_DECODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=LongT5Config
    )
    def decode(
        self,
        decoder_input_ids,
        encoder_outputs,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration
        >>> import jax.numpy as jnp

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
        >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer(text, return_tensors="np")
        >>> encoder_outputs = model.encode(**inputs)

        >>> decoder_start_token_id = model.config.decoder_start_token_id
        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
        >>> logits = outputs.logits
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        encoder_hidden_states = encoder_outputs[0]
        if encoder_attention_mask is None:
            batch_size, sequence_length = encoder_hidden_states.shape[:2]
            encoder_attention_mask = jnp.ones((batch_size, sequence_length))

        batch_size, sequence_length = decoder_input_ids.shape
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones((batch_size, sequence_length))

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        inputs = {"params": params or self.params}

        # If past_key_values are passed, the cache is already initialized and the private flag init_cache
        # has to be passed down to make sure the cache is used. The cache must also be marked as mutable
        # so that it can be changed by the attention modules.
        if past_key_values:
            inputs["cache"] = past_key_values
            mutable = ["cache"]
        else:
            mutable = False

        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, **kwargs):
            decoder_module = module._get_decoder_module()
            return decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                **kwargs,
            )

        outputs = self.module.apply(
            inputs,
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            mutable=mutable,
            method=_decoder_forward,
        )

        # add updated cache to model output
        if past_key_values is not None and return_dict:
            outputs, past = outputs
            outputs["past_key_values"] = unfreeze(past["cache"])
            return outputs
        elif past_key_values is not None and not return_dict:
            outputs, past = outputs
            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]

        return outputs

LONGT5_START_DOCSTRING = r"""
    The LongT5 model was proposed in [LongT5: Efficient Text-To-Text Transformer for Long
    Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo
    Ni, Yun-Hsuan Sung and Yinfei Yang. It's an encoder-decoder transformer pre-trained in a text-to-text denoising
    generative setting. The LongT5 model is an extension of the T5 model, and it enables using one of two
    efficient attention mechanisms: (1) Local attention, or (2) Transient-Global attention.

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
    etc.).

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`LongT5Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified, all the computation will be performed with the given `dtype`.
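
            For example, a half-precision setup might look like this (an illustrative sketch, assuming a
            GPU host):

            ```python
            >>> import jax.numpy as jnp
            >>> from transformers import FlaxLongT5Model

            >>> model = FlaxLongT5Model.from_pretrained("google/long-t5-local-base", dtype=jnp.float16)
            ```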

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""


@add_start_docstrings(
    "The bare LONGT5 Model transformer outputting raw hidden-states without any specific head on top.",
    LONGT5_START_DOCSTRING,
)
class FlaxLongT5Module(nn.Module):
    config: LongT5Config
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    gradient_checkpointing: bool = False

    def _get_encoder_module(self):
        return self.encoder

    def _get_decoder_module(self):
        return self.decoder

    def setup(self):
        self.shared = nn.Embed(
            self.config.vocab_size,
            self.config.d_model,
            embedding_init=jax.nn.initializers.normal(self.config.initializer_factor * 1.0),
            dtype=self.dtype,
        )

        encoder_config = copy.deepcopy(self.config)
        encoder_config.causal = False
        self.encoder = FlaxLongT5Stack(
            encoder_config,
            embed_tokens=self.shared,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )

        decoder_config = copy.deepcopy(self.config)
        decoder_config.causal = True
        decoder_config.num_layers = self.config.num_decoder_layers
        self.decoder = FlaxLongT5Stack(
            decoder_config,
            embed_tokens=self.shared,
            dtype=self.dtype,
            gradient_checkpointing=self.gradient_checkpointing,
        )

    def __call__(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        deterministic: bool = True,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return FlaxSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


class FlaxLongT5Model(FlaxLongT5PreTrainedModel):
    module_class = FlaxLongT5Module


append_call_sample_docstring(FlaxLongT5Model, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)

FLAX_LONGT5_MODEL_DOCSTRING = """
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxLongT5Model

    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
    >>> model = FlaxLongT5Model.from_pretrained("google/long-t5-local-base")

    >>> input_ids = tokenizer(
    ...     "Studies have been shown that owning a dog is good for you", return_tensors="np"
    ... ).input_ids
    >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="np").input_ids

    >>> # forward pass
    >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
    >>> last_hidden_states = outputs.last_hidden_state
    ```
r"  z4LONGT5 Model with a `language modeling` head on top.c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
d Zd Z	 	 	 	 	 	 	 	 	 dd	e	fd
Zy)(FlaxLongT5ForConditionalGenerationModuler   rW   Fr  c                     | j                   S r  r?  r   s    r/   r'  z<FlaxLongT5ForConditionalGenerationModule._get_encoder_modulec  rA  r1   c                     | j                   S r  rC  r   s    r/   r  z<FlaxLongT5ForConditionalGenerationModule._get_decoder_modulef  rA  r1   c                    | j                   j                  | _        t        j                  | j                   j
                  | j                   j                  t        j                  j                  j                  | j                   j                        | j                        | _        t        j                  | j                         }d|_        d|_        d|_        t#        || j                  | j                  | j$                        | _        t        j                  | j                         }d|_        d|_        | j                   j(                  |_        t#        || j                  | j                  | j$                        | _        t        j.                  | j                   j
                  dt        j                  j                  j                  | j                   j                        | j                        | _        y )Nr   Fr  Tr   )r   r   	model_dimr   r   rF  r   r   r   r   rW   rG  rH  rI  r   r[  is_encoder_decoderr  r  r@  rJ  r  rD  r   lm_headrK  s      r/   r   z.FlaxLongT5ForConditionalGenerationModule.setupi  s[   ,,hhKK""KK66..55dkk6T6TU**	
 t{{3 %#( ,1)&DKKtzzRVRmRm
 t{{3 $,1)$(KK$B$B!&DKKtzzRVRmRm
 xxKK""++224;;3Q3QR**	
r1   Nr   c
           
         ||n| j                   j                  }| j                  ||||||	      }|d   }
| j                  |||
|||||	      }|d   }| j                   j                  r|| j
                  dz  z  }| j                   j                  rG| j                  j                  d   d   }| j                  j                  dd|j                  ii|      }n| j                  |      }|s|f|dd  z   |z   S t        ||j                  |j                  |j                  |j                  |j                   |j                  |j                  	      S )
NrO  r   rP  r   r  	embeddingkernelr!   )logitsr+  rQ  rR  r  rS  r  rT  )r   rU  r@  rD  tie_word_embeddingsr`  rG  r?  rb  r  Tr   r+  r   r  r  r  )r   r#   rd   r  r  r   r\  r  r  r   r   rV  sequence_outputshared_embedding	lm_logitss                  r/   r   z1FlaxLongT5ForConditionalGenerationModule.__call__  sp    &1%<k$++B]B] ,,)/!5#' ' 
 (* ,,'1"/#1/!5#' ' 	
 *!,;;** .1EFO;;**#{{44X>{K**HxAQASAS6T+UWfgI_5I</!""55GG"+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r1   rW  rX  r   r1   r/   r\  r\  \  sc     {{E399"#(D(
F #!"?
 ?
r1   r\  c                      e Zd ZeZ ee       eee	      	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     dede
e   de
e   de
e   d	ed
edefd              Z	 	 	 dde
ej$                     de
ej$                     fdZd Zy)"FlaxLongT5ForConditionalGenerationr"  Nr  r  r+  r\  r  r  r  r  rQ  c                 z    ||n j                   j                  }||n j                   j                  }||n j                   j                  }|d   }|)|j                  dd \  }}t        j                  ||f      }|j                  \  }}|t        j                  ||f      }i }|||d<   d|
xs  j                  i}|r	||d<   dg}nd} fd} j                  j                  |t        j                  |d	
      t        j                  |d	
      |t        j                  |d	
      ||||	 |||      }||\  }}n|\  \  }}}|r.t        ||j                  |j                  |j                        }n	|f|dd z   }||rt        d         |d<   |S ||s|dd t        d         fz   |dd z   }|S )aR  
        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration
        >>> import jax.numpy as jnp

        >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
        >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")

        >>> text = "summarize: My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer(text, return_tensors="np")
        >>> encoder_outputs = model.encode(**inputs)

        >>> decoder_start_token_id = model.config.decoder_start_token_id
        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
        >>> logits = outputs.logits
        ```Nr   rf   r   r  r&  Fc                    | j                         } |||fi |}|d   }	j                  j                  r|	j                  j                  dz  z  }	j                  j                  rJ| j                  j
                  d   d   }| j                  j                  dd|j                  ii|      }||fS | j                  |      }||fS )Nr   r   r  rd  re  )	r  r   rg  r   rG  r?  rb  r  rh  )
r   r  r  r  r  rV  ri  rj  rk  r   s
            r/   r  zCFlaxLongT5ForConditionalGeneration.decode.<locals>._decoder_forward	  s    #779N,!& O .a0O{{.. #2T[[5H5H$5N"O{{..#)==#:#:8#D[#Q "NN00(XGWGYGY<Z1[]lm	 o-- #NN?;	o--r1   r  rV   r.  )rf  r   r  r  r!   r+  )r   r\  r  r  r=   r)   r   r  r   r  r   r   r   r  r  r
   )r   r  r   r  r  r+  r\  r  r  r  r  rQ  r  r   r0  r  r1  r/  r  rb  rk  rV  r2  s   `                      r/   r3  z)FlaxLongT5ForConditionalGeneration.decode  s   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY / 2!)*?*E*Ebq*I'J%(XXz?.K%L"&7&=&=#
O!)%(XXz?.K%L" ")DOF1dkk2
 -F7OiGG	.. ++##!ii(9F#&99-C4#P"7#&99-C4#P/!5##)# $ 
 ")0&I18.(Y$; -;;*55!0!A!A	G !l_QR%88G &;)1$w-)@G%&N(bqkXd7m%<$>>LGr1   rd   c                     |j                   \  }}| j                  |||      }	t        j                  ||fd      }
|!t        j
                  j                  |
|d      }
|	|||
dS )Nr  rV   r7   )r+  r   r  r  )r=   rC  r)   r   r   r1  r2  )r   r  r8  rd   r  r   r  r   rD  r+  extended_attention_masks              r/   prepare_inputs_for_generationz@FlaxLongT5ForConditionalGeneration.prepare_inputs_for_generationR	  sy     "3!8!8
J//*j/R #&((J
+C4"P!-&)gg&B&B')?'#
  /.&4&=	
 	
r1   c                 $    |j                   |d<   |S )Nr+  )r+  )r   model_outputsmodel_kwargss      r/   update_inputs_for_generationz?FlaxLongT5ForConditionalGeneration.update_inputs_for_generationo	  s    *7*G*G&'r1   r4  )NNN)r   r   r   r\  r  r   r:  r    r   r"   r   r)   r{   r8  rL  r   r3  r   Arrayrr  rv  r   r1   r/   rm  rm    s   ;L89+P_kl
 9=8< $,0/3&*# !) 5	
 !) 5  $D> 'tn d^    m :J /36:
 !+	

 !) 3
:r1   rm  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxLongT5ForConditionalGeneration

    >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
    >>> model = FlaxLongT5ForConditionalGeneration.from_pretrained("google/long-t5-local-base")

    >>> ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors="np")

    >>> # Generate Summary
    >>> summary_ids = model.generate(inputs["input_ids"]).sequences
    >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
    ```
r,  )_r  rH  typingr   r   r   r   r   
flax.linenlinenr   r   	jax.numpynumpyr)   rz   flax.core.frozen_dictr   r	   r
   r   r   r   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   
jax.randomr   modeling_flax_outputsr   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r   r    configuration_longt5r"   
get_loggerr   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr  r{   r   r0   rA   rF   rT   r]   rc   rk   r   r   r   r6  r   r   r   r   r   ri  r  r  r  r  r  r  r  r  r  r9  r:  r7  r  LONGT5_START_DOCSTRINGr=  rZ  FLAX_LONGT5_MODEL_DOCSTRINGr\  rm  ,FLAX_LONGT5_CONDITIONAL_GENERATION_DOCSTRINGr   r1   r/   <module>r     sS     7 7  
   > > 6 6 > ;    u t . 
		H	%1  	#++ 	S 	Z] 	bebmbm 	  3 3 WZWbWb 	##++ 	## 	#S 	#S[[ 	#<S[[ <c <# <Z] <fifqfq <,!# !#++ !@RZZ @C @TWT_T_ @.bjj .S .S[[ . '0 '0PS '0X]^a^i^ikmkuku^uXv '0T"RZZ "TW "\^\f\f "NRZZ NBJJ N`c Nhjhrhr N+")) +*bii @#299 #N		 .Y")) YxPryy Pf] ]@	#		 #L#")) #N%299 %R BII  FRbii Rl 
		  
HS
		 S
nD
bii D
N" 8'" T9 xo 3 od	& R e
V
ryy V

V
t$/ $ _.ACY[j k . *AD_*_ `  >Q`o p PRhik
ryy k
 jk
\e)B eP0 ,* &(?Bn(n !&4GVer1   