
    sg                        d dl mZmZmZ d dlmZ d dlZd dlm	Z
 d dl	Zd dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZmZm Z m!Z!m"Z"m#Z# d	dl$m%Z%m&Z&m'Z'm(Z( d	dl)m*Z*m+Z+m,Z, ddl-m.Z.  e,j^                  e0      Z1dZ2dZ3ejh                  Z4d Z5dZ6dZ7 G d dejp                        Z9 G d dejp                        Z: G d dejp                        Z; G d dejp                        Z< G d dejp                        Z= G d dejp                        Z> G d  d!ejp                        Z? G d" d#ejp                        Z@ G d$ d%ejp                        ZA G d& d'ejp                        ZB G d( d)ejp                        ZC G d* d+ejp                        ZD G d, d-e&      ZE G d. d/ejp                        ZF e*d0e6       G d1 d2eE             ZG e'eGe2ee3        G d3 d4ejp                        ZH e*d5e6       G d6 d7eE             ZI e'eIe2ee3d89        G d: d;ejp                        ZJ e*d<e6       G d= d>eE             ZK e'eKe2e"e3        G d? d@ejp                        ZL e*dAe6       G dB dCeE             ZM e(eMe7j                  dD              e'eMe2e e3        G dE dFejp                        ZO e*dGe6       G dH dIeE             ZP e'ePe2e#e3        G dJ dKejp                        ZQ e*dLe6       G dM dNeE             ZR e'eRe2e!e3        G dO dPejp                        ZS e*dQe6       G dR dSeE             ZT e'eTe2ee3       y)T    )CallableOptionalTupleN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )	-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )RobertaConfigzFacebookAI/roberta-baser"   c                    | |k7  j                  d      }|j                  dkD  re|j                  d|j                  d   f      }t	        j
                  |d      j                  d      |z  }|j                  | j                        }n)t	        j
                  |d      j                  d      |z  }|j                  d      |z   S )a!  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: jnp.ndarray
        padding_idx: int

    Returns: jnp.ndarray
    i4   r!   axis)astypendimreshapeshapejnpcumsum)	input_idspadding_idxmaskincremental_indicess       d/var/www/html/venv/lib/python3.12/site-packages/transformers/models/roberta/modeling_flax_roberta.py"create_position_ids_from_input_idsr4   4   s     $,,T2Dyy1}||RB01!jjA6==dCdJ199)//J!jjA6==dCdJ%%d+k99    a   

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`RobertaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxRobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _
        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                   | j                        | _        t        j"                  | j                  j$                        | _        y )N)stddev)embedding_initr9   epsilonr9   rate)nnEmbedr8   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger9   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfs    r3   setupzFlaxRobertaEmbeddings.setup   sJ   !xxKK""KK##66..55T[[=Z=Z5[**	 
 $&88KK//KK##66..55T[[=Z=Z5[**	$
  &(XXKK''KK##66..55T[[=Z=Z5[**	&
" dkk.H.HPTPZPZ[zzt{{'F'FGr5   deterministicc                    | j                  |j                  d            }| j                  |j                  d            }| j                  |j                  d            }||z   |z   }	| j	                  |	      }	| j                  |	|      }	|	S )Nr$   rV   )rI   r)   rK   rM   rN   rR   )
rT   r/   token_type_idsposition_idsattention_maskrV   inputs_embedsposition_embedsrM   hidden_statess
             r3   __call__zFlaxRobertaEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &(==O }5]-Pr5   NT)__name__
__module____qualname____doc__r"   __annotations__r-   float32r9   rU   boolr_    r5   r3   r7   r7      s0    Q{{E399"H,_c r5   r7   c                       e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
d Zd Zej                  d        Z	 	 	 	 dd
eej"                     dedefdZy	)FlaxRobertaSelfAttentionr8   Fcausalr9   c                 6   | j                   j                  | j                   j                  z  | _        | j                   j                  | j                   j                  z  dk7  rt	        d      t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        | j                  r>t!        t#        j$                  d| j                   j&                  fd      d      | _        y y )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads}r9   kernel_initr!   rg   r9   )r8   rD   num_attention_headshead_dim
ValueErrorrA   Denser9   rE   rF   rG   rH   querykeyvaluerk   r
   r-   onesrJ   causal_maskrS   s    r3   rU   zFlaxRobertaSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r5   c                     |j                  |j                  d d | j                  j                  | j                  fz         S Nr%   )r+   r,   r8   rp   rq   rT   r^   s     r3   _split_headsz%FlaxRobertaSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr5   c                 n    |j                  |j                  d d | j                  j                  fz         S rz   )r+   r,   r8   rD   r{   s     r3   _merge_headsz%FlaxRobertaSelfAttention._merge_heads   s2    $$]%8%8!%<@W@W?Y%YZZr5   c                 (   | j                  dd      }| j                  ddt        j                  |j                  |j
                        }| j                  ddt        j                  |j                  |j
                        }| j                  ddd       }|r|j                  j                  ^ }	}
}}|j                  }dt        |	      z  |ddfz   }t        j                  |j                  ||      }t        j                  |j                  ||      }||_        ||_        |j                  d   }|j                  |z   |_        t        j                  t        j                  |
      ||z   k  t        |	      d||
fz         }t        ||      }|||fS )	a[  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slighly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                  L    t        j                  dt         j                        S )Nr   ro   )r-   arrayint32rh   r5   r3   <lambda>z@FlaxRobertaSelfAttention._concatenate_to_cache.<locals>.<lambda>   s    CIIaWZW`W`Da r5   )r   r   r!   )has_variablevariabler-   zerosr,   r9   rv   lenr   dynamic_update_slicebroadcast_toarangetupler	   )rT   ru   rv   rt   r[   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r3   _concatenate_to_cachez.FlaxRobertaSelfAttention._concatenate_to_cache   sr    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;S'JC,,\-?-?PE"J!&L(-A% + 1 14M MK''

:&5N)NNj!Q(A:$NNH +8^DNE>))r5   Nkey_value_states
init_cacheoutput_attentionsc                 4   |d u}|j                   d   }	| j                  |      }
|r#| j                  |      }| j                  |      }n"| j                  |      }| j                  |      }| j	                  |
      }
| j	                  |      }| j	                  |      }| j
                  r|
j                   d   |j                   d   }}| j                  dd      r[| j                  d   d   }| j                  d   d   j                   d   }t        j                  | j                  dd|dfdd||f      }n| j                  d d d d d |d |f   }t        j                  ||	f|j                   dd  z         }|N| j
                  rBt        j                  t        j                  |d      j                         }t        ||      }n(| j
                  r}n|t        j                  |d      }| j
                  r,| j                  dd      s|r| j                  |||
|      \  }}}|t        j                   |dkD  t        j"                  |j                   d      j%                  | j&                        t        j"                  |j                   t        j(                  | j&                        j*                        j%                  | j&                              }nd }d }|s*| j,                  j.                  dkD  r| j1                  d	      }t3        |
|||| j,                  j.                  d
|| j&                  d 	      }|t        j4                  d||      }t        j4                  d||      }|j7                  |j                   d d dz         }|r||f}|S |f}|S )Nr   r!   r   r   r   )r'   g        rR   T)biasdropout_rngdropout_ratebroadcast_dropoutrV   r9   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdr%   )r&   )r,   rt   ru   rv   r|   rk   r   	variablesr   dynamic_slicerx   r-   r   expand_dimsr	   r   selectfullr)   r9   finfominr8   attention_probs_dropout_probmake_rngr   einsumr+   )rT   r^   r[   layer_head_maskr   r   rV   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrx   attention_biasr   attn_weightsattn_outputoutputss                          r3   r_   z!FlaxRobertaSelfAttention.__call__   sk    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*L  ,7!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|D
7;7Q7QL,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr5   NFTF)ra   rb   rc   r"   re   rk   rg   r-   rf   r9   rU   r|   r~   rA   compactr   r   ndarrayr_   rh   r5   r3   rj   rj      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _r5   rj   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxRobertaSelfOutputr8   r9   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                  | j                        | _
        t        j                  | j                  j                        | _        y )Nrn   r9   r=   r?   )rA   rs   r8   rD   rE   rF   rG   rH   r9   denserN   rO   rP   rQ   rR   rS   s    r3   rU   zFlaxRobertaSelfOutput.setupf  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr5   rV   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S NrX   r   rR   rN   )rT   r^   input_tensorrV   s       r3   r_   zFlaxRobertaSelfOutput.__call__o  s;    

=1]-P}|'CDr5   Nr`   ra   rb   rc   r"   re   r-   rf   r9   rU   rg   r_   rh   r5   r3   r   r   b  s,    {{E399"H4 r5   r   c                   x    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 d	defdZy)
FlaxRobertaAttentionr8   Frk   r9   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y )Nrk   r9   ro   )rj   r8   rk   r9   rT   r   outputrS   s    r3   rU   zFlaxRobertaAttention.setup|  s7    ,T[[TXT^T^_	+DKKtzzJr5   Nr   c           	          | j                  |||||||      }|d   }	| j                  |	||      }|f}
|r	|
|d   fz  }
|
S )N)r   r   r   rV   r   r   rX   r!   )rT   r   )rT   r^   r[   r   r   r   rV   r   attn_outputsr   r   s              r3   r_   zFlaxRobertaAttention.__call__  sl     yy+-!'/ ! 
 #1oKm\ "Q))Gr5   r   )ra   rb   rc   r"   re   rk   rg   r-   rf   r9   rU   r_   rh   r5   r3   r   r   w  sG    FD{{E399"K "'  r5   r   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxRobertaIntermediater8   r9   c                 4   t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        | j                  j                     | _        y Nr   )rA   rs   r8   intermediate_sizerE   rF   rG   rH   r9   r   r   
hidden_act
activationrS   s    r3   rU   zFlaxRobertaIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r5   c                 J    | j                  |      }| j                  |      }|S N)r   r   r{   s     r3   r_   z FlaxRobertaIntermediate.__call__  s$    

=16r5   N
ra   rb   rc   r"   re   r-   rf   r9   rU   r_   rh   r5   r3   r   r     s$    {{E399"9r5   r   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxRobertaOutputr8   r9   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                        | _        t        j                  | j                  j                  | j                        | _        y )Nr   r?   r=   )rA   rs   r8   rD   rE   rF   rG   rH   r9   r   rP   rQ   rR   rN   rO   rS   s    r3   rU   zFlaxRobertaOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r5   rV   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S r   r   )rT   r^   attention_outputrV   s       r3   r_   zFlaxRobertaOutput.__call__  s<    

=1]-P}7G'GHr5   Nr`   r   rh   r5   r3   r   r     s,    {{E399"\t r5   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 dde	ej                     de	ej                     deded	ef
d
Zy)FlaxRobertaLayerr8   r9   c                    t        | j                  | j                  j                  | j                        | _        t        | j                  | j                        | _        t        | j                  | j                        | _        | j                  j                  r(t        | j                  d| j                        | _
        y y )Nr   ro   F)r   r8   
is_decoderr9   	attentionr   intermediater   r   add_cross_attentioncrossattentionrS   s    r3   rU   zFlaxRobertaLayer.setup  s    -dkk$++BXBX`d`j`jk3DKKtzzR'4::F;;**"6t{{5X\XbXb"cD +r5   Nencoder_hidden_statesencoder_attention_maskr   rV   r   c	                     | j                  ||||||      }	|	d   }
|| j                  |
|||||      }|d   }
| j                  |
      }| j                  ||
|      }|f}|r||	d   fz  }|	|d   fz  }|S )N)r   r   rV   r   r   )r[   r   r   rV   r   rX   r!   )r   r   r   r   )rT   r^   r[   r   r   r   r   rV   r   attention_outputsr   cross_attention_outputsr   s                r3   r_   zFlaxRobertaLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;M3CS`a ")!,..G$03A688r5   )NNFTF)ra   rb   rc   r"   re   r-   rf   r9   rU   r   r   rg   r_   rh   r5   r3   r   r     sz    {{E399"d 8<8< ""'+
  (4+ !) 5+ + +  +r5   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxRobertaLayerCollectionr8   r9   Fgradient_checkpointingc           	         | j                   rjt        t        d      }t        | j                  j
                        D cg c]*  } || j                  t        |      | j                        , c}| _        y t        | j                  j
                        D cg c]-  }t        | j                  t        |      | j                        / c}| _        y c c}w c c}w )N)         )static_argnums)namer9   )	r   rematr   ranger8   num_hidden_layersstrr9   layers)rT   FlaxRobertaCheckpointLayeris      r3   rU   z FlaxRobertaLayerCollection.setup	  s    &&)./?PY)Z& t{{<<= +4;;SV4::VDK t{{<<= !3q6LDK
s   /C2CNr   r   r   rV   r   output_hidden_statesreturn_dictc                    |rdnd }|	rdnd }|r|dnd }|W|j                   d   t        | j                        k7  r2t        dt        | j                         d|j                   d    d      t	        | j                        D ]@  \  }}|	r||fz  } ||||||   nd |||||      }|d   }|s,||d   fz  }|8||d   fz  }B |	r||fz  }||||f}|
st        d |D              S t        ||||	      S )
Nrh   r   z&The head_mask should be specified for z/ layers, but it is for                         .r!   r%   c              3   &   K   | ]	  }||  y wr   rh   ).0vs     r3   	<genexpr>z6FlaxRobertaLayerCollection.__call__.<locals>.<genexpr>L  s     =qq}=s   )last_hidden_stater^   
attentionscross_attentions)r,   r   r   rr   	enumerater   r   )rT   r^   r[   	head_maskr   r   r   rV   r   r   r   all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   s                     r3   r_   z#FlaxRobertaLayerCollection.__call__  sl     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++. 	@HAu#!m%55!! ) 5	!4%&!	M *!,M =#3"55(4(]1-=,??(+	@.  -!11 "3^EYZ=G===<++%1	
 	
r5   NNFTFFTra   rb   rc   r"   re   r-   rf   r9   r   rg   rU   r   r   r_   rh   r5   r3   r   r     s    {{E399"#(D($ 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
r5   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxRobertaEncoderr8   r9   Fr   c                 f    t        | j                  | j                  | j                        | _        y )Nr9   r   )r   r8   r9   r   r  rS   s    r3   rU   zFlaxRobertaEncoder.setup\  s%    /KK**#'#>#>

r5   Nr   r   r   rV   r   r   r   c                 8    | j                  |||||||||	|

      S )N)r	  r   r   r   rV   r   r   r   )r  )rT   r^   r[   r	  r   r   r   rV   r   r   r   s              r3   r_   zFlaxRobertaEncoder.__call__c  s8     zz"7#9!'/!5#  
 	
r5   r  r  rh   r5   r3   r  r  W  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
r5   r  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxRobertaPoolerr8   r9   c                     t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        y r   )
rA   rs   r8   rD   rE   rF   rG   rH   r9   r   rS   s    r3   rU   zFlaxRobertaPooler.setup  sH    XXKK##++224;;3P3PQ**

r5   c                 `    |d d df   }| j                  |      }t        j                  |      S )Nr   )r   rA   tanh)rT   r^   cls_hidden_states      r3   r_   zFlaxRobertaPooler.__call__  s1    (A.::&67ww'((r5   Nr   rh   r5   r3   r  r    s$    {{E399"
)r5   r  c                       e Zd ZU eed<   ej                  Zej                  ed<   ej                  j                  j                  Zedej                  f   ed<   d ZddZy)	FlaxRobertaLMHeadr8   r9   .	bias_initc                    t        j                  | j                  j                  | j                  t
        j                   j                  j                  | j                  j                              | _	        t        j                  | j                  j                  | j                        | _        t        j                  | j                  j                  | j                  dt
        j                   j                  j                  | j                  j                              | _        | j                  d| j                   | j                  j                  f      | _        y )Nrm   r=   F)r9   use_biasrn   r   )rA   rs   r8   rD   r9   rE   rF   rG   rH   r   rN   rO   
layer_normrC   decoderparamr  r   rS   s    r3   rU   zFlaxRobertaLMHead.setup  s    XXKK##**++224;;3P3PQ


 ,,t{{/I/IQUQ[Q[\xxKK""**++224;;3P3PQ	
 JJvt~~8N8N7PQ	r5   Nc                 @   | j                  |      }t        d   |      }| j                  |      }|+| j                  j	                  dd|j
                  ii|      }n| j                  |      }t        j                  | j                  | j                        }||z  }|S )Ngeluparamskernel)
r   r   r!  r"  applyTr-   asarrayr   r9   )rT   r^   shared_embeddingr   s       r3   r_   zFlaxRobertaLMHead.__call__  s    

=1v}56' LL..8EUEWEW:X/Y[hiM LL7M{{499djj1r5   r   )ra   rb   rc   r"   re   r-   rf   r9   rE   rA   rF   r   r  r   npr   rU   r_   rh   r5   r3   r  r    sL    {{E399"+.66+>+>+D+DIxRZZ(DRr5   r  c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxRobertaClassificationHeadr8   r9   c                    t        j                  | j                  j                  | j                  t
        j                   j                  j                  | j                  j                              | _	        | j                  j                  | j                  j                  n| j                  j                  }t        j                  |      | _        t        j                  | j                  j                  | j                  t
        j                   j                  j                  | j                  j                              | _        y )Nrm   r?   )rA   rs   r8   rD   r9   rE   rF   rG   rH   r   classifier_dropoutrQ   rP   rR   
num_labelsout_projrT   r0  s     r3   rU   z#FlaxRobertaClassificationHead.setup  s    XXKK##**++224;;3P3PQ

 {{--9 KK**00 	
 zz'9:KK""**++224;;3P3PQ
r5   c                     |d d dd d f   }| j                  ||      }| j                  |      }t        j                  |      }| j                  ||      }| j	                  |      }|S )Nr   rX   )rR   r   rA   r  r2  )rT   r^   rV   s      r3   r_   z&FlaxRobertaClassificationHead.__call__  sf    %aAg.]-P

=1.]-Pm4r5   Nr`   r   rh   r5   r3   r.  r.    s$    {{E399"
$r5   r.  c                       e Zd ZU dZeZdZdZej                  e
d<   ddej                  ddfd	ed
ededej                  dedef fdZd Zddej(                  j*                  d
ededefdZd Z eej7                  d            	 	 	 	 	 	 	 	 	 	 	 	 	 ddedej(                  j*                  dedee   dee   dee   defd       Z xZS ) FlaxRobertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    robertaNmodule_class)r!   r!   r   TFr8   input_shapeseedr9   _do_initr   c                 \     | j                   d|||d|}t        	| 	  ||||||       y )Nr8   r9   r   )r9  r:  r9   r;  rh   )r8  super__init__)
rT   r8   r9  r:  r9   r;  r   kwargsmodule	__class__s
            r3   r?  z#FlaxRobertaPreTrainedModel.__init__  sA     #""w&Vlwpvw[tSXcklr5   c                 ^    | j                  | j                  | j                  d      | _        y )NTr=  )r8  r8   r9   _modulerS   s    r3   enable_gradient_checkpointingz8FlaxRobertaPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r5   rngr&  returnc                    t        j                  |d      }t        j                  |      }t        || j                  j
                        }t        j                  |      }t        j                  | j                  j                  | j                  j                  f      }t        j                  j                  |      \  }	}
|	|
d}| j                  j                  rTt        j                  || j                  j                  fz         }|}| j                  j                  ||||||||d	      }n"| j                  j                  ||||||d      }|d   }|dt!        t#        |            }t!        t#        |            }| j$                  D ]
  }||   ||<    t'               | _        t)        t+        |            S |S )Nr$   ro   )r&  rR   F)r   r&  )r-   r   	ones_liker4   r8   pad_token_idrw   r   rp   rE   randomsplitr   rD   rA  initr   r   _missing_keyssetr   r   )rT   rF  r9  r&  r/   rY   rZ   r[   r	  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r3   init_weightsz'FlaxRobertaPreTrainedModel.init_weights  s   IIk6	y19)T[[E]E]^y1HHdkk;;T[[=\=\]^	"%**"2"23"7
K$=;;**$'IIkT[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2iyfk #3 # ,H5(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r5   c                    t        j                  ||fd      }t        j                  |d      }t        j                  t        j                  t        j
                  |      j                  d         |j                        }| j                  j                  t        j                  j                  d      |||dd      }t        |d         S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        r$   ro   r&   r   FT)r   r   r   )r-   rw   rI  r   r   
atleast_2dr,   rA  rM  rE   rK  PRNGKeyr   )rT   r   r   r/   r[   rZ   init_variabless          r3   r   z%FlaxRobertaPreTrainedModel.init_cache  s     HHj*5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9nlX]jn * 
 w/00r5   zbatch_size, sequence_lengthr   trainr   r   r   past_key_valuesc                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        j
                  |      }| t        || j                   j                        }|t	        j                  |      }|?t	        j                  | j                   j                  | j                   j                  f      }i }|	|	|d<   d|xs | j                  i}| j                   j                  r|r	||d<   dg}nd}| j                  j                  |t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      |||
 |||||      }||r|\  }}t#        |d         |d<   |S |"|s |\  }}|d d	 t#        |d         fz   |d	d  z   }|S | j                  j                  |t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      t	        j                   |d      |
 ||||
      }|S )NrR   r&  r   Fr$   ro   )rY   rZ   r	  r   r   rV   r   r   r   rQ  mutabler[  r!   )rY   rZ   r	  rV   r   r   r   rQ  )r8   r   r   r   r-   
zeros_liker4   rJ  rI  rw   r   rp   r&  r   rA  r(  r   r   )rT   r/   r[   rY   rZ   r	  r   r   r&  r   rZ  r   r   r   r[  rQ  inputsr]  r   s                      r3   r_   z#FlaxRobertaPreTrainedModel.__call__1  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N=iIaIabL! ]]95N$++"?"?A`A`!abI ")DOF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r5   r   )NNNNNNNNFNNNN) ra   rb   rc   rd   r"   config_classbase_model_prefixr8  rA   Modulere   r-   rf   r   intr9   rg   r?  rE  rE   rK  rX  r   rU  r   r   ROBERTA_INPUTS_DOCSTRINGformatdictr   r_   __classcell__)rB  s   @r3   r6  r6    so   
 !L!"L"))"
 $;;',mm m 	m
 yym m !%m
(!

 2 2 (! (!PZ (!fp (!V1& ++C+J+JKh+ij "#*.,0/3&* $^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ ^ k^r5   r6  c                   8   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   dZ
e	ed<   d Z	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deej                     deej                     de	de	de	de	de	fdZy)FlaxRobertaModuler8   r9   Tadd_pooling_layerFr   c                     t        | j                  | j                        | _        t	        | j                  | j                  | j
                        | _        t        | j                  | j                        | _        y )Nro   r  )	r7   r8   r9   
embeddingsr  r   encoderr  poolerrS   s    r3   rU   zFlaxRobertaModule.setup  sS    /4::N)KK**#'#>#>

 (4::Fr5   NrY   rZ   r	  r   r   r   rV   r   r   r   c                    |t        j                  |      }|St        j                  t        j                  t        j                  |      j
                  d         |j
                        }| j                  |||||	      }| j                  ||||	||||
||
      }|d   }| j                  r| j                  |      nd }|s|	|f|dd  z   S ||f|dd  z   S t        |||j                  |j                  |j                        S )Nr&   rX   )r	  rV   r   r   r   r   r   r   r   r!   )r  pooler_outputr^   r  r  )r-   r^  r   r   rW  r,   rl  rm  rj  rn  r   r^   r  r  )rT   r/   r[   rY   rZ   r	  r   r   r   rV   r   r   r   r^   r   pooleds                   r3   r_   zFlaxRobertaModule.__call__  s,     ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL~|^S` ( 
 ,,'"7#9!/!5#  
  
/3/E/E]+4~%''!"+55!6*WQR[88?+ !//))$55
 	
r5   )
NNNNNFTFFT)ra   rb   rc   r"   re   r-   rf   r9   rj  rg   r   rU   r   r   r_   rh   r5   r3   ri  ri    s    {{E399""t"#(D(G 15.2+/7;8< ""'%* 5
 !-	5

 s{{+5
 CKK(5
  (45
 !) 55
 5
 5
  5
 #5
 5
r5   ri  zaThe bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxRobertaModelN)ra   rb   rc   ri  r8  rh   r5   r3   rs  rs    s	    
 %Lr5   rs  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)FlaxRobertaForMaskedLMModuler8   r9   Fr   c                     t        | j                  d| j                  | j                        | _        t        | j                  | j                        | _        y NF)r8   rj  r9   r   r8   r9   ri  r8   r9   r   r7  r  lm_headrS   s    r3   rU   z"FlaxRobertaForMaskedLMModule.setup  @    (;;#**#'#>#>	
 )4::Nr5   rV   r   r   r   c
                 6   | j                  |||||||||		      }
|
d   }| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  ||      }|	s	|f|
dd  z   S t        ||
j                  |
j                  	      S )
NrV   r   r   r   r   r&  rl  rI   	embeddingr+  r!   logitsr^   r  )r7  r8   tie_word_embeddingsr   rz  r   r^   r  )rT   r/   r[   rY   rZ   r	  rV   r   r   r   r   r^   r+  r  s                 r3   r_   z%FlaxRobertaForMaskedLMModule.__call__  s     ,,'/!5#  

  
;;**#||55h?MN_`alm# m>NO9wqr{**!!//))
 	
r5   NTFFTra   rb   rc   r"   re   r-   rf   r9   r   rg   rU   r_   rh   r5   r3   ru  ru    sf    {{E399"#(D(O  #"'%* )
 )
  )
 #)
 )
r5   ru  z5RoBERTa Model with a `language modeling` head on top.c                       e Zd ZeZy)FlaxRobertaForMaskedLMN)ra   rb   rc   ru  r8  rh   r5   r3   r  r     s    /Lr5   r  z<mask>)r1   c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)*FlaxRobertaForSequenceClassificationModuler8   r9   Fr   c                     t        | j                  | j                  d| j                        | _        t        | j                  | j                        | _        y )NFr8   r9   rj  r   rx  )ri  r8   r9   r   r7  r.  
classifierrS   s    r3   rU   z0FlaxRobertaForSequenceClassificationModule.setup3  sC    (;;**##'#>#>	
 8t{{RVR\R\]r5   rV   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }|	s	|f|
dd  z   S t        ||
j                  |
j                        S Nr}  r   rX   r!   r  )r7  r  r   r^   r  )rT   r/   r[   rY   rZ   r	  rV   r   r   r   r   sequence_outputr  s                r3   r_   z3FlaxRobertaForSequenceClassificationModule.__call__<  s     ,,'/!5#  

 "!*N9wqr{**+!//))
 	
r5   Nr  r  rh   r5   r3   r  r  .  sf    {{E399"#(D(^  #"'%* #
 #
  #
 ##
 #
r5   r  z
    Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd ZeZy)$FlaxRobertaForSequenceClassificationN)ra   rb   rc   r  r8  rh   r5   r3   r  r  b  s	     >Lr5   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)"FlaxRobertaForMultipleChoiceModuler8   r9   Fr   c                    t        | j                  | j                  | j                        | _        t        j                  | j                  j                        | _        t        j                  d| j                        | _
        y )Nr=  r?   r!   ro   )ri  r8   r9   r   r7  rA   rP   rQ   rR   rs   r  rS   s    r3   rU   z(FlaxRobertaForMultipleChoiceModule.setup{  sW    (;;**#'#>#>

 zzt{{'F'FG((1DJJ7r5   rV   r   r   r   c
                 <   |j                   d   }
||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }| j                  |||||||||		      }|d   }| j                  ||      }| j	                  |      }|j                  d|
      }|	s	|f|dd  z   S t        ||j                  |j                        S )Nr!   r&   r}  rX   r%   r  )r,   r+   r7  rR   r  r   r^   r  )rT   r/   r[   rY   rZ   r	  rV   r   r   r   num_choicesr   pooled_outputr  reshaped_logitss                  r3   r_   z+FlaxRobertaForMultipleChoiceModule.__call__  sH     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ,,'/!5#  

  
]-P/ ..[9#%33,"!//))
 	
r5   Nr  r  rh   r5   r3   r  r  v  se    {{E399"#(D(8  #"'%* ,
 ,
  ,
 #,
 ,
r5   r  z
    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZy)FlaxRobertaForMultipleChoiceN)ra   rb   rc   r  r8  rh   r5   r3   r  r    s	     6Lr5   r  z(batch_size, num_choices, sequence_lengthc            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)'FlaxRobertaForTokenClassificationModuler8   r9   Fr   c                    t        | j                  | j                  d| j                        | _        | j                  j
                  | j                  j
                  n| j                  j                  }t        j                  |      | _	        t        j                  | j                  j                  | j                        | _        y )NFr  r?   ro   )ri  r8   r9   r   r7  r0  rQ   rA   rP   rR   rs   r1  r  r3  s     r3   rU   z-FlaxRobertaForTokenClassificationModule.setup  s    (;;**##'#>#>	
 {{--9 KK**00 	
 zz'9:((4;;#9#9Lr5   rV   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j
                        S r  )r7  rR   r  r   r^   r  )rT   r/   r[   rY   rZ   r	  rV   r   r   r   r   r^   r  s                r3   r_   z0FlaxRobertaForTokenClassificationModule.__call__  s     ,,'/!5#  

  
]-P/9wqr{**(!//))
 	
r5   Nr  r  rh   r5   r3   r  r    sf    {{E399"#(D(M, #"'%* $
 $
  $
 #$
 $
r5   r  z
    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZy)!FlaxRobertaForTokenClassificationN)ra   rb   rc   r  r8  rh   r5   r3   r  r    s	     ;Lr5   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)%FlaxRobertaForQuestionAnsweringModuler8   r9   Fr   c                     t        | j                  | j                  d| j                        | _        t        j                  | j                  j                  | j                        | _        y )NFr  ro   )	ri  r8   r9   r   r7  rA   rs   r1  
qa_outputsrS   s    r3   rU   z+FlaxRobertaForQuestionAnsweringModule.setup  sJ    (;;**##'#>#>	
 ((4;;#9#9Lr5   rV   r   r   r   c
                 b   | j                  |||||||||		      }
|
d   }| j                  |      }t        j                  || j                  j
                  d      \  }}|j                  d      }|j                  d      }|	s
||f|
dd  z   S t        |||
j                  |
j                        S )Nr}  r   r&   r'   r!   )start_logits
end_logitsr^   r  )
r7  r  r-   rL  r8   r1  squeezer   r^   r  )rT   r/   r[   rY   rZ   r	  rV   r   r   r   r   r^   r  r  r  s                  r3   r_   z.FlaxRobertaForQuestionAnsweringModule.__call__'  s     ,,'/!5#  

  
/#&99VT[[5K5KRT#U j#++B/''+
 *-;;/%!!//))	
 	
r5   Nr  r  rh   r5   r3   r  r    sf    {{E399"#(D(M  #"'%* (
 (
  (
 #(
 (
r5   r  z
    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZy)FlaxRobertaForQuestionAnsweringN)ra   rb   rc   r  r8  rh   r5   r3   r  r  R  s	     9Lr5   r  c                   
   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     de	de	de	de	de	fdZy)FlaxRobertaForCausalLMModuler8   r9   Fr   c                     t        | j                  d| j                  | j                        | _        t        | j                  | j                        | _        y rw  ry  rS   s    r3   rU   z"FlaxRobertaForCausalLMModule.setupj  r{  r5   NrY   r	  r   r   r   rV   r   r   r   c                 R   | j                  |||||||||	|
||      }|d   }| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  ||      }|s	|f|dd  z   S t        ||j                  |j                  |j                  	      S )
N)r   r   r   rV   r   r   r   r   r&  rl  rI   r~  r  r!   )r  r^   r  r  )	r7  r8   r  r   rz  r   r^   r  r  )rT   r/   r[   rZ   rY   r	  r   r   r   rV   r   r   r   r   r^   r+  r  s                    r3   r_   z%FlaxRobertaForCausalLMModule.__call__s  s      ,,"7#9!'/!5#  
  
;;**#||55h?MN_`alm# m>NO9wqr{**4!//))$55	
 	
r5   )	NNNNFTFFTr  rh   r5   r3   r  r  e  s    {{E399"#(D(O 15+/7;8< ""'%* 0

 !-0
 CKK(0
  (40
 !) 50
 0
 0
  0
 #0
 0
r5   r  z
    Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   >    e Zd ZeZddeej                     fdZd Z	y)FlaxRobertaForCausalLMNr[   c                 H   |j                   \  }}| j                  ||      }t        j                  ||fd      }|-|j	                  d      dz
  }t        j                  ||d      }n4t        j                  t        j                  |d      d d d f   ||f      }|||dS )Nr$   ro   r&   r'   r!   )r   r   )r[  r[   rZ   )	r,   r   r-   rw   r.   r   r   r   r   )	rT   r/   r   r[   r   
seq_lengthr[  extended_attention_maskrZ   s	            r3   prepare_inputs_for_generationz4FlaxRobertaForCausalLM.prepare_inputs_for_generation  s    !*
J//*jA #&((J
+C4"P%)00b09A=L&)&>&>?VXfhn&o#++CJJz,NtUVw,WZdfpYqrL  /5(
 	
r5   c                 L    |j                   |d<   |d   d d dd f   dz   |d<   |S )Nr[  rZ   r&   r!   )r[  )rT   model_outputsmodel_kwargss      r3   update_inputs_for_generationz3FlaxRobertaForCausalLM.update_inputs_for_generation  s8    *7*G*G&''3N'CArsF'Ka'O^$r5   r   )
ra   rb   rc   r  r8  r   rE   Arrayr  r  rh   r5   r3   r  r    s'     0L
S[\_\e\eSf 
*r5   r  )Utypingr   r   r   
flax.linenlinenrA   rE   	jax.numpynumpyr-   r,  flax.core.frozen_dictr   r   r   r	   r
   r   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r    configuration_robertar"   
get_loggerra   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   r4   ROBERTA_START_DOCSTRINGrd  rb  r7   rj   r   r   r   r   r   r   r  r  r  r.  r6  ri  rs  ru  r  r  r  r  r  re  r  r  r  r  r  r  rh   r5   r3   <module>r     s   - ,  
   > > 6 6 > ; 
 
 
 w v Y Y 0 
		H	%/ !:0 .# N(BII (Xhryy hXBII *'299 'Vbii &		 *6ryy 6tO
 O
f$
 $
P)		 )" 		  FBII @}!4 }BD
		 D
N g%1 %	% -/BDbds t7
299 7
t QSjk07 0 l0 "	1
 1
h  >+E >> ( 	:
 :
z  6#= 66  ":"A"ABl"m  !	8
bii 8
v  ;(B ;; %	6
BII 6
r  9&@ 99 #$	>
299 >
B  7 < )	r5   