
    sg<                        d dl mZmZmZ d dlZd dlmZ d dlZd dl	m
Z d dl
Zd dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZm Z m!Z!m"Z"m#Z# d	dl$m%Z%m&Z&m'Z'm(Z(m)Z) d	dl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0  e.jb                  e2      Z3dZ4dZ5ejl                  Z6ejn                  jp                   G d de+             Z9dZ:dZ; G d dejx                        Z= G d dejx                        Z> G d dejx                        Z? G d dejx                        Z@ G d dejx                        ZA G d d ejx                        ZB G d! d"ejx                        ZC G d# d$ejx                        ZD G d% d&ejx                        ZE G d' d(ejx                        ZF G d) d*ejx                        ZG G d+ d,e&      ZH G d- d.ejx                        ZI e,d/e:       G d0 d1eH             ZJ e'eJe4ee5        G d2 d3ejx                        ZK G d4 d5ejx                        ZL e,d6e:       G d7 d8eH             ZM e'eMe4ee5        G d9 d:ejx                        ZN e,d;e:       G d< d=eH             ZOd>ZP e)eOe;j                  d?      ePz           e(eOe9e5@        G dA dBejx                        ZR e,dCe:       G dD dEeH             ZS e'eSe4e#e5       dF ZT G dG dHejx                        ZU G dI dJejx                        ZV e,dKe:       G dL dMeH             ZW e)eWe;j                  dN              e'eWe4e e5        G dO dPejx                        ZX e,dQe:       G dR dSeH             ZY e'eYe4e!e5        G dT dUejx                        ZZ G dV dWejx                        Z[ e,dXe:       G dY dZeH             Z\ e'e\e4e"e5        G d[ d\ejx                        Z] e,d]e:       G d^ d_eH             Z^ e'e^e4ee5       y)`    )CallableOptionalTupleN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutput-FlaxBaseModelOutputWithPastAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )ElectraConfigz"google/electra-small-discriminatorr#   c                       e Zd ZU dZdZej                  ed<   dZe	e
ej                        ed<   dZe	e
ej                        ed<   y)FlaxElectraForPreTrainingOutputaa  
    Output type of [`ElectraForPreTraining`].

    Args:
        logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlogitshidden_states
attentions)__name__
__module____qualname____doc__r&   jnpndarray__annotations__r'   r   r   r(        d/var/www/html/venv/lib/python3.12/site-packages/transformers/models/electra/modeling_flax_electra.pyr%   r%   ;   sG    & FCKK26M8E#++./6/3Js{{+,3r1   r%   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`ElectraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxElectraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _	        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _        t        j                  | j                  j                  | j                         | _        t        j"                  | j                  j$                        | _        y )N)stddev)embedding_initepsilonr6   rate)nnEmbedr5   
vocab_sizeembedding_sizejaxinitializersnormalinitializer_rangeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr6   Dropouthidden_dropout_probdropoutselfs    r2   setupzFlaxElectraEmbeddings.setup   s5   !xxKK""KK&&66..55T[[=Z=Z5[ 

 $&88KK//KK&&66..55T[[=Z=Z5[$
 
 &(XXKK''KK&&66..55T[[=Z=Z5[&
"
 dkk.H.HPTPZPZ[zzt{{'F'FGr1   deterministicc                    | j                  |j                  d            }| j                  |j                  d            }| j                  |j                  d            }||z   |z   }	| j	                  |	      }	| j                  |	|      }	|	S )Ni4rS   )rF   astyperH   rJ   rK   rO   )
rQ   	input_idstoken_type_idsposition_idsattention_maskrS   inputs_embedsposition_embedsrJ   r'   s
             r2   __call__zFlaxElectraEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &(==O }5]-Pr1   NTr)   r*   r+   r,   r#   r/   r-   float32r6   rR   boolr^   r0   r1   r2   r4   r4      s0    Q{{E399"H(_c r1   r4   c                       e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
d Zd Zej                  d        Z	 	 	 	 dd
eej"                     dedefdZy	)FlaxElectraSelfAttentionr5   Fcausalr6   c                 6   | j                   j                  | j                   j                  z  | _        | j                   j                  | j                   j                  z  dk7  rt	        d      t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        | j                  r>t!        t#        j$                  d| j                   j&                  fd      d      | _        y y )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r6   kernel_initr"   rb   r6   )r5   hidden_sizenum_attention_headshead_dim
ValueErrorr>   Denser6   rB   rC   rD   rE   querykeyvaluere   r
   r-   onesrG   causal_maskrP   s    r2   rR   zFlaxElectraSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r1   c                     |j                  |j                  d d | j                  j                  | j                  fz         S N   )reshapeshaper5   rj   rk   rQ   r'   s     r2   _split_headsz%FlaxElectraSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr1   c                 n    |j                  |j                  d d | j                  j                  fz         S rt   )rv   rw   r5   ri   rx   s     r2   _merge_headsz%FlaxElectraSelfAttention._merge_heads   s2    $$]%8%8!%<@W@W?Y%YZZr1   c                 (   | j                  dd      }| j                  ddt        j                  |j                  |j
                        }| j                  ddt        j                  |j                  |j
                        }| j                  ddd       }|r|j                  j                  ^ }	}
}}|j                  }dt        |	      z  |ddfz   }t        j                  |j                  ||      }t        j                  |j                  ||      }||_        ||_        |j                  d   }|j                  |z   |_        t        j                  t        j                  |
      ||z   k  t        |	      d||
fz         }t        ||      }|||fS )	a[  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slighly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                  L    t        j                  dt         j                        S )Nr   rh   )r-   arrayint32r0   r1   r2   <lambda>z@FlaxElectraSelfAttention._concatenate_to_cache.<locals>.<lambda>   s    CIIaWZW`W`Da r1   r   r   r"   )has_variablevariabler-   zerosrw   r6   rp   lenr   dynamic_update_slicebroadcast_toarangetupler	   )rQ   ro   rp   rn   r[   is_initializedr~   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r2   _concatenate_to_cachez.FlaxElectraSelfAttention._concatenate_to_cache   sr    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;S'JC,,\-?-?PE"J!&L(-A% + 1 14M MK''

:&5N)NNj!Q(A:$NNH +8^DNE>))r1   Nkey_value_states
init_cacheoutput_attentionsc                 4   |d u}|j                   d   }	| j                  |      }
|r#| j                  |      }| j                  |      }n"| j                  |      }| j                  |      }| j	                  |
      }
| j	                  |      }| j	                  |      }| j
                  r|
j                   d   |j                   d   }}| j                  dd      r[| j                  d   d   }| j                  d   d   j                   d   }t        j                  | j                  dd|dfdd||f      }n| j                  d d d d d |d |f   }t        j                  ||	f|j                   dd  z         }|N| j
                  rBt        j                  t        j                  |d      j                         }t        ||      }n(| j
                  r}n|t        j                  |d      }| j
                  r,| j                  dd      s|r| j                  |||
|      \  }}}|t        j                   |dkD  t        j"                  |j                   d      j%                  | j&                        t        j"                  |j                   t        j(                  | j&                        j*                        j%                  | j&                              }nd }d }|s*| j,                  j.                  dkD  r| j1                  d	      }t3        |
|||| j,                  j.                  d
|| j&                  d 	      }|t        j4                  d||      }t        j4                  d||      }|j7                  |j                   d d dz         }|r||f}|S |f}|S )Nr   r"   r}   r~   r   )axisg        rO   T)biasdropout_rngdropout_ratebroadcast_dropoutrS   r6   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdru   ))rw   rn   ro   rp   ry   re   r   	variablesr   dynamic_slicerr   r-   r   expand_dimsr	   r   selectfullrW   r6   finfominr5   attention_probs_dropout_probmake_rngr   einsumrv   )rQ   r'   r[   layer_head_maskr   r   rS   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrr   attention_biasr   attn_weightsattn_outputoutputss                          r2   r^   z!FlaxElectraSelfAttention.__call__  sk    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*L  ,7!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|D
7;7Q7QL,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr1   NFTF)r)   r*   r+   r#   r/   re   rb   r-   ra   r6   rR   ry   r{   r>   compactr   r   r.   r^   r0   r1   r2   rd   rd      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _r1   rd   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxElectraSelfOutputr5   r6   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                  | j                        | _
        t        j                  | j                  j                        | _        y )Nrg   r6   r:   r<   )r>   rm   r5   ri   rB   rC   rD   rE   r6   denserK   rL   rM   rN   rO   rP   s    r2   rR   zFlaxElectraSelfOutput.setupl  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr1   rS   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S NrV   r   rO   rK   )rQ   r'   input_tensorrS   s       r2   r^   zFlaxElectraSelfOutput.__call__u  s;    

=1]-P}|'CDr1   Nr_   r)   r*   r+   r#   r/   r-   ra   r6   rR   rb   r^   r0   r1   r2   r   r   h  s,    {{E399"H4 r1   r   c                   x    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 d	defdZy)
FlaxElectraAttentionr5   Fre   r6   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y )Nre   r6   rh   )rd   r5   re   r6   rQ   r   outputrP   s    r2   rR   zFlaxElectraAttention.setup  s7    ,T[[TXT^T^_	+DKKtzzJr1   Nr   c           	          | j                  |||||||      }|d   }	| j                  |	||      }|f}
|r	|
|d   fz  }
|
S )N)r   r   r   rS   r   r   rV   r"   )rQ   r   )rQ   r'   r[   r   r   r   rS   r   attn_outputsr   r   s              r2   r^   zFlaxElectraAttention.__call__  sl     yy+-!'/ ! 
 #1oKm\ "Q))Gr1   r   )r)   r*   r+   r#   r/   re   rb   r-   ra   r6   rR   r^   r0   r1   r2   r   r   }  sG    FD{{E399"K "'  r1   r   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxElectraIntermediater5   r6   c                 4   t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        | j                  j                     | _        y )Nr   )r>   rm   r5   intermediate_sizerB   rC   rD   rE   r6   r   r   
hidden_act
activationrP   s    r2   rR   zFlaxElectraIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r1   c                 J    | j                  |      }| j                  |      }|S N)r   r   rx   s     r2   r^   z FlaxElectraIntermediate.__call__  s$    

=16r1   N
r)   r*   r+   r#   r/   r-   ra   r6   rR   r^   r0   r1   r2   r   r     s$    {{E399"9r1   r   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxElectraOutputr5   r6   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                        | _        t        j                  | j                  j                  | j                        | _        y )Nr   r<   r:   )r>   rm   r5   ri   rB   rC   rD   rE   r6   r   rM   rN   rO   rK   rL   rP   s    r2   rR   zFlaxElectraOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r1   rS   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S r   r   )rQ   r'   attention_outputrS   s       r2   r^   zFlaxElectraOutput.__call__  s<    

=1]-P}7G'GHr1   Nr_   r   r0   r1   r2   r   r     s,    {{E399"\t r1   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 dde	ej                     de	ej                     deded	ef
d
Zy)FlaxElectraLayerr5   r6   c                    t        | j                  | j                  j                  | j                        | _        t        | j                  | j                        | _        t        | j                  | j                        | _        | j                  j                  r(t        | j                  d| j                        | _
        y y )Nr   rh   F)r   r5   
is_decoderr6   	attentionr   intermediater   r   add_cross_attentioncrossattentionrP   s    r2   rR   zFlaxElectraLayer.setup  s    -dkk$++BXBX`d`j`jk3DKKtzzR'4::F;;**"6t{{5X\XbXb"cD +r1   Nencoder_hidden_statesencoder_attention_maskr   rS   r   c	                     | j                  ||||||      }	|	d   }
|| j                  |
|||||      }|d   }
| j                  |
      }| j                  ||
|      }|f}|r||	d   fz  }|	|d   fz  }|S )N)r   r   rS   r   r   )r[   r   r   rS   r   rV   r"   )r   r   r   r   )rQ   r'   r[   r   r   r   r   rS   r   attention_outputsr   cross_attention_outputsr   s                r2   r^   zFlaxElectraLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;M3CS`a ")!,..G$03A688r1   )NNFTF)r)   r*   r+   r#   r/   r-   ra   r6   rR   r   r.   rb   r^   r0   r1   r2   r   r     sz    {{E399"d 8<8< ""'+
  (4+ !) 5+ + +  +r1   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxElectraLayerCollectionr5   r6   Fgradient_checkpointingc           	         | j                   rjt        t        d      }t        | j                  j
                        D cg c]*  } || j                  t        |      | j                        , c}| _        y t        | j                  j
                        D cg c]-  }t        | j                  t        |      | j                        / c}| _        y c c}w c c}w )N)         )static_argnums)namer6   )	r   rematr   ranger5   num_hidden_layersstrr6   layers)rQ   FlaxElectraCheckpointLayeris      r2   rR   z FlaxElectraLayerCollection.setup  s    &&)./?PY)Z& t{{<<= +4;;SV4::VDK t{{<<= !3q6LDK
s   /C2CNr   r   r   rS   r   output_hidden_statesreturn_dictc                    |rdnd }|	rdnd }|r|dnd }|W|j                   d   t        | j                        k7  r2t        dt        | j                         d|j                   d    d      t	        | j                        D ]@  \  }}|	r||fz  } ||||||   nd |||||      }|d   }|s,||d   fz  }|8||d   fz  }B |	r||fz  }||||f}|
st        d |D              S t        ||||	      S )
Nr0   r   z&The head_mask should be specified for z/ layers, but it is for                         .r"   ru   c              3   &   K   | ]	  }||  y wr   r0   ).0vs     r2   	<genexpr>z6FlaxElectraLayerCollection.__call__.<locals>.<genexpr>R  s     =qq}=s   )last_hidden_stater'   r(   cross_attentions)rw   r   r   rl   	enumerater   r   )rQ   r'   r[   	head_maskr   r   r   rS   r   r   r   all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   s                     r2   r^   z#FlaxElectraLayerCollection.__call__  sl     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++. 	@HAu#!m%55!! ) 5	!4%&!	M *!,M =#3"55(4(]1-=,??(+	@.  -!11 "3^EYZ=G===<++%1	
 	
r1   NNFTFFTr)   r*   r+   r#   r/   r-   ra   r6   r   rb   rR   r   r.   r^   r0   r1   r2   r   r   
  s    {{E399"#(D($ 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
r1   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxElectraEncoderr5   r6   Fr   c                 f    t        | j                  | j                  | j                        | _        y )Nr6   r   )r   r5   r6   r   r  rP   s    r2   rR   zFlaxElectraEncoder.setupb  s%    /KK**#'#>#>

r1   Nr   r   r   rS   r   r   r   c                 8    | j                  |||||||||	|

      S )N)r  r   r   r   rS   r   r   r   )r  )rQ   r'   r[   r  r   r   r   rS   r   r   r   s              r2   r^   zFlaxElectraEncoder.__call__i  s8     zz"7#9!'/!5#  
 	
r1   r  r  r0   r1   r2   r  r  ]  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
r1   r  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxElectraGeneratorPredictionsr5   r6   c                     t        j                  | j                  j                  | j                        | _        t        j
                  | j                  j                  | j                        | _        y )Nr:   rh   )r>   rK   r5   rL   r6   rm   rA   r   rP   s    r2   rR   z%FlaxElectraGeneratorPredictions.setup  sE    dkk.H.HPTPZPZ[XXdkk88

K
r1   c                     | j                  |      }t        | j                  j                     |      }| j	                  |      }|S r   )r   r   r5   r   rK   rx   s     r2   r^   z(FlaxElectraGeneratorPredictions.__call__  s=    

=1t{{556}E}5r1   Nr   r0   r1   r2   r  r    s%    {{E399"Lr1   r  c                   ^    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	d Z
y)#FlaxElectraDiscriminatorPredictionszEPrediction module for the discriminator, made up of two dense layers.r5   r6   c                     t        j                  | j                  j                  | j                        | _        t        j                  d| j                        | _        y )Nrh   r"   )r>   rm   r5   ri   r6   r   dense_predictionrP   s    r2   rR   z)FlaxElectraDiscriminatorPredictions.setup  s9    XXdkk55TZZH
 "$** =r1   c                     | j                  |      }t        | j                  j                     |      }| j	                  |      j                  d      }|S )Nr   )r   r   r5   r   r  squeezerx   s     r2   r^   z,FlaxElectraDiscriminatorPredictions.__call__  sJ    

=1t{{556}E--m<DDRHr1   N)r)   r*   r+   r,   r#   r/   r-   ra   r6   rR   r^   r0   r1   r2   r  r    s'    O{{E399">r1   r  c                       e Zd ZU dZeZdZdZej                  e
d<   ddej                  ddfd	ed
ededej                  dedef fdZd Zddej(                  j*                  d
ededefdZd Z eej7                  d            	 	 	 	 	 	 	 	 	 	 	 	 	 ddedej(                  j*                  dedee   dee   dee   defd       Z xZS ) FlaxElectraPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    electraNmodule_class)r"   r"   r   TFr5   input_shapeseedr6   _do_initr   c                 \     | j                   d|||d|}t        	| 	  ||||||       y )Nr5   r6   r   )r"  r#  r6   r$  r0   )r!  super__init__)
rQ   r5   r"  r#  r6   r$  r   kwargsmodule	__class__s
            r2   r(  z#FlaxElectraPreTrainedModel.__init__  sA     #""w&Vlwpvw[tSXcklr1   c                 ^    | j                  | j                  | j                  d      | _        y )NTr&  )r!  r5   r6   _modulerP   s    r2   enable_gradient_checkpointingz8FlaxElectraPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r1   rngparamsreturnc                    t        j                  |d      }t        j                  |      }t        j                  t        j                  t        j
                  |      j                  d         |      }t        j                  |      }t        j                  | j                  j                  | j                  j                  f      }t        j                  j                  |      \  }	}
|	|
d}| j                  j                  rTt        j                  || j                  j                   fz         }|}| j"                  j%                  ||||||||d	      }n"| j"                  j%                  ||||||d      }|d   }|dt'        t)        |            }t'        t)        |            }| j*                  D ]
  }||   ||<    t-               | _        t/        t1        |            S |S )NrU   rh   r   )r0  rO   F)r   r0  )r-   r   
zeros_liker   r   
atleast_2drw   	ones_likerq   r5   r   rj   rB   randomsplitr   ri   r*  initr   r   _missing_keyssetr   r   )rQ   r/  r"  r0  rX   rY   rZ   r[   r  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r2   init_weightsz'FlaxElectraPreTrainedModel.init_weights  s   IIk6		2''

3>>)3L3R3RSU3V(WYdey1HHdkk;;T[[=\=\]^	"%**"2"23"7
K$=;;**$'IIkT[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2iyfk #3 # ,H5(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r1   c                    t        j                  ||fd      }t        j                  |d      }t        j                  t        j                  t        j
                  |      j                  d         |j                        }| j                  j                  t        j                  j                  d      |||dd      }t        |d         S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        rU   rh   r   r   FT)r   r   r}   )r-   rq   r5  r   r   r4  rw   r*  r8  rB   r6  PRNGKeyr   )rQ   r   r   rX   r[   rZ   init_variabless          r2   r   z%FlaxElectraPreTrainedModel.init_cache  s     HHj*5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9nlX]jn * 
 w/00r1   batch_size, sequence_lengthr   trainr   r   r   past_key_valuesc                 p   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        j
                  |      }|St	        j                  t	        j                  t	        j                  |      j                  d         |j                        }|t	        j
                  |      }|?t	        j                  | j                   j                  | j                   j                  f      }i }|	|	|d<   d|xs | j                  i}| j                   j                  r|r	||d<   dg}nd}| j                  j!                  |t	        j"                  |d      t	        j"                  |d      t	        j"                  |d      t	        j"                  |d      t	        j"                  |d      |||
 |||||      }||r|\  }}t%        |d         |d	<   |S |"|s |\  }}|d d
 t%        |d         fz   |d
d  z   }|S | j                  j!                  |t	        j"                  |d      t	        j"                  |d      t	        j"                  |d      t	        j"                  |d      t	        j"                  |d      |
 ||||      }|S )Nr   rO   r0  r}   FrU   rh   )rY   rZ   r  r   r   rS   r   r   r   r<  mutablerF  r"   )rY   rZ   r  rS   r   r   r   r<  )r5   r   r   r   r-   r5  r   r   r4  rw   rq   r   rj   r0  r   r*  applyr   r   )rQ   rX   r[   rY   rZ   r  r   r   r0  r   rE  r   r   r   rF  r<  inputsrH  r   s                      r2   r^   z#FlaxElectraPreTrainedModel.__call__  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ]]95N++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N$++"?"?A`A`!abI ")DOF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r1   r   )NNNNNNNNFNNNN) r)   r*   r+   r,   r#   config_classbase_model_prefixr!  r>   Moduler/   r-   ra   r   intr6   rb   r(  r.  rB   r6  rB  r   r@  r   r    ELECTRA_INPUTS_DOCSTRINGformatdictr   r^   __classcell__)r+  s   @r2   r  r    so   
 !L!"L"))"
 $;;',mm m 	m
 yym m !%m
(!

 2 2 (! (!PZ (!fp (!V1& ++C+J+JKh+ij "#*.,0/3&* $^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ ^ k^r1   r  c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
e	de	de	de	de	fdZy)FlaxElectraModuler5   r6   Fr   c                    t        | j                  | j                        | _        | j                  j                  | j                  j
                  k7  r:t        j                  | j                  j
                  | j                        | _        t        | j                  | j                  | j                        | _        y )Nrh   r  )r4   r5   r6   
embeddingsrA   ri   r>   rm   embeddings_projectr  r   encoderrP   s    r2   rR   zFlaxElectraModule.setupi  sv    /4::N;;%%)@)@@&(hht{{/F/Fdjj&YD#)KKtzz$B]B]
r1   Nr  r   r   r   rS   r   r   r   c                     | j                  |||||	      }t        | d      r| j                  |      }| j                  ||||	||||
||
      S )NrV   rW  )r  rS   r   r   r   r   r   r   )rV  hasattrrW  rX  )rQ   rX   r[   rY   rZ   r  r   r   r   rS   r   r   r   rV  s                 r2   r^   zFlaxElectraModule.__call__q  st     __~|^S` % 

 4-.00<J||'"7#9!/!5#  
 	
r1   )NNNFTFFT)r)   r*   r+   r#   r/   r-   ra   r6   r   rb   rR   r   npr.   r^   r0   r1   r2   rT  rT  d  s    {{E399"#(D(
 +/7;8< ""'%*  
 BJJ' 
  (4 
 !) 5 
  
  
   
 # 
  
r1   rT  zaThe bare Electra Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxElectraModelN)r)   r*   r+   rT  r!  r0   r1   r2   r]  r]    s	    
 %Lr1   r]  c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	j                  j                  j                  Zedej                   f   ed<   d Zd Zy)FlaxElectraTiedDenserA   r6   N.	bias_initc                 ^    | j                  d| j                  | j                  f      | _        y )Nr   )paramr`  rA   r   rP   s    r2   rR   zFlaxElectraTiedDense.setup  s#    JJvt~~8K8K7MN	r1   c                 J   t        j                  || j                        }t        j                  || j                        }t        j                  |||j
                  dz
  fdfdf| j                        }t        j                  | j                  | j                        }||z   S )Nr"   r   )r0   r0   )r   )r-   asarrayr6   r   dot_generalndimr   r   )rQ   xkernelyr   s        r2   r^   zFlaxElectraTiedDense.__call__  s    KK4::&VTZZ0OOvvzmT"H-nn	
 {{499djj14xr1   )r)   r*   r+   rN  r/   r-   ra   r6   r   rB   r>   rC   r   r`  r   r[  r.   rR   r^   r0   r1   r2   r_  r_    sQ    {{E399"I+.66+>+>+D+DIxRZZ(DO
r1   r_  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
e	fdZy)FlaxElectraForMaskedLMModuler5   r6   Fr   c                    t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        | j                  j                  r1t        | j                  j                  | j                        | _
        y t        j                  | j                  j                  | j                        | _
        y Nr&  r5   r6   rh   rT  r5   r6   r   r   r  generator_predictionstie_word_embeddingsr_  r@   generator_lm_headr>   rm   rP   s    r2   rR   z"FlaxElectraForMaskedLMModule.setup      (;;djjIdId
 &EDKK_c_i_i%j";;**%9$++:P:PX\XbXb%cD"%'XXdkk.D.DDJJ%WD"r1   NrS   r   r   r   c
                    | j                  |||||||||		      }
|
d   }| j                  |      }| j                  j                  r?| j                   j                  d   d   d   d   }| j                  ||j                        }n| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j                        S )	NrS   r   r   r   r   r0  rV  rF   	embeddingr"   r&   r'   r(   )
r   rp  r5   rq  r   rr  Tr   r'   r(   )rQ   rX   r[   rY   rZ   r  rS   r   r   r   r   r'   prediction_scoresshared_embeddings                 r2   r^   z%FlaxElectraForMaskedLMModule.__call__  s     ,,'/!5#  

  
 66}E;;**#||55h?MN_`alm $ 6 67HJZJ\J\ ] $ 6 67H I%''!"+55!$!//))
 	
r1   NNNNTFFTr)   r*   r+   r#   r/   r-   ra   r6   r   rb   rR   r^   r0   r1   r2   rk  rk    sr    {{E399"#(D(X ""'%* '
 '
  '
 #'
 '
r1   rk  z5Electra Model with a `language modeling` head on top.c                       e Zd ZeZy)FlaxElectraForMaskedLMN)r)   r*   r+   rk  r!  r0   r1   r2   r~  r~    s    /Lr1   r~  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
e	fdZy)FlaxElectraForPreTrainingModuler5   r6   Fr   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y Nr&  rn  )rT  r5   r6   r   r   r  discriminator_predictionsrP   s    r2   rR   z%FlaxElectraForPreTrainingModule.setup  sC    (;;djjIdId
 *MTXT_T_gkgqgq)r&r1   NrS   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j                        S )Nru  r   r"   rw  )r   r  r%   r'   r(   rQ   rX   r[   rY   rZ   r  rS   r   r   r   r   r'   r&   s                r2   r^   z(FlaxElectraForPreTrainingModule.__call__  s     ,,'/!5#  

  
//>9wqr{**.!//))
 	
r1   r{  r|  r0   r1   r2   r  r    sr    {{E399"#(D(s ""'%* #
 #
  #
 ##
 #
r1   r  z
    Electra model with a binary classification head on top as used during pretraining for identifying generated tokens.

    It is recommended to load the discriminator checkpoint into that model.
    c                       e Zd ZeZy)FlaxElectraForPreTrainingN)r)   r*   r+   r  r!  r0   r1   r2   r  r  '  s	     3Lr1   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxElectraForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
    >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.logits
    ```
rD  )output_typerK  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
e	fdZy)'FlaxElectraForTokenClassificationModuler5   r6   Fr   c                    t        | j                  | j                  | j                        | _        | j                  j
                  | j                  j
                  n| j                  j                  }t        j                  |      | _	        t        j                  | j                  j                  | j                        | _        y Nr&  rh   )rT  r5   r6   r   r   classifier_dropoutrN   r>   rM   rO   rm   
num_labels
classifierrQ   r  s     r2   rR   z-FlaxElectraForTokenClassificationModule.setupS  s    (;;djjIdId

 {{--9 KK**00 	
 zz"45((4;;#9#9Lr1   NrS   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j
                        S Nru  r   rV   r"   rw  )r   rO   r  r   r'   r(   r  s                r2   r^   z0FlaxElectraForTokenClassificationModule.__call___  s     ,,'/!5#  

  
]-P/9wqr{**(!//))
 	
r1   r{  r|  r0   r1   r2   r  r  N  sr    {{E399"#(D(
M ""'%* $
 $
  $
 #$
 $
r1   r  z
    Electra model with a token classification head on top.

    Both the discriminator and generator may be loaded into this model.
    c                       e Zd ZeZy)!FlaxElectraForTokenClassificationN)r)   r*   r+   r  r!  r0   r1   r2   r  r    s	     ;Lr1   r  c                     | S r   r0   )rg  r)  s     r2   identityr    s    Hr1   c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxElectraSequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    r5   r6   c                    t         | _        t        | j                  d      r| j                  j                  rt        | j                  d      rF| j                  j
                  r0| j                  j                  dkD  r| j                  j                  }n| j                  j                  }t        j                  || j                        | _        t        | j                  dd       }|r	t        |   nd | _        t         | _        t        | j                  d      rG| j                  j                  dkD  r.t        j                   | j                  j                        | _        t         | _        t        | j                  d      rI| j                  j$                  dkD  r/t        j                   | j                  j$                        | _        y y y )	Nsummary_use_projsummary_proj_to_labelsr   rh   summary_activationc                     | S r   r0   )rg  s    r2   r   z2FlaxElectraSequenceSummary.setup.<locals>.<lambda>  s    XY r1   summary_first_dropoutsummary_last_dropout)r  summaryrZ  r5   r  r  r  ri   r>   rm   r6   getattrr   r   first_dropoutr  rM   last_dropoutr  )rQ   num_classesactivation_strings      r2   rR   z FlaxElectraSequenceSummary.setup  s<   4;; 238T8T%=>KK66KK**Q."kk44"kk5588KtzzBDL#DKK1EtL7H&!23k%4;; 78T[[=^=^ab=b!#DKK,M,M!ND$4;; 67DKK<\<\_`<` "

4;;+K+K LD =a7r1   NrS   c                     |dddf   }| j                  ||      }| j                  |      }| j                  |      }| j                  ||      }|S )aZ  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`jnp.ndarray` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`jnp.ndarray` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `jnp.ndarray`: The summary of the sequence hidden states.
        Nr   rV   )r  r  r   r  )rQ   r'   	cls_indexrS   r   s        r2   r^   z#FlaxElectraSequenceSummary.__call__  s]     q!t$##F-#Hf%(""6"Gr1   )NTr`   r0   r1   r2   r  r    s3    " {{E399"M0T r1   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
e	fdZy)"FlaxElectraForMultipleChoiceModuler5   r6   Fr   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        t        j                  d| j                        | _	        y )Nr&  rn  r"   rh   )
rT  r5   r6   r   r   r  sequence_summaryr>   rm   r  rP   s    r2   rR   z(FlaxElectraForMultipleChoiceModule.setup  sU    (;;djjIdId
 !;$++UYU_U_ `((1DJJ7r1   NrS   r   r   r   c
                 <   |j                   d   }
||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }| j                  |||||||||		      }|d   }| j                  ||      }| j	                  |      }|j                  d|
      }|	s	|f|dd  z   S t        ||j                  |j                        S )Nr"   r   ru  r   rV   rw  )rw   rv   r   r  r  r   r'   r(   )rQ   rX   r[   rY   rZ   r  rS   r   r   r   num_choicesr   r'   pooled_outputr&   reshaped_logitss                   r2   r^   z+FlaxElectraForMultipleChoiceModule.__call__  sK     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ,,'/!5#  

  
--m=-Y/ ..[9#%33,"!//))
 	
r1   r{  r|  r0   r1   r2   r  r    sq    {{E399"#(D(8 ""'%* +
 +
  +
 #+
 +
r1   r  z
    ELECTRA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZy)FlaxElectraForMultipleChoiceN)r)   r*   r+   r  r!  r0   r1   r2   r  r    s	     6Lr1   r  z(batch_size, num_choices, sequence_lengthc            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
e	fdZy)%FlaxElectraForQuestionAnsweringModuler5   r6   Fr   c                     t        | j                  | j                  | j                        | _        t        j                  | j                  j                  | j                        | _        y r  )	rT  r5   r6   r   r   r>   rm   r  
qa_outputsrP   s    r2   rR   z+FlaxElectraForQuestionAnsweringModule.setup7  sE    (;;djjIdId
 ((4;;#9#9Lr1   NrS   r   r   r   c
                 X   | j                  |||||||||		      }
|
d   }| j                  |      }|j                  | j                  j                  d      \  }}|j                  d      }|j                  d      }|	s
||f|
dd  z   S t        |||
j                  |
j                        S )Nru  r   r   r   r"   )start_logits
end_logitsr'   r(   )	r   r  r7  r5   r  r  r   r'   r(   )rQ   rX   r[   rY   rZ   r  rS   r   r   r   r   r'   r&   r  r  s                  r2   r^   z.FlaxElectraForQuestionAnsweringModule.__call__=  s     ,,'/!5#  

  
/#)<<0F0FR<#P j#++B/''+
 *-;;/%!!//))	
 	
r1   r{  r|  r0   r1   r2   r  r  2  sr    {{E399"#(D(M ""'%* &
 &
  &
 #&
 &
r1   r  z
    ELECTRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZy)FlaxElectraForQuestionAnsweringN)r)   r*   r+   r  r!  r0   r1   r2   r  r  f  s	     9Lr1   r  c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxElectraClassificationHeadz-Head for sentence-level classification tasks.r5   r6   c                    t        j                  | j                  j                  | j                        | _        | j                  j                  | j                  j                  n| j                  j                  }t        j                  |      | _	        t        j                  | j                  j                  | j                        | _        y )Nrh   )r>   rm   r5   ri   r6   r   r  rN   rM   rO   r  out_projr  s     r2   rR   z#FlaxElectraClassificationHead.setup  s    XXdkk55TZZH
 {{--9 KK**00 	
 zz"45!7!7tzzJr1   rS   c                     |d d dd d f   }| j                  ||      }| j                  |      }t        d   |      }| j                  ||      }| j                  |      }|S )Nr   rV   gelu)rO   r   r   r  )rQ   r'   rS   rg  s       r2   r^   z&FlaxElectraClassificationHead.__call__  sd    !Q'"LL-L8JJqM6N1LL-L8MM!r1   Nr_   r`   r0   r1   r2   r  r  y  s/    7{{E399"KT r1   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 dde	de	d	e	d
e	fdZy)*FlaxElectraForSequenceClassificationModuler5   r6   Fr   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y r  )rT  r5   r6   r   r   r  r  rP   s    r2   rR   z0FlaxElectraForSequenceClassificationModule.setup  s>    (;;djjIdId
 8t{{RVR\R\]r1   NrS   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }|	s	|f|
dd  z   S t        ||
j                  |
j                        S r  )r   r  r   r'   r(   r  s                r2   r^   z3FlaxElectraForSequenceClassificationModule.__call__  s     ,,'/!5#  

  
mL9wqr{**+!//))
 	
r1   r{  r|  r0   r1   r2   r  r    sr    {{E399"#(D(^ ""'%* "
 "
  "
 #"
 "
r1   r  z
    Electra Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd ZeZy)$FlaxElectraForSequenceClassificationN)r)   r*   r+   r  r!  r0   r1   r2   r  r    s	     >Lr1   r  c                   J   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     deej                     de	de	de	de	de	fdZy)FlaxElectraForCausalLMModuler5   r6   Fr   c                    t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        | j                  j                  r1t        | j                  j                  | j                        | _
        y t        j                  | j                  j                  | j                        | _
        y rm  ro  rP   s    r2   rR   z"FlaxElectraForCausalLMModule.setup  rs  r1   Nr[   rY   rZ   r  r   r   r   rS   r   r   r   c                    | j                  |||||||||	|
||      }|d   }| j                  |      }| j                  j                  r?| j                   j                  d   d   d   d   }| j                  ||j                        }n| j                  |      }|s	|f|dd  z   S t        ||j                  |j                  |j                        S )	N)r   r   r   rS   r   r   r   r   r0  rV  rF   rv  r"   )r&   r'   r(   r  )r   rp  r5   rq  r   rr  rx  r   r'   r(   r  )rQ   rX   r[   rY   rZ   r  r   r   r   rS   r   r   r   r   r'   ry  rz  s                    r2   r^   z%FlaxElectraForCausalLMModule.__call__  s     ,,"7#9!'/!5#  
  
 66}E;;**#||55h?MN_`alm $ 6 67HJZJ\J\ ] $ 6 67H I%''!"+554$!//))$55	
 	
r1   )NNNNNNFTFFTr  r0   r1   r2   r  r    s    {{E399"#(D(X 1504.2+/7;8< ""'%* .
 !-.
 !-	.

 s{{+.
 CKK(.
  (4.
 !) 5.
 .
 .
  .
 #.
 .
r1   r  z
    Electra Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   >    e Zd ZeZddeej                     fdZd Z	y)FlaxElectraForCausalLMNr[   c                 H   |j                   \  }}| j                  ||      }t        j                  ||fd      }|-|j	                  d      dz
  }t        j                  ||d      }n4t        j                  t        j                  |d      d d d f   ||f      }|||dS )NrU   rh   r   r   r"   )r   r   )rF  r[   rZ   )	rw   r   r-   rq   cumsumr   r   r   r   )	rQ   rX   r   r[   r   
seq_lengthrF  extended_attention_maskrZ   s	            r2   prepare_inputs_for_generationz4FlaxElectraForCausalLM.prepare_inputs_for_generation!  s    !*
J//*jA #&((J
+C4"P%)00b09A=L&)&>&>?VXfhn&o#++CJJz,NtUVw,WZdfpYqrL  /5(
 	
r1   c                 L    |j                   |d<   |d   d d dd f   dz   |d<   |S )NrF  rZ   r   r"   )rF  )rQ   model_outputsmodel_kwargss      r2   update_inputs_for_generationz3FlaxElectraForCausalLM.update_inputs_for_generation6  s8    *7*G*G&''3N'CArsF'Ka'O^$r1   r   )
r)   r*   r+   r  r!  r   rB   Arrayr  r  r0   r1   r2   r  r    s'     0L
S[\_\e\eSf 
*r1   r  )_typingr   r   r   flax
flax.linenlinenr>   rB   	jax.numpynumpyr-   r[  flax.core.frozen_dictr   r   r   r	   r
   r   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r    r!   configuration_electrar#   
get_loggerr)   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   struct	dataclassr%   ELECTRA_START_DOCSTRINGrO  rM  r4   rd   r   r   r   r   r   r   r  r  r  r  rT  r]  r_  rk  r~  r  r  &FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRINGrP  r  r  r  r  r  r  r  r  r  r  r  r  r  r0   r1   r2   <module>r     s~    - ,   
   > > 6 6 > ; 	 	 	  g f 0 
		H	%: ! 4k 4 42 ,$ N&BII &Thryy hXBII *'299 'Vbii &		 *6ryy 6tO
 O
f$
 $
Nbii ")) "}!4 }@-
		 -
` g%1 %	% -/BDWYh i299 ,6
299 6
r QSjk07 0 l0 35HJ\^m n.
bii .
b 
 3 : 33* &$ ##$ABEkk !+JYh
5
bii 5
p 
 ;(B ;; %	@ @F7
 7
t  6#= 66
  ":"A"ABl"m  !	1
BII 1
h  9&@ 99 #$	BII 4-
 -
`  >+E >> ( 	=
299 =
@  7 < )	r1   