
    sg                        d dl mZmZmZ d dlZd dlmZ d dlZd dl	m
Z d dl
Zd dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% d	dl&m'Z'm(Z(m)Z)m*Z*m+Z+ d	dl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2  e0jf                  e4      Z5dZ6dZ7ejp                  Z8ejr                  jt                   G d de-             Z;dZ<dZ= G d dej|                        Z? G d dej|                        Z@ G d dej|                        ZA G d dej|                        ZB G d dej|                        ZC G d d ej|                        ZD G d! d"ej|                        ZE G d# d$ej|                        ZF G d% d&ej|                        ZG G d' d(ej|                        ZH G d) d*ej|                        ZI G d+ d,ej|                        ZJ G d- d.ej|                        ZK G d/ d0ej|                        ZL G d1 d2ej|                        ZM G d3 d4e(      ZN G d5 d6ej|                        ZO e.d7e<       G d8 d9eN             ZP e)ePe6ee7        G d: d;ej|                        ZQ e.d<e<       G d= d>eN             ZRd?ZS e+eRe=j                  d@      eSz           e*eRe;e7A        G dB dCej|                        ZU e.dDe<       G dE dFeN             ZV e)eVe6e e7        G dG dHej|                        ZW e.dIe<       G dJ dKeN             ZXdLZY e+eXe=j                  d@      eYz           e*eXe"e7A        G dM dNej|                        ZZ e.dOe<       G dP dQeN             Z[ e)e[e6e$e7        G dR dSej|                        Z\ e.dTe<       G dU dVeN             Z] e+e]e=j                  dW              e)e]e6e!e7        G dX dYej|                        Z^ e.dZe<       G d[ d\eN             Z_ e)e_e6e%e7        G d] d^ej|                        Z` e.d_e<       G d` daeN             Za e)eae6e#e7        G db dcej|                        Zb e.dde<       G de dfeN             Zc e)ece6ee7       y)g    )CallableOptionalTupleN)
FrozenDictfreezeunfreeze)combine_masksmake_causal_mask)partitioning)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )
-FlaxBaseModelOutputWithPastAndCrossAttentionsFlaxBaseModelOutputWithPooling0FlaxBaseModelOutputWithPoolingAndCrossAttentions%FlaxCausalLMOutputWithCrossAttentionsFlaxMaskedLMOutputFlaxMultipleChoiceModelOutputFlaxNextSentencePredictorOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )
BertConfigzgoogle-bert/bert-base-uncasedr%   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	e
eej                        ed<   dZe
eej                        ed<   y)FlaxBertForPreTrainingOutputaI  
    Output type of [`BertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nprediction_logitsseq_relationship_logitshidden_states
attentions)__name__
__module____qualname____doc__r(   jnpndarray__annotations__r)   r*   r   r   r+        ^/var/www/html/venv/lib/python3.12/site-packages/transformers/models/bert/modeling_flax_bert.pyr'   r'   =   sW    , &*s{{)+/S[[/26M8E#++./6/3Js{{+,3r4   r'   a
  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].

a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _
        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                        | j                        | _        t        j                  | j                  j                   | j                        | _        t        j"                  | j                  j$                        | _        y )N)stddev)embedding_initr9   epsilonr9   rate)nnEmbedr8   
vocab_sizehidden_sizejaxinitializersnormalinitializer_ranger9   word_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutselfs    r5   setupzFlaxBertEmbeddings.setup   sJ   !xxKK""KK##66..55T[[=Z=Z5[**	 
 $&88KK//KK##66..55T[[=Z=Z5[**	$
  &(XXKK''KK##66..55T[[=Z=Z5[**	&
" dkk.H.HPTPZPZ[zzt{{'F'FGr4   deterministicc                    | j                  |j                  d            }| j                  |j                  d            }| j                  |j                  d            }||z   |z   }	| j	                  |	      }	| j                  |	|      }	|	S )Ni4rV   )rI   astyperK   rM   rN   rR   )
rT   	input_idstoken_type_idsposition_idsattention_maskrV   inputs_embedsposition_embedsrM   r*   s
             r5   __call__zFlaxBertEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &(==O }5]-Pr4   NT)r,   r-   r.   r/   r%   r2   r0   float32r9   rU   boolra   r3   r4   r5   r7   r7      s0    Q{{E399"H,_c r4   r7   c                       e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
d Zd Zej                  d        Z	 	 	 	 dd
eej"                     dedefdZy	)FlaxBertSelfAttentionr8   Fcausalr9   c                 6   | j                   j                  | j                   j                  z  | _        | j                   j                  | j                   j                  z  dk7  rt	        d      t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        t        j                  | j                   j                  | j                  t        j
                  j                  j                  | j                   j                              | _        | j                  r>t!        t#        j$                  d| j                   j&                  fd      d      | _        y y )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r9   kernel_initr$   rd   r9   )r8   rD   num_attention_headshead_dim
ValueErrorrA   Denser9   rE   rF   rG   rH   querykeyvaluerg   r
   r0   onesrJ   causal_maskrS   s    r5   rU   zFlaxBertSelfAttention.setup   si   //4;;3R3RR;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ

 ;;/!T[[@@APX^ D r4   c                     |j                  |j                  d d | j                  j                  | j                  fz         S N   )reshapeshaper8   rk   rl   rT   r*   s     r5   _split_headsz"FlaxBertSelfAttention._split_heads   s;    $$]%8%8!%<@_@_aeanan?o%oppr4   c                 n    |j                  |j                  d d | j                  j                  fz         S ru   )rw   rx   r8   rD   ry   s     r5   _merge_headsz"FlaxBertSelfAttention._merge_heads  s2    $$]%8%8!%<@W@W?Y%YZZr4   c                 (   | j                  dd      }| j                  ddt        j                  |j                  |j
                        }| j                  ddt        j                  |j                  |j
                        }| j                  ddd       }|r|j                  j                  ^ }	}
}}|j                  }dt        |	      z  |ddfz   }t        j                  |j                  ||      }t        j                  |j                  ||      }||_        ||_        |j                  d   }|j                  |z   |_        t        j                  t        j                  |
      ||z   k  t        |	      d||
fz         }t        ||      }|||fS )	a[  
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slighly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        cache
cached_keycached_valuecache_indexc                  L    t        j                  dt         j                        S )Nr   rj   )r0   arrayint32r3   r4   r5   <lambda>z=FlaxBertSelfAttention._concatenate_to_cache.<locals>.<lambda>  s    CIIaWZW`W`Da r4   )r   r   r$   )has_variablevariabler0   zerosrx   r9   rq   lenr   dynamic_update_slicebroadcast_toarangetupler	   )rT   rp   rq   ro   r^   is_initializedr   r   r   
batch_dims
max_length	num_headsdepth_per_head	cur_indexindicesnum_updated_cache_vectorspad_masks                    r5   _concatenate_to_cachez+FlaxBertSelfAttention._concatenate_to_cache  sr    **7LA]]7L#))SYYPSPYPYZ
}}WnciiV[VaVabmmG]<abAKAQAQAWAW>ZY#))IS_,	1a/@@G**:+;+;S'JC,,\-?-?PE"J!&L(-A% + 1 14M MK''

:&5N)NNj!Q(A:$NNH +8^DNE>))r4   Nkey_value_states
init_cacheoutput_attentionsc                 4   |d u}|j                   d   }	| j                  |      }
|r#| j                  |      }| j                  |      }n"| j                  |      }| j                  |      }| j	                  |
      }
| j	                  |      }| j	                  |      }| j
                  r|
j                   d   |j                   d   }}| j                  dd      r[| j                  d   d   }| j                  d   d   j                   d   }t        j                  | j                  dd|dfdd||f      }n| j                  d d d d d |d |f   }t        j                  ||	f|j                   dd  z         }|N| j
                  rBt        j                  t        j                  |d      j                         }t        ||      }n(| j
                  r}n|t        j                  |d      }| j
                  r,| j                  dd      s|r| j                  |||
|      \  }}}|t        j                   |dkD  t        j"                  |j                   d      j%                  | j&                        t        j"                  |j                   t        j(                  | j&                        j*                        j%                  | j&                              }nd }d }|s*| j,                  j.                  dkD  r| j1                  d	      }t3        |
|||| j,                  j.                  d
|| j&                  d 	      }|t        j4                  d||      }t        j4                  d||      }|j7                  |j                   d d dz         }|r||f}|S |f}|S )Nr   r$   r~   r   r   )axisg        rR   T)biasdropout_rngdropout_ratebroadcast_dropoutrV   r9   	precisionz...hqk,h->...hqkz...hqk,...khd->...qhdrv   ))rx   ro   rp   rq   rz   rg   r   	variablesr   dynamic_slicers   r0   r   expand_dimsr	   r   selectfullrZ   r9   finfominr8   attention_probs_dropout_probmake_rngr   einsumrw   )rT   r*   r^   layer_head_maskr   r   rV   r   is_cross_attention
batch_sizequery_states
key_statesvalue_statesquery_length
key_length
mask_shiftmax_decoder_lengthrs   attention_biasr   attn_weightsattn_outputoutputss                          r5   ra   zFlaxBertSelfAttention.__call__&  sk    .T9"((+
 zz-0"23J::&67L -0J::m4L((6&&z2
((6 ;;'3'9'9!'<j>N>Nq>Q*L  ,7!^^G4]C
%)^^G%<\%J%P%PQR%S"!//$$q!Z&;aLRd=e #..q!]l]KZK/OP**;HYHYZ[Z\H]8]^K %$++ --coonS[.\^i^o^opN*>;GN[[(N' __^(KN ;;D--g|D
7;7Q7QL,84Jn
 % ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 &::&8,XLjj!8,U!))+*;*;BQ*?%*GH1B;- JUr4   NFTF)r,   r-   r.   r%   r2   rg   rd   r0   rc   r9   rU   rz   r|   rA   compactr   r   r1   ra   r3   r4   r5   rf   rf      s    FD{{E399":q[ ZZ* *H 37 "'_
 #3;;/_ _  _r4   rf   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxBertSelfOutputr8   r9   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                  | j                        | _
        t        j                  | j                  j                        | _        y )Nri   r9   r=   r?   )rA   rn   r8   rD   rE   rF   rG   rH   r9   denserN   rO   rP   rQ   rR   rS   s    r5   rU   zFlaxBertSelfOutput.setup  s    XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr4   rV   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S NrY   r   rR   rN   )rT   r*   input_tensorrV   s       r5   ra   zFlaxBertSelfOutput.__call__  s;    

=1]-P}|'CDr4   Nrb   r,   r-   r.   r%   r2   r0   rc   r9   rU   rd   ra   r3   r4   r5   r   r     s,    {{E399"H4 r4   r   c                   x    e Zd ZU eed<   dZeed<   ej                  Z	ej                  ed<   d Z
	 	 	 	 d	defdZy)
FlaxBertAttentionr8   Frg   r9   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y )Nrg   r9   rj   )rf   r8   rg   r9   rT   r   outputrS   s    r5   rU   zFlaxBertAttention.setup  s7    )$++dkkQUQ[Q[\	(DJJGr4   Nr   c           	          | j                  |||||||      }|d   }	| j                  |	||      }|f}
|r	|
|d   fz  }
|
S )N)r   r   r   rV   r   r   rY   r$   )rT   r   )rT   r*   r^   r   r   r   rV   r   attn_outputsr   r   s              r5   ra   zFlaxBertAttention.__call__  sl     yy+-!'/ ! 
 #1oKm\ "Q))Gr4   r   )r,   r-   r.   r%   r2   rg   rd   r0   rc   r9   rU   ra   r3   r4   r5   r   r     sG    FD{{E399"H "'  r4   r   c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxBertIntermediater8   r9   c                 4   t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        | j                  j                     | _        y Nr   )rA   rn   r8   intermediate_sizerE   rF   rG   rH   r9   r   r   
hidden_act
activationrS   s    r5   rU   zFlaxBertIntermediate.setup  s`    XXKK))++224;;3P3PQ**


 !!7!78r4   c                 J    | j                  |      }| j                  |      }|S N)r   r   ry   s     r5   ra   zFlaxBertIntermediate.__call__  s$    

=16r4   N
r,   r-   r.   r%   r2   r0   rc   r9   rU   ra   r3   r4   r5   r   r     s$    {{E399"9r4   r   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxBertOutputr8   r9   c                    t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        j                  | j                  j                        | _        t        j                  | j                  j                  | j                        | _        y )Nr   r?   r=   )rA   rn   r8   rD   rE   rF   rG   rH   r9   r   rP   rQ   rR   rN   rO   rS   s    r5   rU   zFlaxBertOutput.setup  s    XXKK##++224;;3P3PQ**


 zzt{{'F'FGdkk.H.HPTPZPZ[r4   rV   c                 v    | j                  |      }| j                  ||      }| j                  ||z         }|S r   r   )rT   r*   attention_outputrV   s       r5   ra   zFlaxBertOutput.__call__  s<    

=1]-P}7G'GHr4   Nrb   r   r3   r4   r5   r   r     s,    {{E399"\t r4   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 	 dde	ej                     de	ej                     deded	ef
d
Zy)FlaxBertLayerr8   r9   c                    t        | j                  | j                  j                  | j                        | _        t        | j                  | j                        | _        t        | j                  | j                        | _        | j                  j                  r(t        | j                  d| j                        | _
        y y )Nr   rj   F)r   r8   
is_decoderr9   	attentionr   intermediater   r   add_cross_attentioncrossattentionrS   s    r5   rU   zFlaxBertLayer.setup  s    *4;;t{{?U?U]a]g]gh0DJJO$T[[

C;;**"3DKKUYU_U_"`D +r4   Nencoder_hidden_statesencoder_attention_maskr   rV   r   c	                     | j                  ||||||      }	|	d   }
|| j                  |
|||||      }|d   }
| j                  |
      }| j                  ||
|      }|f}|r||	d   fz  }|	|d   fz  }|S )N)r   r   rV   r   r   )r^   r   r   rV   r   rY   r$   )r   r   r   r   )rT   r*   r^   r   r   r   r   rV   r   attention_outputsr   cross_attention_outputsr   s                r5   ra   zFlaxBertLayer.__call__  s     !NN+!'/ + 
 -Q/ !,&*&9&9 5 /!6+"3 ': '#  7q9))*:;M3CS`a ")!,..G$03A688r4   )NNFTF)r,   r-   r.   r%   r2   r0   rc   r9   rU   r   r1   rd   ra   r3   r4   r5   r   r     sz    {{E399"a 8<8< ""'+
  (4+ !) 5+ + +  +r4   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxBertLayerCollectionr8   r9   Fgradient_checkpointingc           	         | j                   rjt        t        d      }t        | j                  j
                        D cg c]*  } || j                  t        |      | j                        , c}| _        y t        | j                  j
                        D cg c]-  }t        | j                  t        |      | j                        / c}| _        y c c}w c c}w )N)         )static_argnums)namer9   )	r   rematr   ranger8   num_hidden_layersstrr9   layers)rT   FlaxBertCheckpointLayeris      r5   rU   zFlaxBertLayerCollection.setup*  s    &&&+M)&T# t{{<<= (#a&

SDK TYY]YdYdYvYvSwNOdkkAdjjIDK
s   /C2CNr   r   r   rV   r   output_hidden_statesreturn_dictc                    |rdnd }|	rdnd }|r|dnd }|W|j                   d   t        | j                        k7  r2t        dt        | j                         d|j                   d    d      t	        | j                        D ]@  \  }}|	r||fz  } ||||||   nd |||||      }|d   }|s,||d   fz  }|8||d   fz  }B |	r||fz  }||||f}|
st        d |D              S t        ||||	      S )
Nr3   r   z&The head_mask should be specified for z/ layers, but it is for                         .r$   rv   c              3   &   K   | ]	  }||  y wr   r3   ).0vs     r5   	<genexpr>z3FlaxBertLayerCollection.__call__.<locals>.<genexpr>l  s     =qq}=s   )last_hidden_stater*   r+   cross_attentions)rx   r   r   rm   	enumerater   r   )rT   r*   r^   	head_maskr   r   r   rV   r   r   r   all_attentionsall_hidden_statesall_cross_attentionsr   layerlayer_outputsr   s                     r5   ra   z FlaxBertLayerCollection.__call__6  sl     1d"6BD&7<Q<]rdh  q!c$++&67 <S=M<N O'ooa014 
 "$++. 	@HAu#!m%55!! ) 5	!4%&!	M *!,M =#3"55(4(]1-=,??(+	@.  -!11 "3^EYZ=G===<++%1	
 	
r4   NNFTFFTr,   r-   r.   r%   r2   r0   rc   r9   r   rd   rU   r   r1   ra   r3   r4   r5   r   r   %  s    {{E399"#(D(
" 8<8< ""'%* =

  (4=
 !) 5=
 =
 =
  =
 #=
 =
r4   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	de	fdZy)FlaxBertEncoderr8   r9   Fr   c                 f    t        | j                  | j                  | j                        | _        y )Nr9   r   )r   r8   r9   r   r  rS   s    r5   rU   zFlaxBertEncoder.setup{  s%    ,KK**#'#>#>

r4   Nr   r   r   rV   r   r   r   c                 8    | j                  |||||||||	|

      S )N)r  r   r   r   rV   r   r   r   )r  )rT   r*   r^   r  r   r   r   rV   r   r   r   s              r5   ra   zFlaxBertEncoder.__call__  s8     zz"7#9!'/!5#  
 	
r4   r  r  r3   r4   r5   r  r  v  s    {{E399"#(D(
 8<8< ""'%* 

  (4
 !) 5
 
 
  
 #
 
r4   r  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxBertPoolerr8   r9   c                     t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        y r   )
rA   rn   r8   rD   rE   rF   rG   rH   r9   r   rS   s    r5   rU   zFlaxBertPooler.setup  sH    XXKK##++224;;3P3PQ**

r4   c                 `    |d d df   }| j                  |      }t        j                  |      S )Nr   )r   rA   tanh)rT   r*   cls_hidden_states      r5   ra   zFlaxBertPooler.__call__  s1    (A.::&67ww'((r4   Nr   r3   r4   r5   r  r    s$    {{E399"
)r4   r  c                   Z    e Zd ZU eed<   ej                  Zej                  ed<   d Zd Z	y)FlaxBertPredictionHeadTransformr8   r9   c                 0   t        j                  | j                  j                  | j                        | _        t        | j                  j                     | _        t        j                  | j                  j                  | j                        | _	        y )Nrj   r=   )rA   rn   r8   rD   r9   r   r   r   r   rN   rO   rS   s    r5   rU   z%FlaxBertPredictionHeadTransform.setup  s[    XXdkk55TZZH
 !7!78dkk.H.HPTPZPZ[r4   c                 h    | j                  |      }| j                  |      }| j                  |      S r   )r   r   rN   ry   s     r5   ra   z(FlaxBertPredictionHeadTransform.__call__  s-    

=16~~m,,r4   Nr   r3   r4   r5   r  r    s%    {{E399"\
-r4   r  c                       e Zd ZU eed<   ej                  Zej                  ed<   ej                  j                  j                  Zedej                  f   ed<   d ZddZy)	FlaxBertLMPredictionHeadr8   r9   .	bias_initc                 4   t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                  d      | _        | j                  d| j                  | j                  j                  f      | _
        y )Nrj   F)r9   use_biasr   )r  r8   r9   	transformrA   rn   rC   decoderparamr!  r   rS   s    r5   rU   zFlaxBertLMPredictionHead.setup  s`    8DJJWxx 6 6djjSXYJJvt~~8N8N7PQ	r4   Nc                    | j                  |      }|+| j                  j                  dd|j                  ii|      }n| j                  |      }t	        j
                  | j                  | j                        }||z  }|S )Nparamskernel)r$  r%  applyTr0   asarrayr   r9   )rT   r*   shared_embeddingr   s       r5   ra   z!FlaxBertLMPredictionHead.__call__  st    }5' LL..8EUEWEW:X/Y[hiM LL7M{{499djj1r4   r   )r,   r-   r.   r%   r2   r0   rc   r9   rE   rA   rF   r   r!  r   npr1   rU   ra   r3   r4   r5   r   r     sL    {{E399"+.66+>+>+D+DIxRZZ(DR

r4   r   c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxBertOnlyMLMHeadr8   r9   c                 P    t        | j                  | j                        | _        y )Nrj   )r   r8   r9   predictionsrS   s    r5   rU   zFlaxBertOnlyMLMHead.setup  s    3DKKtzzRr4   Nc                 ,    | j                  ||      }|S Nr-  )r2  )rT   r*   r-  s      r5   ra   zFlaxBertOnlyMLMHead.__call__  s    ((IY(Zr4   r   r   r3   r4   r5   r0  r0    s%    {{E399"Sr4   r0  c                   P    e Zd ZU ej                  Zej
                  ed<   d Zd Zy)FlaxBertOnlyNSPHeadr9   c                 P    t        j                  d| j                        | _        y )Nrv   rj   )rA   rn   r9   seq_relationshiprS   s    r5   rU   zFlaxBertOnlyNSPHead.setup  s     "$** =r4   c                 $    | j                  |      S r   )r9  )rT   pooled_outputs     r5   ra   zFlaxBertOnlyNSPHead.__call__  s    $$]33r4   N)	r,   r-   r.   r0   rc   r9   r2   rU   ra   r3   r4   r5   r7  r7    s    {{E399">4r4   r7  c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxBertPreTrainingHeadsr8   r9   c                     t        | j                  | j                        | _        t	        j
                  d| j                        | _        y )Nrj   rv   )r   r8   r9   r2  rA   rn   r9  rS   s    r5   rU   zFlaxBertPreTrainingHeads.setup  s0    3DKKtzzR "$** =r4   Nc                 R    | j                  ||      }| j                  |      }||fS r4  )r2  r9  )rT   r*   r;  r-  prediction_scoresseq_relationship_scores         r5   ra   z!FlaxBertPreTrainingHeads.__call__  s6     ,,]M],^!%!6!6}!E "888r4   r   r   r3   r4   r5   r=  r=    s$    {{E399">9r4   r=  c                       e Zd ZU dZeZdZdZej                  e
d<   ddej                  ddfd	ed
ededej                  dedef fdZd Zddej(                  j*                  d
ededefdZd Z eej7                  d            	 	 	 	 	 	 	 	 	 	 	 	 	 ddedej(                  j*                  dedee   dee   dee   defd       Z xZS ) FlaxBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    bertNmodule_class)r$   r$   r   TFr8   input_shapeseedr9   _do_initr   c                 \     | j                   d|||d|}t        	| 	  ||||||       y )Nr8   r9   r   )rF  rG  r9   rH  r3   )rE  super__init__)
rT   r8   rF  rG  r9   rH  r   kwargsmodule	__class__s
            r5   rL  z FlaxBertPreTrainedModel.__init__  sM     #"" 
#9
 	
 	[tSXcklr4   c                 ^    | j                  | j                  | j                  d      | _        y )NTrJ  )rE  r8   r9   _modulerS   s    r5   enable_gradient_checkpointingz5FlaxBertPreTrainedModel.enable_gradient_checkpointing  s*    ((;;**#' ) 
r4   rngr(  returnc                    t        j                  |d      }t        j                  |      }t        j                  t        j                  t        j
                  |      j                  d         |      }t        j                  |      }t        j                  | j                  j                  | j                  j                  f      }t        j                  j                  |      \  }	}
|	|
d}| j                  j                  rTt        j                  || j                  j                   fz         }|}| j"                  j%                  ||||||||d	      }n"| j"                  j%                  ||||||d      }|d   }|dt'        t)        |            }t'        t)        |            }| j*                  D ]
  }||   ||<    t-               | _        t/        t1        |            S |S )NrX   rj   r   )r(  rR   F)r   r(  )r0   r   
zeros_liker   r   
atleast_2drx   	ones_likerr   r8   r   rk   rE   randomsplitr   rD   rN  initr   r   _missing_keyssetr   r   )rT   rS  rF  r(  r[   r\   r]   r^   r  
params_rngr   rngsr   r   module_init_outputsrandom_paramsmissing_keys                    r5   init_weightsz$FlaxBertPreTrainedModel.init_weights  s   IIk6		2''

3>>)3L3R3RSU3V(WYdey1HHdkk;;T[[=\=\]^	"%**"2"23"7
K$=;;**$'IIkT[[=T=T<V.V$W!%3""&++"2"2%&! #3 
# #'++"2"2iyfk #3 # ,H5(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r4   c                    t        j                  ||fd      }t        j                  |d      }t        j                  t        j                  t        j
                  |      j                  d         |j                        }| j                  j                  t        j                  j                  d      |||dd      }t        |d         S )	aW  
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        rX   rj   r   r   FT)r   r   r~   )r0   rr   rX  r   r   rW  rx   rN  r[  rE   rY  PRNGKeyr   )rT   r   r   r[   r^   r]   init_variabless          r5   r   z"FlaxBertPreTrainedModel.init_cacheF  s     HHj*5TB	y=''

3>>)3L3R3RSU3V(WYbYhYhi))JJq!9nlX]jn * 
 w/00r4   batch_size, sequence_lengthr   trainr   r   r   past_key_valuesc                 p   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        j
                  |      }|St	        j                  t	        j                  t	        j                  |      j                  d         |j                        }|t	        j                  |      }|?t	        j                  | j                   j                  | j                   j                  f      }i }|	|	|d<   d|xs | j                  i}| j                   j                  r|r	||d<   dg}nd}| j                   j#                  |t	        j$                  |d      t	        j$                  |d      t	        j$                  |d      t	        j$                  |d      t	        j$                  |d      |||
 |||||      }||r|\  }}t'        |d         |d	<   |S |"|s |\  }}|d d
 t'        |d         fz   |d
d  z   }|S | j                   j#                  |t	        j$                  |d      t	        j$                  |d      t	        j$                  |d      t	        j$                  |d      t	        j$                  |d      |
 ||||      }|S )Nr   rR   r(  r~   FrX   rj   )r\   r]   r  r   r   rV   r   r   r   r_  mutableri  r$   )r\   r]   r  rV   r   r   r   r_  )r8   r   r   r   r0   rV  r   r   rW  rx   rX  rr   r   rk   r(  r   rN  r*  r   r   )rT   r[   r^   r\   r]   r  r   r   r(  r   rh  r   r   r   ri  r_  inputsrk  r   s                      r5   ra   z FlaxBertPreTrainedModel.__call__Y  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N$++"?"?A`A`!abI ")DOF1dkk2;;** "1w")kk''		)40		.5"yytD YY|4@))IT:&;'="'i"3%9' ( G$ *{+2(-5og6N-O)* ,[+2(!"1+/'2J)K(MMPWXYXZP[["  kk''		)40		.5"yytD YY|4@))IT:"'i"3%9' ( G r4   r   )NNNNNNNNFNNNN) r,   r-   r.   r/   r%   config_classbase_model_prefixrE  rA   Moduler2   r0   rc   r   intr9   rd   rL  rR  rE   rY  re  r   rc  r   r"   BERT_INPUTS_DOCSTRINGformatdictr   ra   __classcell__)rO  s   @r5   rC  rC    so   
 L"L"))"
 $;;',mm m 	m
 yym m !%m$
(!

 2 2 (! (!PZ (!fp (!V1& ++@+G+GHe+fg "#*.,0/3&* $^ ^ ZZ''^ ^ $D>^ 'tn^ d^^ ^ h^r4   rC  c                   8   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   dZ
e	ed<   d Z	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                     deej                     deej                     de	de	de	de	de	fdZy)FlaxBertModuler8   r9   Tadd_pooling_layerFr   c                     t        | j                  | j                        | _        t	        | j                  | j                  | j
                        | _        t        | j                  | j                        | _        y )Nrj   r  )	r7   r8   r9   
embeddingsr  r   encoderr  poolerrS   s    r5   rU   zFlaxBertModule.setup  sS    ,T[[

K&KK**#'#>#>

 %T[[

Cr4   Nr\   r]   r  r   r   r   rV   r   r   r   c                    |t        j                  |      }|St        j                  t        j                  t        j                  |      j
                  d         |j
                        }| j                  |||||	      }| j                  ||||	||||
||
      }|d   }| j                  r| j                  |      nd }|s|	|f|dd  z   S ||f|dd  z   S t        |||j                  |j                  |j                        S )Nr   rY   )r  rV   r   r   r   r   r   r   r   r$   )r  pooler_outputr*   r+   r  )r0   rV  r   r   rW  rx   ry  rz  rw  r{  r   r*   r+   r  )rT   r[   r^   r\   r]   r  r   r   r   rV   r   r   r   r*   r   pooleds                   r5   ra   zFlaxBertModule.__call__  s,     ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL~|^S` ( 
 ,,'"7#9!/!5#  
  
/3/E/E]+4~%''!"+55!6*WQR[88?+ !//))$55
 	
r4   )
NNNNNFTFFT)r,   r-   r.   r%   r2   r0   rc   r9   rw  rd   r   rU   r   r1   ra   r3   r4   r5   rv  rv    s    {{E399""t"#(D(D 15.2+/7;8< ""'%* 5
 !-	5

 s{{+5
 CKK(5
  (45
 !) 55
 5
 5
  5
 #5
 5
r4   rv  z^The bare Bert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxBertModelN)r,   r-   r.   rv  rE  r3   r4   r5   r  r    s	    
 "Lr4   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)FlaxBertForPreTrainingModuler8   r9   Fr   c                     t        | j                  | j                  | j                        | _        t        | j                  | j                        | _        y )NrJ  r8   r9   )rv  r8   r9   r   rD  r=  clsrS   s    r5   rU   z"FlaxBertForPreTrainingModule.setup  s=    ";;**#'#>#>
	
 ,4;;djjQr4   rV   r   r   r   c
                 L   | j                  |||||||||		      }
| j                  j                  r#| j                   j                  d   d   d   d   }nd }|
d   }|
d   }| j	                  |||      \  }}|	s
||f|
d	d  z   S t        |||
j                  |
j                  
      S )NrV   r   r   r   r(  ry  rI   	embeddingr   r$   r5  rv   )r(   r)   r*   r+   )rD  r8   tie_word_embeddingsr   r  r'   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   r   r-  r*   r;  r@  rA  s                   r5   ra   z%FlaxBertForPreTrainingModule.__call__  s     ))'/!5#  

 ;;**#yy228<\JK\]^ij#

48HH=;K 5= 5
11 %'=>LL+/$:!//))	
 	
r4   NTFFTr,   r-   r.   r%   r2   r0   rc   r9   r   rd   rU   ra   r3   r4   r5   r  r    sf    {{E399"#(D(R #"'%* -
 -
  -
 #-
 -
r4   r  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                       e Zd ZeZy)FlaxBertForPreTrainingN)r,   r-   r.   r  rE  r3   r4   r5   r  r  J  s	     0Lr4   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.seq_relationship_logits
    ```
rg  )output_typerm  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)FlaxBertForMaskedLMModuler8   r9   Fr   c                     t        | j                  d| j                  | j                        | _        t        | j                  | j                        | _        y NF)r8   rw  r9   r   r  rv  r8   r9   r   rD  r0  r  rS   s    r5   rU   zFlaxBertForMaskedLMModule.setupv  @    ";;#**#'#>#>	
	 'dkkLr4   rV   r   r   r   c
                 6   | j                  |||||||||		      }
|
d   }| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  ||      }|	s	|f|
dd  z   S t        ||
j                  |
j                  	      S )
Nr  r   r(  ry  rI   r  r5  r$   logitsr*   r+   )rD  r8   r  r   r  r   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   r   r*   r-  r  s                 r5   ra   z"FlaxBertForMaskedLMModule.__call__  s     ))'/!5#  

  
;;**#yy228<\JK\]^ij# -:JK9wqr{**!!//))
 	
r4   Nr  r  r3   r4   r5   r  r  q  sf    {{E399"#(D(M  #"'%* )
 )
  )
 #)
 )
r4   r  z2Bert Model with a `language modeling` head on top.c                       e Zd ZeZy)FlaxBertForMaskedLMN)r,   r-   r.   r  rE  r3   r4   r5   r  r    s    ,Lr4   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)'FlaxBertForNextSentencePredictionModuler8   r9   Fr   c                     t        | j                  | j                  | j                        | _        t        | j                        | _        y )NrJ  rj   )rv  r8   r9   r   rD  r7  r  rS   s    r5   rU   z-FlaxBertForNextSentencePredictionModule.setup  s7    ";;**#'#>#>
	
 'TZZ8r4   rV   r   r   r   c
                     |	|	n| j                   j                  }	| j                  |||||||||		      }
|
d   }| j                  |      }|	s	|f|
dd  z   S t	        ||
j
                  |
j                        S )Nr  r$   rv   r  )r8   r   rD  r  r   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   r   r;  seq_relationship_scoress                r5   ra   z0FlaxBertForNextSentencePredictionModule.__call__  s     &1%<k$++BYBY ))'/!5#  

  
"&((="9+-;;.*!//))
 	
r4   Nr  r  r3   r4   r5   r  r    se    {{E399"#(D(9 #"'%* %
 %
  %
 #%
 %
r4   r  zJBert Model with a `next sentence prediction (classification)` head on top.c                       e Zd ZeZy)!FlaxBertForNextSentencePredictionN)r,   r-   r.   r  rE  r3   r4   r5   r  r    s	    
 ;Lr4   r  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction

    >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")

    >>> outputs = model(**encoding)
    >>> logits = outputs.logits
    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
    ```
c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)'FlaxBertForSequenceClassificationModuler8   r9   Fr   c                    t        | j                  | j                  | j                        | _        | j                  j
                  | j                  j
                  n| j                  j                  }t        j                  |      | _	        t        j                  | j                  j                  | j                        | _        y )NrJ  r?   rj   rv  r8   r9   r   rD  classifier_dropoutrQ   rA   rP   rR   rn   
num_labels
classifierrT   r  s     r5   rU   z-FlaxBertForSequenceClassificationModule.setup  s    ";;**#'#>#>
	 {{--9 KK**00 	
 zz'9:((KK""**
r4   rV   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j
                        S )Nr  r$   rY   rv   r  )rD  rR   r  r   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   r   r;  r  s                r5   ra   z0FlaxBertForSequenceClassificationModule.__call__%  s     ))'/!5#  

  
]-P/9wqr{**+!//))
 	
r4   Nr  r  r3   r4   r5   r  r    se    {{E399"#(D(
0 #"'%* $
 $
  $
 #$
 $
r4   r  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd ZeZy)!FlaxBertForSequenceClassificationN)r,   r-   r.   r  rE  r3   r4   r5   r  r  L  s	     ;Lr4   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)FlaxBertForMultipleChoiceModuler8   r9   Fr   c                    t        | j                  | j                  | j                        | _        t        j                  | j                  j                        | _        t        j                  d| j                        | _
        y )NrJ  r?   r$   rj   )rv  r8   r9   r   rD  rA   rP   rQ   rR   rn   r  rS   s    r5   rU   z%FlaxBertForMultipleChoiceModule.setupd  sW    ";;**#'#>#>
	
 zzt{{'F'FG((1DJJ7r4   rV   r   r   r   c
                 <   |j                   d   }
||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }| j                  |||||||||		      }|d   }| j                  ||      }| j	                  |      }|j                  d|
      }|	s	|f|dd  z   S t        ||j                  |j                        S )Nr$   r   r  rY   rv   r  )rx   rw   rD  rR   r  r   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   num_choicesr   r;  r  reshaped_logitss                  r5   ra   z(FlaxBertForMultipleChoiceModule.__call__m  sH     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ))'/!5#  

  
]-P/ ..[9#%33,"!//))
 	
r4   Nr  r  r3   r4   r5   r  r  _  se    {{E399"#(D(8  #"'%* ,
 ,
  ,
 #,
 ,
r4   r  z
    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZy)FlaxBertForMultipleChoiceN)r,   r-   r.   r  rE  r3   r4   r5   r  r    s	     3Lr4   r  z(batch_size, num_choices, sequence_lengthc            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)$FlaxBertForTokenClassificationModuler8   r9   Fr   c                    t        | j                  | j                  d| j                        | _        | j                  j
                  | j                  j
                  n| j                  j                  }t        j                  |      | _	        t        j                  | j                  j                  | j                        | _        y )NFr8   r9   rw  r   r?   rj   r  r  s     r5   rU   z*FlaxBertForTokenClassificationModule.setup  s    ";;**##'#>#>	
	 {{--9 KK**00 	
 zz'9:((4;;#9#9Lr4   rV   r   r   r   c
                     | j                  |||||||||		      }
|
d   }| j                  ||      }| j                  |      }|	s	|f|
dd  z   S t        ||
j                  |
j
                        S )Nr  r   rY   r$   r  )rD  rR   r  r   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   r   r*   r  s                r5   ra   z-FlaxBertForTokenClassificationModule.__call__  s     ))'/!5#  

  
]-P/9wqr{**(!//))
 	
r4   Nr  r  r3   r4   r5   r  r    sf    {{E399"#(D(M, #"'%* $
 $
  $
 #$
 $
r4   r  z
    Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZy)FlaxBertForTokenClassificationN)r,   r-   r.   r  rE  r3   r4   r5   r  r    s	     8Lr4   r  c            	           e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 dde	de	de	d	e	fd
Zy)"FlaxBertForQuestionAnsweringModuler8   r9   Fr   c                     t        | j                  | j                  d| j                        | _        t        j                  | j                  j                  | j                        | _        y )NFr  rj   )	rv  r8   r9   r   rD  rA   rn   r  
qa_outputsrS   s    r5   rU   z(FlaxBertForQuestionAnsweringModule.setup  sJ    ";;**##'#>#>	
	 ((4;;#9#9Lr4   rV   r   r   r   c
                 b   | j                  |||||||||		      }
|
d   }| j                  |      }t        j                  || j                  j
                  d      \  }}|j                  d      }|j                  d      }|	s
||f|
dd  z   S t        |||
j                  |
j                        S )Nr  r   r   r   r$   )start_logits
end_logitsr*   r+   )
rD  r  r0   rZ  r8   r  squeezer   r*   r+   )rT   r[   r^   r\   r]   r  rV   r   r   r   r   r*   r  r  r  s                  r5   ra   z+FlaxBertForQuestionAnsweringModule.__call__  s     ))'/!5#  

  
/#&99VT[[5K5KRT#U j#++B/''+
 *-;;/%!!//))	
 	
r4   Nr  r  r3   r4   r5   r  r    sf    {{E399"#(D(M  #"'%* (
 (
  (
 #(
 (
r4   r  z
    Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZy)FlaxBertForQuestionAnsweringN)r,   r-   r.   r  rE  r3   r4   r5   r  r  3  s	     6Lr4   r  c                   
   e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     de	de	de	de	de	fdZy)FlaxBertForCausalLMModuler8   r9   Fr   c                     t        | j                  d| j                  | j                        | _        t        | j                  | j                        | _        y r  r  rS   s    r5   rU   zFlaxBertForCausalLMModule.setupK  r  r4   Nr\   r  r   r   r   rV   r   r   r   c                 R   | j                  |||||||||	|
||      }|d   }| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  ||      }|s	|f|dd  z   S t        ||j                  |j                  |j                  	      S )
N)r   r   r   rV   r   r   r   r   r(  ry  rI   r  r5  r$   )r  r*   r+   r  )	rD  r8   r  r   r  r   r*   r+   r  )rT   r[   r^   r]   r\   r  r   r   r   rV   r   r   r   r   r*   r-  r  s                    r5   ra   z"FlaxBertForCausalLMModule.__call__T  s      ))"7#9!'/!5#  
  
;;**#yy228<\JK\]^ij# -:JK9wqr{**4!//))$55	
 	
r4   )	NNNNFTFFTr  r3   r4   r5   r  r  F  s    {{E399"#(D(M 15+/7;8< ""'%* 0

 !-0
 CKK(0
  (40
 !) 50
 0
 0
  0
 #0
 0
r4   r  z
    Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
    autoregressive tasks.
    c                   >    e Zd ZeZddeej                     fdZd Z	y)FlaxBertForCausalLMNr^   c                 H   |j                   \  }}| j                  ||      }t        j                  ||fd      }|-|j	                  d      dz
  }t        j                  ||d      }n4t        j                  t        j                  |d      d d d f   ||f      }|||dS )NrX   rj   r   r   r$   )r   r   )ri  r^   r]   )	rx   r   r0   rr   cumsumr   r   r   r   )	rT   r[   r   r^   r   
seq_lengthri  extended_attention_maskr]   s	            r5   prepare_inputs_for_generationz1FlaxBertForCausalLM.prepare_inputs_for_generation  s    !*
J//*jA #&((J
+C4"P%)00b09A=L&)&>&>?VXfhn&o#++CJJz,NtUVw,WZdfpYqrL  /5(
 	
r4   c                 L    |j                   |d<   |d   d d dd f   dz   |d<   |S )Nri  r]   r   r$   )ri  )rT   model_outputsmodel_kwargss      r5   update_inputs_for_generationz0FlaxBertForCausalLM.update_inputs_for_generation  s8    *7*G*G&''3N'CArsF'Ka'O^$r4   r   )
r,   r-   r.   r  rE  r   rE   Arrayr  r  r3   r4   r5   r  r    s'     -L
S[\_\e\eSf 
*r4   r  )dtypingr   r   r   flax
flax.linenlinenrA   rE   	jax.numpynumpyr0   r.  flax.core.frozen_dictr   r   r   r	   r
   r   nn_partitioningflax.linen.attentionr   flax.traverse_utilr   r   r   modeling_flax_outputsr   r   r   r   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr    r!   r"   r#   configuration_bertr%   
get_loggerr,   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCr   struct	dataclassr'   BERT_START_DOCSTRINGrq  ro  r7   rf   r   r   r   r   r   r   r  r  r  r   r0  r7  r=  rC  rv  r  r  r  #FLAX_BERT_FOR_PRETRAINING_DOCSTRINGrr  r  r  r  r  &FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRINGr  r  r  r  r  r  r  r  r  r  r3   r4   r5   <module>r     s    - ,   
   > > 6 6 > ;     g f * 
		H	%5  4; 4 4:. `$ N( (VhBII hV ('		 'T299 $RYY (6BII 6rN
bii N
b$
bii $
N)RYY )"-bii -ryy .	")) 	4")) 49ryy 9@1 @FD
RYY D
N d"+ "	" ],?A_ap q:
299 :
z  04 00' #&   !>?Bee !(DSb
7
		 7
t NPde-1 - f- 02EGY[j k2
bii 2
j T;(? ;	;* &, %  !>?Bhh !%3Rap
:
bii :
z  ;(? ;; % 	:
bii :
z  3 7 33 4;;<fg 24QSb
8
299 8
v  8%< 88 "$79RTc
6
 6
r  6#: 66  $	>
		 >
B  1 < )	r4   