
    sgC                     l   d dl mZmZmZ d dlZd dlmZ d dlZd dl	m
Z d dl
Zd dlmZmZmZ d dlmZ d dlmZmZ d dlmZ ddlmZmZmZmZmZmZmZ dd	lm Z m!Z!m"Z"m#Z#m$Z$ dd
l%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+  e)jX                  e-      Z.dZ/dZ0ejb                  jd                   G d de&             Z3dZ4dZ5 G d dejl                        Z7 G d dejl                        Z8 G d dejl                        Z9 G d dejl                        Z: G d dejl                        Z; G d dejl                        Z< G d d ejl                        Z= G d! d"ejl                        Z> G d# d$ejl                        Z? G d% d&e!      Z@ G d' d(ejl                        ZA e'd)e4       G d* d+e@             ZB e"eBe/ee0        G d, d-ejl                        ZC e'd.e4       G d/ d0e@             ZDd1ZE e$eDe5j                  d2      eEz           e#eDe3e03        G d4 d5ejl                        ZG e'd6e4       G d7 d8e@             ZH e"eHe/ee0d9:        G d; d<ejl                        ZI e'd=e4       G d> d?e@             ZJ e"eJe/ee0        G d@ dAejl                        ZK e'dBe4       G dC dDe@             ZL e$eLe5j                  dE              e"eLe/ee0        G dF dGejl                        ZM e'dHe4       G dI dJe@             ZN e"eNe/ee0        G dK dLejl                        ZO e'dMe4       G dN dOe@             ZP e"ePe/ee0       g dPZQy)Q    )CallableOptionalTupleN)
FrozenDictfreezeunfreeze)dot_product_attention_weights)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxBaseModelOutputWithPoolingFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstring append_replace_return_docstringsoverwrite_call_docstring)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )AlbertConfigzalbert/albert-base-v2r   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	e
eej                        ed<   dZe
eej                        ed<   y)FlaxAlbertForPreTrainingOutputaB  
    Output type of [`FlaxAlbertForPreTraining`].

    Args:
        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nprediction_logits
sop_logitshidden_states
attentions)__name__
__module____qualname____doc__r"   jnpndarray__annotations__r#   r$   r   r   r%        b/var/www/html/venv/lib/python3.12/site-packages/transformers/models/albert/modeling_flax_albert.pyr!   r!   6   sV    , &*s{{)"J"26M8E#++./6/3Js{{+,3r.   r!   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`AlbertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxAlbertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                    t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _	        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _        t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _        t        j                  | j                  j                  | j                         | _        t        j"                  | j                  j$                        | _        y )N)stddev)embedding_initepsilonr3   rate)nnEmbedr2   
vocab_sizeembedding_sizejaxinitializersnormalinitializer_rangeword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr3   Dropouthidden_dropout_probdropoutselfs    r/   setupzFlaxAlbertEmbeddings.setup   s5   !xxKK""KK&&66..55T[[=Z=Z5[ 

 $&88KK//KK&&66..55T[[=Z=Z5[$
 
 &(XXKK''KK&&66..55T[[=Z=Z5[&
"
 dkk.H.HPTPZPZ[zzt{{'F'FGr.   deterministicc                    | j                  |j                  d            }| j                  |j                  d            }| j                  |j                  d            }||z   |z   }| j	                  |      }| j                  ||      }|S )Ni4rP   )rC   astyperE   rG   rH   rL   )	rN   	input_idstoken_type_idsposition_idsrP   inputs_embedsposition_embedsrG   r$   s	            r/   __call__zFlaxAlbertEmbeddings.__call__   s    ,,Y-=-=d-CD22<3F3Ft3LM $ : :>;P;PQU;V W &(==O }5]-Pr.   NT)r&   r'   r(   r)   r   r,   r*   float32r3   rO   boolrZ   r-   r.   r/   r1   r1      s/    Q{{E399"H&t r.   r1   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxAlbertSelfAttentionr2   r3   c                    | j                   j                  | j                   j                  z  dk7  rt        d      t	        j
                  | j                   j                  | j                  t        j                  j                  j                  | j                   j                              | _        t	        j
                  | j                   j                  | j                  t        j                  j                  j                  | j                   j                              | _        t	        j
                  | j                   j                  | j                  t        j                  j                  j                  | j                   j                              | _        t	        j
                  | j                   j                  t        j                  j                  j                  | j                   j                        | j                        | _        t	        j                  | j                   j                   | j                        | _        t	        j"                  | j                   j$                        | _        y )Nr   z`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`                    : {self.config.num_attention_heads})r3   kernel_initra   r3   r7   r9   )r2   hidden_sizenum_attention_heads
ValueErrorr;   Denser3   r?   r@   rA   rB   querykeyvaluedenserH   rI   rJ   rK   rL   rM   s    r/   rO   zFlaxAlbertSelfAttention.setup   s   ;;""T[[%D%DDII 
 XXKK##**++224;;3P3PQ


 88KK##**++224;;3P3PQ

 XXKK##**++224;;3P3PQ


 XXKK##++224;;3P3PQ**


 dkk.H.HPTPZPZ[zzt{{'F'FGr.   output_attentionsc                 P   | j                   j                  | j                   j                  z  }| j                  |      j	                  |j
                  d d | j                   j                  |fz         }| j                  |      j	                  |j
                  d d | j                   j                  |fz         }| j                  |      j	                  |j
                  d d | j                   j                  |fz         }|t        j                  |d      }t        j                  |dkD  t        j                  |j
                  d      j                  | j                        t        j                  |j
                  t        j                  | j                        j                         j                  | j                              }	nd }	d }
|s*| j                   j"                  dkD  r| j%                  d      }
t'        |||	|
| j                   j"                  d|| j                  d 	      }t        j(                  d	||      }|j	                  |j
                  d d d
z         }| j+                  |      }| j-                  ||      }| j/                  ||z         }|r||f}|S |f}|S )N   )axisr   g        rL   T)biasdropout_rngdropout_ratebroadcast_dropoutrP   r3   	precisionz...hqk,...khd->...qhd)rS   )r2   rc   rd   rg   reshapeshaperi   rh   r*   expand_dimsr   selectfullrT   r3   finfominattention_probs_dropout_probmake_rngr	   einsumrj   rL   rH   )rN   r$   attention_maskrP   rk   head_dimquery_statesvalue_states
key_statesattention_biasrs   attn_weightsattn_outputprojected_attn_outputlayernormed_attn_outputoutputss                   r/   rZ   z FlaxAlbertSelfAttention.__call__   sY   ;;**dkk.M.MMzz-088#t{{'F'F&QQ
 zz-088#t{{'F'F&QQ
 XXm,44#t{{'F'F&QQ


 % __^(KN ZZ"--s3::4::F--syy/D/H/HIPPQUQ[Q[\N "N!I!IC!O--	2K4#AA"'**

 jj!8,U!))+*;*;BQ*?%*GH $

; 7 $-BR_ `"&..1F1V"W=N*L9 VmTnr.   NTFr&   r'   r(   r   r,   r*   r\   r3   rO   r]   rZ   r-   r.   r/   r_   r_      s-    {{E399"H<0]a 0r.   r_   c                   j    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 dde	de	fdZ
y)	FlaxAlbertLayerr2   r3   c                 >   t        | j                  | j                        | _        t	        j
                  | j                  j                  t        j                  j                  j                  | j                  j                        | j                        | _        t        | j                  j                     | _        t	        j
                  | j                  j                  t        j                  j                  j                  | j                  j                        | j                        | _        t	        j"                  | j                  j$                  | j                        | _        t	        j(                  | j                  j*                        | _        y )Nr3   rb   r7   r9   )r_   r2   r3   	attentionr;   rf   intermediate_sizer?   r@   rA   rB   ffnr   
hidden_act
activationrc   
ffn_outputrH   rI   full_layer_layer_normrJ   rK   rL   rM   s    r/   rO   zFlaxAlbertLayer.setup  s    0DJJO88KK))++224;;3P3PQ**

 !!7!78((KK##++224;;3P3PQ**

 &(\\$++:T:T\`\f\f%g"zzt{{'F'FGr.   rP   rk   c                 
   | j                  ||||      }|d   }| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }| j                  ||z         }|f}|r	||d   fz  }|S )NrP   rk   r   rS   r   )r   r   r   r   rL   r   )	rN   r$   r   rP   rk   attention_outputsattention_outputr   r   s	            r/   rZ   zFlaxAlbertLayer.__call__)  s     !NN>Zk + 
 -Q/XX./
__Z0
__Z0
\\*M\J
22:@P3PQ ")!,..Gr.   Nr   r   r-   r.   r/   r   r     sA    {{E399"H( #"' 	
  r.   r   c                   p    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 d	de	de	de	fdZ
y)
FlaxAlbertLayerCollectionr2   r3   c           	          t        | j                  j                        D cg c]-  }t        | j                  t	        |      | j
                        / c}| _        y c c}w )N)namer3   )ranger2   inner_group_numr   strr3   layersrN   is     r/   rO   zFlaxAlbertLayerCollection.setupE  sE    QVW[WbWbWrWrQs
LMODKKc!fDJJG
 
s   2ArP   rk   output_hidden_statesc                     d}d}t        | j                        D ]*  \  }}	 |	||||      }
|
d   }|r	||
d   fz   }|s%||fz   }, |f}|r||fz   }|r||fz   }|S )Nr-   r   r   r   )	enumerater   )rN   r$   r   rP   rk   r   layer_hidden_stateslayer_attentionslayer_indexalbert_layerlayer_outputr   s               r/   rZ   z"FlaxAlbertLayerCollection.__call__J  s     !)24;;)? 	M%K'+"3	L )OM #3|A6H#H #&9]<L&L#	M !"!4 66G!1 33Gr.   NTFFr   r-   r.   r/   r   r   A  sM    {{E399"
 #"'%* 	
   #r.   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	e
   ed<   d Z	 	 	 d
dededefd	Zy)FlaxAlbertLayerCollectionsr2   r3   Nr   c                 P    t        | j                  | j                        | _        y )Nr   )r   r2   r3   albert_layersrM   s    r/   rO   z FlaxAlbertLayerCollections.setupq  s    6t{{$**Ur.   rP   rk   r   c                 2    | j                  |||||      }|S NrP   rk   r   )r   )rN   r$   r   rP   rk   r   r   s          r/   rZ   z#FlaxAlbertLayerCollections.__call__t  s/     $$'/!5 % 
 r.   r   )r&   r'   r(   r   r,   r*   r\   r3   r   r   r   rO   r]   rZ   r-   r.   r/   r   r   l  s\    {{E399"!%K#%V #"'%* 	
   #r.   r   c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxAlbertLayerGroupsr2   r3   c           
          t        | j                  j                        D cg c]7  }t        | j                  t	        |      t	        |      | j
                        9 c}| _        y c c}w )N)r   r   r3   )r   r2   num_hidden_groupsr   r   r3   r   r   s     r/   rO   zFlaxAlbertLayerGroups.setup  sQ     4;;889
 't{{QSQRV[_[e[ef
 
s   <A'rP   rk   r   return_dictc                    |rdnd }|r|fnd }t        | j                  j                        D ]m  }	t        |	| j                  j                  | j                  j                  z  z        }
 | j
                  |
   |||||      }|d   }|r||d   z   }|sh||fz   }o |st        d |||fD              S t        |||      S )Nr-   r   r   rw   c              3   &   K   | ]	  }||  y wNr-   ).0vs     r/   	<genexpr>z1FlaxAlbertLayerGroups.__call__.<locals>.<genexpr>  s     hqZ[Zghs   )last_hidden_stater$   r%   )r   r2   num_hidden_layersintr   r   tupler   )rN   r$   r   rP   rk   r   r   all_attentionsall_hidden_statesr   	group_idxlayer_group_outputs               r/   rZ   zFlaxAlbertLayerGroups.__call__  s      1d0D],$t{{445 	IAA!>!>A^A^!^_`I!7Y!7+"3%9" /q1M !/2DR2H!H#$58H$H!!	I$ h]4E~$Vhhh"+;LYg
 	
r.   NTFFTr   r-   r.   r/   r   r     sZ    {{E399"
 #"'%* "
 	"

  "
 #"
 "
r.   r   c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxAlbertEncoderr2   r3   c                 <   t        j                  | j                  j                  t        j                   j
                  j                  | j                  j                        | j                        | _	        t        | j                  | j                        | _        y )Nrb   r   )r;   rf   r2   rc   r?   r@   rA   rB   r3   embedding_hidden_mapping_inr   albert_layer_groupsrM   s    r/   rO   zFlaxAlbertEncoder.setup  sb    +-88KK##++224;;3P3PQ**,
(
 $9DJJ#W r.   rP   rk   r   r   c                 P    | j                  |      }| j                  |||||      S r   )r   r   )rN   r$   r   rP   rk   r   r   s          r/   rZ   zFlaxAlbertEncoder.__call__  s;     88G'''/!5 ( 
 	
r.   Nr   r   r-   r.   r/   r   r     s[    {{E399"X #"'%* 
 	

  
 #
 
r.   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   ej                  j                  j                  Zedej                  f   ed<   d ZddZy)	FlaxAlbertOnlyMLMHeadr2   r3   .	bias_initc                    t        j                  | j                  j                  | j                        | _        t        | j                  j                     | _        t        j                  | j                  j                  | j                        | _	        t        j                  | j                  j                  | j                  d      | _        | j                  d| j                  | j                  j                  f      | _        y )Nr   r7   F)r3   use_biasrr   )r;   rf   r2   r>   r3   rj   r   r   r   rH   rI   r=   decoderparamr   rr   rM   s    r/   rO   zFlaxAlbertOnlyMLMHead.setup  s    XXdkk88

K
 !7!78dkk.H.HPTPZPZ[xx 6 6djjSXYJJvt~~8N8N7PQ	r.   Nc                    | j                  |      }| j                  |      }| j                  |      }|+| j                  j	                  dd|j
                  ii|      }n| j                  |      }|| j                  z  }|S )Nparamskernel)rj   r   rH   r   applyTrr   )rN   r$   shared_embeddings      r/   rZ   zFlaxAlbertOnlyMLMHead.__call__  s|    

=16}5' LL..8EUEWEW:X/Y[hiM LL7M"r.   r   )r&   r'   r(   r   r,   r*   r\   r3   r?   r;   r@   zerosr   r   npr+   rO   rZ   r-   r.   r/   r   r     sL    {{E399"+.66+>+>+D+DIxRZZ(DRr.   r   c                   \    e Zd ZU eed<   ej                  Zej                  ed<   d ZddZ	y)FlaxAlbertSOPHeadr2   r3   c                     t        j                  | j                  j                        | _        t        j
                  d| j                        | _        y )Nrm   r   )r;   rJ   r2   classifier_dropout_probrL   rf   r3   
classifierrM   s    r/   rO   zFlaxAlbertSOPHead.setup  s2    zz$++"E"EF((1DJJ7r.   c                 N    | j                  ||      }| j                  |      }|S )NrS   )rL   r   )rN   pooled_outputrP   logitss       r/   rZ   zFlaxAlbertSOPHead.__call__  s'    ]-P/r.   Nr[   )
r&   r'   r(   r   r,   r*   r\   r3   rO   rZ   r-   r.   r/   r   r     s$    {{E399"8r.   r   c                   z    e Zd ZU dZeZdZdZej                  e
d<   ddej                  dfded	ed
edej                  def
 fdZddej&                  j(                  d	ededefdZ eej3                  d            	 	 	 	 	 	 	 	 	 ddedej&                  j(                  dedee   dee   dee   fd       Z xZS )FlaxAlbertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    albertNmodule_class)r   r   r   Tr2   input_shapeseedr3   _do_initc                 Z     | j                   d||d|}t        | 	  ||||||       y )Nr2   r3   )r   r   r3   r   r-   )r   super__init__)	rN   r2   r   r   r3   r   kwargsmodule	__class__s	           r/   r   z"FlaxAlbertPreTrainedModel.__init__  s=     #""H&HH[tSXcklr.   rngr   returnc                 |   t        j                  |d      }t        j                  |      }t        j                  t        j                  t        j
                  |      j                  d         |      }t        j                  |      }t        j                  j                  |      \  }}	||	d}
| j                  j                  |
||||d      d   }|dt        t        |            }t        t        |            }| j                  D ]
  }||   ||<    t!               | _        t#        t%        |            S |S )NrR   r   rw   )r   rL   F)r   r   )r*   r   
zeros_likebroadcast_toarange
atleast_2dry   	ones_liker?   randomsplitr   initr
   r   _missing_keyssetr   r   )rN   r   r   r   rU   rV   rW   r   
params_rngrs   rngsrandom_paramsmissing_keys                r/   init_weightsz&FlaxAlbertPreTrainedModel.init_weights  s   IIk6		2''

3>>)3L3R3RSU3V(WYdey1"%**"2"23"7
K$=(()^^\W\ ) 

 (-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r.   batch_size, sequence_lengthrs   trainrk   r   r   c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|t	        j
                  |      }|St	        j                  t	        j                  t	        j                  |      j                  d         |j                        }|t	        j                  |      }i }|||d<   | j                  j                  d|xs | j                  it	        j                  |d      t	        j                  |d      t	        j                  |d      t	        j                  |d      | ||	|
|
      S )Nrw   rL   r   rR   r   )r  )r2   rk   r   r   r*   r   r   r   r   ry   r   r   r   r   array)rN   rU   r   rV   rW   r   rs   r
  rk   r   r   r  s               r/   rZ   z"FlaxAlbertPreTrainedModel.__call__*  sE    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY ! ^^I6N++CJJs~~i7P7V7VWY7Z,[]f]l]lmL! ]]95N ")DO{{  v,-IIit,IInD1IInD1IIl$/I  ! 
 	
r.   r   )	NNNNNFNNN)r&   r'   r(   r)   r   config_classbase_model_prefixr   r;   Moduler,   r*   r\   r   r   r3   r]   r   r?   r   PRNGKeyr   r  r   ALBERT_INPUTS_DOCSTRINGformatdictr   rZ   __classcell__)r   s   @r/   r   r     s7   
  L "L"))"
 $;;
m
m 
m 	
m
 yy
m 
m!

 2 2 ! !PZ !fp !0 ++B+I+IJg+hi *.,0/3&*-
 -
 ZZ''-
 -
 $D>-
 'tn-
 d^-
 j-
r.   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   dZe	ed<   d Z
	 	 	 	 	 	 ddeej                     deej                     d	e	d
e	de	de	fdZy)FlaxAlbertModuler2   r3   Tadd_pooling_layerc                    t        | j                  | j                        | _        t	        | j                  | j                        | _        | j                  rt        j                  | j                  j                  t        j                  j                  j                  | j                  j                        | j                  d      | _        t        j                  | _        y d | _        d | _        y )Nr   pooler)ra   r3   r   )r1   r2   r3   
embeddingsr   encoderr  r;   rf   rc   r?   r@   rA   rB   r  tanhpooler_activationrM   s    r/   rO   zFlaxAlbertModule.setup`  s    .t{{$**M(DJJG!!((''FF//66t{{7T7TUjj	DK &(WWD"DK%)D"r.   NrV   rW   rP   rk   r   r   c	                     |t        j                  |      }|St        j                  t        j                  t        j                  |      j
                  d         |j
                        }| j                  ||||      }	| j                  |	|||||      }
|
d   }	| j                  r*| j                  |	d d df         }| j                  |      }nd }|s|	|	f|
dd  z   S |	|f|
dd  z   S t        |	||
j                  |
j                        S )Nrw   rS   rP   rk   r   r   r   r   )r   pooler_outputr$   r%   )r*   r   r   r   r   ry   r  r  r  r  r  r   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   r$   r   pooleds               r/   rZ   zFlaxAlbertModule.__call__o  s*    ! ^^I6N ++CJJs~~i7P7V7VWY7Z,[]f]l]lmL	><_lm,,'/!5#  
  
!![[q!t!45F++F3FF~%''!"+55!6*WQR[88-+ !//))	
 	
r.   )NNTFFT)r&   r'   r(   r   r,   r*   r\   r3   r  r]   rO   r   r   r+   rZ   r-   r.   r/   r  r  [  s    {{E399""t"*& 04-1""'%* /
 !,	/

 rzz*/
 /
  /
 #/
 /
r.   r  z`The bare Albert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxAlbertModelN)r&   r'   r(   r  r   r-   r.   r/   r#  r#    s	    
 $Lr.   r#  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxAlbertForPreTrainingModuler2   r3   c                     t        | j                  | j                        | _        t	        | j                  | j                        | _        t        | j                  | j                        | _        y )Nr   )r  r2   r3   r   r   predictionsr   sop_classifierrM   s    r/   rO   z$FlaxAlbertForPreTrainingModule.setup  sF    &dkkL04::V/t{{$**Ur.   rP   rk   r   r   c	           
      h   | j                  ||||||||      }	| j                  j                  r#| j                   j                  d   d   d   d   }
nd }
|	d   }|	d   }| j	                  ||
      }| j                  ||	      }|s
||f|	d
d  z   S t        |||	j                  |	j                        S )Nr  r   r  rC   	embeddingr   r   r   rS   rm   )r"   r#   r$   r%   )	r   r2   tie_word_embeddings	variablesr'  r(  r!   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   r   r   r$   r   prediction_scores
sop_scoress                  r/   rZ   z'FlaxAlbertForPreTrainingModule.__call__  s     ++'/!5#  	
 ;;**#{{44X>|LM^_`kl#

 ,,]M],^((m(T
%z2WQR[@@-/!!//))	
 	
r.   Nr   r   r-   r.   r/   r%  r%    s[    {{E399"V #"'%* *
 *
  *
 #*
 *
r.   r%  z
    Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
    `sentence order prediction (classification)` head.
    c                       e Zd ZeZy)FlaxAlbertForPreTrainingN)r&   r'   r(   r%  r   r-   r.   r/   r1  r1    s	     2Lr.   r1  a  
    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining

    >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2")

    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
    >>> outputs = model(**inputs)

    >>> prediction_logits = outputs.prediction_logits
    >>> seq_relationship_logits = outputs.sop_logits
    ```
r	  )output_typer  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxAlbertForMaskedLMModuler2   r3   c                     t        | j                  d| j                        | _        t	        | j                  | j                        | _        y )NF)r2   r  r3   r   )r  r2   r3   r   r   r'  rM   s    r/   rO   z!FlaxAlbertForMaskedLMModule.setup  s4    &dkkUZ^ZdZde04::Vr.   rP   rk   r   r   c	           
      4   | j                  ||||||||      }	|	d   }
| j                  j                  r#| j                   j                  d   d   d   d   }nd }| j	                  |
|      }|s	|f|	dd  z   S t        ||	j                  |	j                  	      S )
Nr  r   r   r  rC   r*  r+  r   r   r$   r%   )r   r2   r,  r-  r'  r   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   r   r$   r   r   s                r/   rZ   z$FlaxAlbertForMaskedLMModule.__call__  s     ++'/!5#  	
  
;;**#{{44X>|LM^_`kl# !!-BR!S9wqr{**!!//))
 	
r.   Nr   r   r-   r.   r/   r4  r4  	  s[    {{E399"W #"'%* '
 '
  '
 #'
 '
r.   r4  z4Albert Model with a `language modeling` head on top.c                       e Zd ZeZy)FlaxAlbertForMaskedLMN)r&   r'   r(   r4  r   r-   r.   r/   r9  r9  ;  s    .Lr.   r9  z
refs/pr/11)revisionc            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	))FlaxAlbertForSequenceClassificationModuler2   r3   c                 ~   t        | j                  | j                        | _        | j                  j                  | j                  j                  n| j                  j
                  }t        j                  |      | _        t        j                  | j                  j                  | j                        | _        y )Nr   r9   r   r  r2   r3   r   r   rK   r;   rJ   rL   rf   
num_labelsr   rN   classifier_dropouts     r/   rO   z/FlaxAlbertForSequenceClassificationModule.setupI  s    &dkkL {{22> KK//00 	
 zz'9:((KK""**
r.   rP   rk   r   r   c	           
          | j                  ||||||||      }	|	d   }
| j                  |
|      }
| j                  |
      }|s	|f|	dd  z   S t        ||	j                  |	j
                        S )Nr  r   rS   rm   r7  )r   rL   r   r   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   r   r   r   s               r/   rZ   z2FlaxAlbertForSequenceClassificationModule.__call__V  s     ++'/!5#  	
  
]-P/9wqr{**+!//))
 	
r.   Nr   r   r-   r.   r/   r<  r<  E  sZ    {{E399"
& #"'%* "
 "
  "
 #"
 "
r.   r<  z
    Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd ZeZy)#FlaxAlbertForSequenceClassificationN)r&   r'   r(   r<  r   r-   r.   r/   rD  rD  {  s	     =Lr.   rD  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)!FlaxAlbertForMultipleChoiceModuler2   r3   c                     t        | j                  | j                        | _        t	        j
                  | j                  j                        | _        t	        j                  d| j                        | _	        y )Nr   r9   r   r   )
r  r2   r3   r   r;   rJ   rK   rL   rf   r   rM   s    r/   rO   z'FlaxAlbertForMultipleChoiceModule.setup  sH    &dkkLzzt{{'F'FG((1DJJ7r.   rP   rk   r   r   c	           
      :   |j                   d   }	||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }||j                  d|j                   d         nd }| j                  ||||||||      }
|
d   }| j                  ||      }| j	                  |      }|j                  d|	      }|s	|f|
dd  z   S t        ||
j                  |
j                        S )Nr   rw   r  rS   rm   r7  )ry   rx   r   rL   r   r   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   num_choicesr   r   r   reshaped_logitss                 r/   rZ   z*FlaxAlbertForMultipleChoiceModule.__call__  sE     ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMquQ_Qk//N4H4H4LMquKWKc|++B0B0B20FGim ++'/!5#  	
  
]-P/ ..[9#%33,"!//))
 	
r.   Nr   r   r-   r.   r/   rF  rF    sZ    {{E399"8 #"'%* *
 *
  *
 #*
 *
r.   rF  z
    Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZy)FlaxAlbertForMultipleChoiceN)r&   r'   r(   rF  r   r-   r.   r/   rL  rL    s	     5Lr.   rL  z(batch_size, num_choices, sequence_lengthc            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)&FlaxAlbertForTokenClassificationModuler2   r3   c                    t        | j                  | j                  d      | _        | j                  j                  | j                  j                  n| j                  j
                  }t        j                  |      | _        t        j                  | j                  j                  | j                        | _        y )NFr2   r3   r  r9   r   r>  r@  s     r/   rO   z,FlaxAlbertForTokenClassificationModule.setup  s    &dkk_de {{22> KK//00 	
 zz'9:((4;;#9#9Lr.   rP   rk   r   r   c	           
          | j                  ||||||||      }	|	d   }
| j                  |
|      }
| j                  |
      }|s	|f|	dd  z   S t        ||	j                  |	j
                        S )Nr  r   rS   r   r7  )r   rL   r   r   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   r   r$   r   s               r/   rZ   z/FlaxAlbertForTokenClassificationModule.__call__  s     ++'/!5#  	
  
]-P/9wqr{**(!//))
 	
r.   Nr   r   r-   r.   r/   rN  rN    s[    {{E399"M  #"'%* "
 "
  "
 #"
 "
r.   rN  z
    Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZy) FlaxAlbertForTokenClassificationN)r&   r'   r(   rN  r   r-   r.   r/   rS  rS    s	     :Lr.   rS  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)$FlaxAlbertForQuestionAnsweringModuler2   r3   c                     t        | j                  | j                  d      | _        t	        j
                  | j                  j                  | j                        | _        y )NFrP  r   )r  r2   r3   r   r;   rf   r?  
qa_outputsrM   s    r/   rO   z*FlaxAlbertForQuestionAnsweringModule.setup$  s;    &dkk_de((4;;#9#9Lr.   rP   rk   r   r   c	           
      V   | j                  ||||||||      }	|	d   }
| j                  |
      }|j                  | j                  j                  d      \  }}|j                  d      }|j                  d      }|s
||f|	dd  z   S t        |||	j                  |	j                        S )Nr  r   rw   rp   r   )start_logits
end_logitsr$   r%   )	r   rW  r   r2   r?  squeezer   r$   r%   )rN   rU   r   rV   rW   rP   rk   r   r   r   r$   r   rY  rZ  s                 r/   rZ   z-FlaxAlbertForQuestionAnsweringModule.__call__(  s     ++'/!5#  	
  
/#)<<0F0FR<#P j#++B/''+
 *-;;/%!!//))	
 	
r.   Nr   r   r-   r.   r/   rU  rU     s[    {{E399"M #"'%* &
 &
  &
 #&
 &
r.   rU  z
    Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZy)FlaxAlbertForQuestionAnsweringN)r&   r'   r(   rU  r   r-   r.   r/   r]  r]  Q  s	     8Lr.   r]  )r   r#  r1  r9  rD  rL  rS  r]  )Rtypingr   r   r   flax
flax.linenlinenr;   r?   	jax.numpynumpyr*   r   flax.core.frozen_dictr   r   r   flax.linen.attentionr	   flax.traverse_utilr
   r   r   modeling_flax_outputsr   r   r   r   r   r   r   modeling_flax_utilsr   r   r   r   r   utilsr   r   r   r   configuration_albertr   
get_loggerr&   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCstruct	dataclassr!   ALBERT_START_DOCSTRINGr  r  r1   r_   r   r   r   r   r   r   r   r   r  r#  r%  r1  %FLAX_ALBERT_FOR_PRETRAINING_DOCSTRINGr  r4  r9  r<  rD  rF  rL  rN  rS  rU  r]  __all__r-   r.   r/   <module>rt     s    - ,   
   > > > ;     g f . 
		H	%-   4[ 4 4:! F B%299 %PRbii Rj)bii )X(		 (V 4,
BII ,
^
		 
>BII 4		 \
 3 \
~C
ryy C
L f$/ $	$ _.ACacr s3
RYY 3
l  28 22) %& ""#@ADii !*HWf
/
")) /
d PRhi/5 / j/ .0BO^j
3
		 3
l  =*C == ' 	3
		 3
l  5"; 55 !8!?!?@j!k !	0
RYY 0
f  :'@ :: $	.
299 .
b  8%> 88 "$		r.   