
    sgu                        d dl Z d dlmZmZmZ d dlmZ d dlZd dl	m
Z d dl
Zd dlmZmZmZ d dlmZmZ d dlmZ ddlmZmZmZmZmZmZ ddlmZmZmZm Z  dd	l!m"Z"m#Z#m$Z$ d
dl%m&Z&  e$jN                  e(      Z)dZ*dZ+dZ,dZ-d Z.d Z/ G d dej`                        Z1 G d dej`                        Z2 G d dej`                        Z3 G d dej`                        Z4 G d dej`                        Z5 G d dej`                        Z6 G d dej`                        Z7 G d  d!e      Z8 G d" d#ej`                        Z9 e"d$e,       G d% d&e8             Z: ee:e*de+        G d' d(ej`                        Z; e"d)e,       G d* d+e8             Z< ee<e*ee+        G d, d-ej`                        Z= e"d.e,       G d/ d0e8             Z> ee>e*ee+        G d1 d2ej`                        Z? e"d3e,       G d4 d5e8             Z@ e e@e-j                  d6              ee@e*ee+        G d7 d8ej`                        ZB e"d9e,       G d: d;e8             ZC eeCe*ee+        G d< d=ej`                        ZD e"d>e,       G d? d@e8             ZE eeEe*ee+       y)A    N)CallableOptionalTuple)
FrozenDictfreezeunfreeze)flatten_dictunflatten_dict)lax   )FlaxBaseModelOutputFlaxMaskedLMOutputFlaxMultipleChoiceModelOutput FlaxQuestionAnsweringModelOutputFlaxSequenceClassifierOutputFlaxTokenClassifierOutput)ACT2FNFlaxPreTrainedModelappend_call_sample_docstringoverwrite_call_docstring)add_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DistilBertConfigzdistilbert-base-uncasedr   a  

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)

    This model is also a
    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
    behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`DistilBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`numpy.ndarray` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
c                 v    dt        j                  dd|dz  z  t        j                  |      z        z  }| |z  S )Nr   i'     )nppowerfloat32)posid_modelangle_ratess       j/var/www/html/venv/lib/python3.12/site-packages/transformers/models/distilbert/modeling_flax_distilbert.py
get_anglesr&   `   s8    bhhuqAF|rzz'7J&JKKK    c                    t        t        j                  |       d d t        j                  f   t        j                  |      t        j                  d d f   |      }t        j                  |d d dd df         |d d dd df<   t        j
                  |d d dd df         |d d dd df<   |t        j                  df   }t        j                  |      S )Nr   r   r   .)r&   r   arangenewaxissincosjnparray)positionr#   
angle_radspos_encodings       r%   positional_encodingr2   e   s    BIIh/2::>		'@RSUS]S]_`S`@acjkJ &&Aqt!tG!45Jq!$Q$w &&Aqt!tG!45Jq!$Q$wbjj#o.L99\""r'   c                   f    e Zd ZU dZeed<   ej                  Zej                  ed<   d Z	dde
fdZy)	FlaxEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.configdtypec                 R   t        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _	        | j                  j                  st        j                  | j                  j                  | j                  j                  t
        j                   j                  j                  | j                  j                              | _        n9t        | j                  j                  | j                  j                        | _        t        j                  d| j                         | _        t        j"                  | j                  j$                        | _        y )Nstddev)embedding_init-q=epsilonr6   rate)nnEmbedr5   
vocab_sizedimjaxinitializersnormalinitializer_rangeword_embeddingssinusoidal_pos_embdsmax_position_embeddingsposition_embeddingsr2   r1   	LayerNormr6   Dropoutdropoutselfs    r%   setupzFlaxEmbeddings.setupz   s   !xxKK""KKOO66..55T[[=Z=Z5[ 

 {{//')xx33"vv2299A^A^9_(D$ !4DKK4W4WY]YdYdYhYh iDe4::Fzzt{{':':;r'   deterministicc                    |j                   \  }}| j                  |j                  d            }| j                  j                  s^t        j                  |      j                  d      }t        j                  |||f      }| j                  |j                  d            }n3| j                  d d d |d d f   }|j                  |j                        }||z   }| j                  |      }| j                  ||      }|S )Ni4)shaperR   )rU   rH   astyper5   rI   r-   r)   broadcast_torK   r1   r6   rL   rN   )	rP   	input_idsrR   
batch_size
seq_lengthinputs_embedsposition_idsposition_embedshidden_statess	            r%   __call__zFlaxEmbeddings.__call__   s    !*
J,,Y-=-=d-CD{{//::j188>L++LZ@XYL"66|7J7J47PQO"//;J;0ABO-44]5H5HIO &7 }5]-Pr'   NT)__name__
__module____qualname____doc__r   __annotations__r-   r    r6   rQ   boolr`    r'   r%   r4   r4   t   s.    Q{{E399"<" r'   r4   c                   j    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 dde	de	fdZ
y)	FlaxMultiHeadSelfAttentionr5   r6   c                    | j                   j                  | _        | j                   j                  | _        t        j                  | j                   j
                        | _        | j                  | j                  z  dk(  s%t        d| j                   d| j                         t        j                  | j                  | j                  t        j                  j                  j                  | j                   j                              | _        t        j                  | j                  | j                  t        j                  j                  j                  | j                   j                              | _        t        j                  | j                  | j                  t        j                  j                  j                  | j                   j                              | _        t        j                  | j                  | j                  t        j                  j                  j                  | j                   j                              | _        y )Nr>   r   Hidden size " not dividable by number of heads r8   r6   kernel_init)r5   n_headsrC   r@   rM   attention_dropoutrN   
ValueErrorDenser6   rD   rE   rF   rG   q_link_linv_linout_linrO   s    r%   rQ   z FlaxMultiHeadSelfAttention.setup   s   {{**;;??zzt{{'D'DE4<<'1,|DHH:5WX\XdXdWefggXXHH**++22$++:W:W2X


 XXHH**++22$++:W:W2X


 XXHH**++22$++:W:W2X


 xxHH**++22$++:W:W2X
r'   rR   output_attentionsc           	          |j                   \  }}|j                   d   }	 j                   j                  z  dd|	f}
 fd} fd} | j                  |            } | j	                  |            } | j                  |            }|t        j                        z  }t        j                  ||j                  dddd            }t        j                  ||
      }|j                  |j                        }|dd|z
  z  z
  }t        j                  |d	
      } j!                  ||      }t        j                  ||      } ||      } j#                  |      }|r||fS |fS )Nr   c                 d    | j                  dj                        j                  dddd      S )zseparate headsr   r   r   r   )reshaperp   	transposexbsdim_per_headrP   s    r%   rU   z2FlaxMultiHeadSelfAttention.__call__.<locals>.shape   s/    99RT\\<@JJ1aQRTUVVr'   c                 h    | j                  dddd      j                  dj                  z        S )zgroup headsr   r   r   r   r{   )r}   r|   rp   r~   s    r%   unshapez4FlaxMultiHeadSelfAttention.__call__.<locals>.unshape   s0    ;;q!Q*222r4<<,;VWWr'   r   r   r   gꌠ9Y>)Fg      ?r{   axisrV   )rU   rC   rp   rt   ru   rv   mathsqrtr-   matmulr}   r|   rW   r6   r@   softmaxrN   rw   )rP   querykeyvaluemaskrR   rx   q_lenrC   k_len
mask_reshprU   r   qkvscoresweightscontextr   r   s   `                  @@r%   r`   z#FlaxMultiHeadSelfAttention.__call__   sP    E3		! xx4<</!Q&
	W	X $**U#$$**S/"$**U#$		,''Aq{{1aA67{{4,{{6<<($#*--**V"-,,wm,D**Wa('",,w'W%%:r'   N)TFrb   rc   rd   r   rf   r-   r    r6   rQ   rg   r`   rh   r'   r%   rj   rj      sA    {{E399"
F #"'/ /  /r'   rj   c                   b    e Zd ZU eed<   ej                  Zej                  ed<   d Zdde	fdZ
y)FlaxFFNr5   r6   c                    t        j                  | j                  j                        | _        | j                  j                  | _        d| _        t        j                  | j                  j                  | j                  t        j                   j                  j                  | j                  j                              | _        t        j                  | j                  j                  | j                  t        j                   j                  j                  | j                  j                              | _        t         | j                  j"                     | _        y )Nr>   r   r8   rn   )r@   rM   r5   rN   chunk_size_feed_forwardseq_len_dimrs   
hidden_dimr6   rD   rE   rF   rG   lin1rC   lin2r   
activationrO   s    r%   rQ   zFlaxFFN.setup   s    zzt{{':':;'+{{'J'J$HHKK""**++22$++:W:W2X
	
 HHKKOO**++22$++:W:W2X
	 !!7!78r'   rR   c                     | j                  |      }| j                  |      }| j                  |      }| j                  ||      }|S )NrV   )r   r   r   rN   )rP   r_   rR   s      r%   r`   zFlaxFFN.__call__	  sD    		-06		-0]-Pr'   Nra   r   rh   r'   r%   r   r      s+    {{E399"9"T r'   r   c                   j    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 dde	de	fdZ
y)	FlaxTransformerBlockr5   r6   c                    | j                   j                  | j                   j                  z  dk(  s5J d| j                   j                   d| j                   j                          t        | j                   | j                        | _        t        j                  d| j                        | _        t        | j                   | j                        | _
        t        j                  d| j                        | _        y )Nr   rl   rm   r6   r;   r<   )r5   rC   rp   rj   r6   	attentionr@   rL   sa_layer_normr   ffnoutput_layer_normrO   s    r%   rQ   zFlaxTransformerBlock.setup  s    KKOOdkk111Q6	c$++//**LT[[M`M`Lab	c6 4DKKtzzR\\%tzzJ4;;djj9!#e4::!Nr'   rx   rR   c                     | j                  ||||||      }|r|\  }}nt        |      t        u sJ |d   }| j                  ||z         }| j	                  ||      }| j                  ||z         }|f}|rf|z   }|S )N)r   r   r   r   rx   rR   r   rV   )r   typetupler   r   r   )	rP   r_   	attn_maskrx   rR   	sa_output
sa_weights
ffn_outputoutputs	            r%   r`   zFlaxTransformerBlock.__call__   s     NN/' # 
	 $-!Iz	?e+++!!I&&y='@A	 XXi}XE
++J,BC
 ]V+Fr'   N)FTr   rh   r'   r%   r   r     sA    {{E399"	O #("  	
 r'   r   c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxTransformerr5   r6   c           	          t        | j                  j                        D cg c]-  }t        | j                  t	        |      | j
                        / c}| _        y c c}w )N)namer6   )ranger5   n_layersr   strr6   layers)rP   r"   s     r%   rQ   zFlaxTransformer.setupD  sF    V[\`\g\g\p\pVq
QR 3q6L
 
s   2Arx   output_hidden_statesrR   return_dictc                 $   |rdnd }|rdnd }| j                   D ]I  }	|r||fz   } |	||||      }
|
d   }|rt        |
      dk(  sJ |
d   }||fz   }:t        |
      dk(  rIJ  |r||fz   }|st        d |||fD              S t        |||      S )	Nrh   )r_   r   rx   rR   r{   r   r   r   c              3   &   K   | ]	  }||  y wNrh   ).0r   s     r%   	<genexpr>z+FlaxTransformer.__call__.<locals>.<genexpr>m  s     hqZ[Zghs   )last_hidden_stater_   
attentions)r   lenr   r   )rP   r_   attention_maskrx   r   rR   r   all_hidden_statesall_attentionslayer_modulelayer_outputsr   s               r%   r`   zFlaxTransformer.__call__I  s     #7BD0d KK 	/L#$58H$H!(+("3+	M *"-M =)Q...*1-
!/:-!?=)Q...#	/(   1]4D Dh]NDU$Vhhh"+;LYg
 	
r'   NFFTFr   rh   r'   r%   r   r   @  sZ    {{E399"
 #(%*"!'
  	'

 #'
 '
 '
r'   r   c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxTransformerEncoderr5   r6   c                 P    t        | j                  | j                        | _        y Nr   )r   r5   r6   layerrO   s    r%   rQ   zFlaxTransformerEncoder.setupw  s    $T[[

C
r'   rx   r   rR   r   c                 0    | j                  ||||||      S )N)r_   r   rx   r   rR   r   )r   )rP   r_   r   rx   r   rR   r   s          r%   r`   zFlaxTransformerEncoder.__call__z  s,     zz')/!5'#  
 	
r'   Nr   r   rh   r'   r%   r   r   s  s[    {{E399"D #(%*"!
  	

 #
 
 
r'   r   c                       e Zd ZU eed<   ej                  Zej                  ed<   ej                  j                  j                  Zedej                  f   ed<   d Zd Zy)FlaxDistilBertLMDecoderr5   r6   .	bias_initc                 r    | j                  d| j                  | j                  j                  f      | _        y )Nbias)paramr   r5   rB   r   rO   s    r%   rQ   zFlaxDistilBertLMDecoder.setup  s'    JJvt~~8N8N7PQ	r'   c                 6   t        j                  || j                        }t        j                  || j                        }t        j                  |||j
                  dz
  fdfdf      }t        j                  | j                  | j                        }||z   }|S )Nr   )r   )rh   rh   )r-   asarrayr6   r   dot_generalndimr   )rP   inputskernelyr   s        r%   r`   z FlaxDistilBertLMDecoder.__call__  sw    VTZZ0VTZZ0OOFFv{{Q.@$-G,RS{{499djj1Hr'   N)rb   rc   rd   r   rf   r-   r    r6   rD   r@   rE   zerosr   r   r   ndarrayrQ   r`   rh   r'   r%   r   r     sL    {{E399"+.66+>+>+D+DIxRZZ(DRr'   r   c                   x    e Zd ZU dZeZdZdZej                  e
d<   ddej                  dfded	ed
edej                  def
 fdZddej&                  j(                  d	ededefdZ eej3                  d            	 	 	 	 	 	 	 	 ddedej&                  j(                  dedee   dee   dee   fd       Z xZS )FlaxDistilBertPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    
distilbertNmodule_class)r   r   r   Tr5   input_shapeseedr6   _do_initc                 Z     | j                   d||d|}t        | 	  ||||||       y )Nr5   r6   )r   r   r6   r   rh   )r   super__init__)	rP   r5   r   r   r6   r   kwargsmodule	__class__s	           r%   r   z&FlaxDistilBertPreTrainedModel.__init__  s=     #""H&HH[tSXcklr'   rngparamsreturnc                    t        j                  |d      }t        j                  |      }t        j                  j                  |      \  }}||d}| j                  j                  |||d      d   }	|dt        t        |	            }	t        t        |            }| j                  D ]
  }
|	|
   ||
<    t               | _
        t        t        |            S |	S )NrT   r   )r   rN   F)r   r   )r-   r   	ones_likerD   randomsplitr   initr	   r   _missing_keyssetr   r
   )rP   r   r   r   rY   r   
params_rngdropout_rngrngsrandom_paramsmissing_keys              r%   init_weightsz*FlaxDistilBertPreTrainedModel.init_weights  s    IIk6	y1"%**"2"23"7
K$=((y.V[(\]ef(-)@AM!(6"23F#11 A&3K&@{#A!$D.011  r'   zbatch_size, sequence_lengthr   trainrx   r   r   c
           
         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|t	        j
                  |      }i }
|||
d<   | j                  j                  d|xs | j                  it	        j                  |d      t	        j                  |d      | |||	|
      S )NrN   r   rT   r   )r   )
r5   rx   r   r   r-   r   r   applyr   r.   )rP   rY   r   	head_maskr   r   r   rx   r   r   r   s              r%   r`   z&FlaxDistilBertPreTrainedModel.__call__  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY! ]]95N ")DO{{  v,-IIit,IInD1I  ! 	
 		
r'   r   )NNNNFNNN)rb   rc   rd   re   r   config_classbase_model_prefixr   r@   Modulerf   r-   r    r   intr6   rg   r   rD   r   PRNGKeyr   r   r   DISTILBERT_INPUTS_DOCSTRINGformatdictr   r`   __classcell__)r   s   @r%   r   r     s4   
 $L$"L"))"
 $;;
m 
m 
m 	
m
 yy
m 
m!

 2 2 ! !PZ !fp !( ++F+M+MNk+lm *.,0/3&*#

 #
 ZZ''#
 #
 $D>#
 'tn#
 d^#
 n#
r'   r   c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxDistilBertModuler5   r6   c                     t        | j                  | j                        | _        t	        | j                  | j                        | _        y r   )r4   r5   r6   
embeddingsr   transformerrO   s    r%   rQ   zFlaxDistilBertModule.setup  s/    (DJJG1$++TZZPr'   rR   rx   r   r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  ||||||      S )NrV   )r_   r   rR   rx   r   r   )r5   rx   r   r   r  r  )rP   rY   r   rR   rx   r   r   input_embedss           r%   r`   zFlaxDistilBertModule.__call__  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBYyN&)'/!5#   
 	
r'   NTFFTr   rh   r'   r%   r  r    s[    {{E399"Q #"'%* 
 	

  
 #
 
r'   r  zdThe bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd ZeZy)FlaxDistilBertModelN)rb   rc   rd   r  r   rh   r'   r%   r  r    s	    
 (Lr'   r  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)FlaxDistilBertForMaskedLMModuler5   r6   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                  t        j                  j                  j                  | j                  j                              | _        t	        j                  d| j                        | _        | j                  j                  r't        | j                  | j                        | _        y t	        j
                  | j                  j"                  | j                  t        j                  j                  j                  | j                  j                              | _        y )Nr   r8   rn   r;   r<   )r  r5   r6   r   r@   rs   rC   rD   rE   rF   rG   vocab_transformrL   vocab_layer_normtie_word_embeddingsr   vocab_projectorrB   rO   s    r%   rQ   z%FlaxDistilBertForMaskedLMModule.setup   s    .t{{$**M!xxKKOO**++22$++:W:W2X 

 !#U$** M;;**#:jj$D 
 $&88&&jjFF//66dkk>[>[6\$D r'   rR   rx   r   r   c                     ||n| j                   j                  }| j                  ||||||      }|d   }| j                  |      }	t	        | j                   j
                     |	      }	| j                  |	      }	| j                   j                  r?| j                  j                  d   d   d   d   }
| j                  |	|
j                        }	n| j                  |	      }	|s|	f|dd  z   }|S t        |	|j                  |j                        S )	N)rY   r   rx   r   rR   r   r   r   r  rH   	embeddingr   logitsr_   r   )r5   use_return_dictr   r  r   r   r  r  	variablesr  Tr   r_   r   )rP   rY   r   rR   rx   r   r   dlbrt_outputr_   prediction_logitsshared_embeddingr   s               r%   r`   z(FlaxDistilBertForMaskedLMModule.__call__4  s&    &1%<k$++B]B])/!5'# ' 
 %Q 00?"4;;#9#9:;LM 112CD;;**#88B<PQbcdop $ 4 45FHXHZHZ [ $ 4 45F G')L,<<FM!$&44#..
 	
r'   Nr  r   rh   r'   r%   r  r    sZ    {{E399"0 #"'%* &
 	&

  &
 #&
 &
r'   r  z8DistilBert Model with a `language modeling` head on top.c                       e Zd ZeZy)FlaxDistilBertForMaskedLMN)rb   rc   rd   r  r   rh   r'   r%   r&  r&  ]  s    2Lr'   r&  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)-FlaxDistilBertForSequenceClassificationModuler5   r6   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                  t        j                  j                  j                  | j                  j                              | _        t	        j                  | j                  j                        | _        t	        j
                  | j                  j                  | j                        | _        y )Nr   r8   rn   r>   r   )r  r5   r6   r   r@   rs   rC   rD   rE   rF   rG   pre_classifierrM   seq_classif_dropoutrN   
num_labels
classifierrO   s    r%   rQ   z3FlaxDistilBertForSequenceClassificationModule.setupi  s    .dkkT hhKKOO**++22$++:W:W2X

 zzt{{'F'FG((KK""**
r'   rR   rx   r   r   c                 `   ||n| j                   j                  }| j                  ||||||      }|d   }|d d df   }	| j                  |	      }	t	        d   |	      }	| j                  |	|      }	| j                  |	      }
|s	|
f|dd  z   S t        |
|j                  |j                        S )NrR   rx   r   r   r   relurV   r   r  )
r5   r  r   r*  r   rN   r-  r   r_   r   )rP   rY   r   rR   rx   r   r   distilbert_outputhidden_statepooled_outputr  s              r%   r`   z6FlaxDistilBertForSequenceClassificationModule.__call__v  s     &1%<k$++B]B] OO'/!5# , 
 )+$QT*++M:v}5]-P/90444++99(33
 	
r'   Nr  r   rh   r'   r%   r(  r(  e  sZ    {{E399"
" #"'%* !
 	!

  !
 #!
 !
r'   r(  z
    DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd ZeZy)'FlaxDistilBertForSequenceClassificationN)rb   rc   rd   r(  r   rh   r'   r%   r5  r5    s
     ALr'   r5  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)%FlaxDistilBertForMultipleChoiceModuler5   r6   c                    t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                  t        j                  j                  j                  | j                  j                              | _        t	        j                  | j                  j                        | _        t	        j
                  d| j                        | _        y )Nr   r8   rn   r>   r   r   )r  r5   r6   r   r@   rs   rC   rD   rE   rF   rG   r*  rM   r+  rN   r-  rO   s    r%   rQ   z+FlaxDistilBertForMultipleChoiceModule.setup  s    .dkkT hhKKOO**++22$++:W:W2X

 zzt{{'F'FG((**
r'   rR   rx   r   r   c                 .   ||n| j                   j                  }|j                  d   }||j                  d|j                  d         nd }||j                  d|j                  d         nd }| j	                  ||||||      }|d   }	|	d d df   }
| j                  |
      }
t        d   |
      }
| j                  |
|      }
| j                  |
      }|j                  d|      }|s	|f|dd  z   S t        ||j                  |j                        S )	Nr   r{   r/  r   r0  rV   r   r  )r5   r  rU   r|   r   r*  r   rN   r-  r   r_   r   )rP   rY   r   rR   rx   r   r   num_choicesoutputsr2  r3  r  reshaped_logitss                r%   r`   z.FlaxDistilBertForMultipleChoiceModule.__call__  s7    &1%<k$++B]B]ooa(BKBWI%%b)//"*=>]a	Q_Qk//N4H4H4LMqu //'/!5# " 
 qz$QT*++M:v}5]-P/ ..[9#%33,"!//))
 	
r'   Nr  r   rh   r'   r%   r7  r7    sZ    {{E399"
" #"'%* (
 	(

  (
 #(
 (
r'   r7  z
    DistilBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    c                       e Zd ZeZy)FlaxDistilBertForMultipleChoiceN)rb   rc   rd   r7  r   rh   r'   r%   r>  r>    s	     9Lr'   r>  z(batch_size, num_choices, sequence_lengthc            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)*FlaxDistilBertForTokenClassificationModuler5   r6   c                 "   t        | j                  | j                        | _        t	        j
                  | j                  j                        | _        t	        j                  | j                  j                  | j                        | _	        y )Nr   r>   r   )
r  r5   r6   r   r@   rM   rN   rs   r,  r-  rO   s    r%   rQ   z0FlaxDistilBertForTokenClassificationModule.setup  sR    .dkkTzzt{{':':;((4;;#9#9Lr'   rR   rx   r   r   c                    ||n| j                   j                  }| j                  ||||||      }|d   }| j                  ||      }| j	                  |      }	|s	|	f|dd  z   S t        |	|j                  |j                        S )Nr/  r   rV   r   r  )r5   r  r   rN   r-  r   r_   r   )
rP   rY   r   rR   rx   r   r   r;  r_   r  s
             r%   r`   z3FlaxDistilBertForTokenClassificationModule.__call__  s     &1%<k$++B]B]//'/!5# " 
  
]-P/9wqr{**(!//))
 	
r'   Nr  r   rh   r'   r%   r@  r@    s[    {{E399"M #"'%* 
 	

  
 #
 
r'   r@  z
    DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    c                       e Zd ZeZy)$FlaxDistilBertForTokenClassificationN)rb   rc   rd   r@  r   rh   r'   r%   rD  rD  *  s	     >Lr'   rD  c            	       v    e Zd ZU eed<   ej                  Zej                  ed<   d Z	 	 	 	 d
de	de	de	de	fdZ
y	)(FlaxDistilBertForQuestionAnsweringModuler5   r6   c                 X   t        | j                  | j                        | _        t	        j
                  | j                  j                  | j                        | _        | j                  j                  dk(  sJ t	        j                  | j                  j                        | _
        y )Nr   r   r   r>   )r  r5   r6   r   r@   rs   r,  
qa_outputsrM   
qa_dropoutrN   rO   s    r%   rQ   z.FlaxDistilBertForQuestionAnsweringModule.setupA  sj    .dkkT((4;;#9#9L{{%%***zzt{{'='=>r'   rR   rx   r   r   c                    ||n| j                   j                  }| j                  ||||||      }|d   }| j                  ||      }| j	                  |      }	|	j                  | j                   j                  d      \  }
}|
j                  d      }
|j                  d      }|s
|
|f|dd  z   S t        |
||j                  |j                        S )Nr/  r   rV   r{   r   r   )start_logits
end_logitsr_   r   )r5   r  r   rN   rH  r   r,  squeezer   r_   r   )rP   rY   r   rR   rx   r   r   r1  r_   r  rK  rL  s               r%   r`   z1FlaxDistilBertForQuestionAnsweringModule.__call__G  s     &1%<k$++B]B] !OO'/!5# , 
 *!,]-P/#)<<0F0FR<#P j#++B/''+
 *-0A!"0EEE/%!+99(33	
 	
r'   Nr  r   rh   r'   r%   rF  rF  =  sZ    {{E399"? #"'%* %
 	%

  %
 #%
 %
r'   rF  z
    DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd ZeZy)"FlaxDistilBertForQuestionAnsweringN)rb   rc   rd   rF  r   rh   r'   r%   rO  rO  o  s	     <Lr'   rO  )Fr   typingr   r   r   
flax.linenlinenr@   rD   	jax.numpynumpyr-   r   flax.core.frozen_dictr   r   r   flax.traverse_utilr	   r
   r   modeling_flax_outputsr   r   r   r   r   r   modeling_flax_utilsr   r   r   r   utilsr   r   r   configuration_distilbertr   
get_loggerrb   logger_CHECKPOINT_FOR_DOC_CONFIG_FOR_DOCFLAX_DISTILBERT_START_DOCSTRINGr  r&   r2   r  r4   rj   r   r   r   r   r   r   r  r  r  r&  r(  r5  r7  r>  r  r@  rD  rF  rO  rh   r'   r%   <module>r`     s     , ,  
   > > ;   w v Y Y 6 
		H	%/ $# . 6
#*RYY *ZP Pfbii :,299 ,^0
bii 0
f
RYY 
4bii "N
$7 N
b
299 
D j#(7 (	( 02Et_ ]>
bii >
B TVuv3 = 3 w3 68KM_ap q2
BII 2
j  $A.K AA + 	9
BII 9
x  $9&C 99 #%@%G%GHr%s #!	(
 (
V  $>+H >> (	/
ryy /
d  $<)F << &$	r'   