
    sg                   
   d Z ddlmZ ddlZddlmZmZmZmZm	Z	 ddl
ZddlZddlmZ ddlmZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZ dd	l m!Z!m"Z"m#Z# dd
l$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*  e(jV                  e,      Z-dZ.dZ/ G d dej`                  jb                        Z2 G d dej`                  jb                        Z3 G d dej`                  jb                        Z4 G d dej`                  jb                        Z5 G d dej`                  jb                        Z6 G d dej`                  jb                        Z7 G d dej`                  jb                        Z8 G d dej`                  jb                        Z9 G d d ej`                  jb                        Z: G d! d"ej`                  jb                        Z;d# Z<d$ Z=d% Z>d& Z?d' Z@ G d( d)ej`                  jb                        ZA G d* d+ej`                  jb                        ZB G d, d-ej`                  jb                        ZC G d. d/ej`                  jb                        ZD G d0 d1ej`                  jb                        ZE G d2 d3ej`                  jb                        ZF G d4 d5e      ZGd6ZHd7ZI e&d8eH       G d9 d:eG             ZJ e&d;eH       G d< d=eGe             ZK e&d>eH       G d? d@eGe             ZL e&dAeH       G dB dCeGe             ZM e&dDeH       G dE dFeGe             ZNy)GzTF 2.0 DeBERTa model.    )annotationsN)DictOptionalSequenceTupleUnion   )get_tf_activation)TFBaseModelOutputTFMaskedLMOutputTFQuestionAnsweringModelOutputTFSequenceClassifierOutputTFTokenClassifierOutput)	TFMaskedLanguageModelingLossTFModelInputTypeTFPreTrainedModelTFQuestionAnsweringLossTFSequenceClassificationLossTFTokenClassificationLossget_initializerkerasunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr!   zkamalkraj/deberta-basec                  D     e Zd Zd fdZdddZedd       Zd	dZ xZS )
TFDebertaContextPoolerc                    t        |   di | t        j                  j	                  |j
                  d      | _        t        |j                  d      | _	        || _
        y )Ndensenamedropout )super__init__r   layersDensepooler_hidden_sizer%   TFDebertaStableDropoutpooler_dropoutr(   configselfr1   kwargs	__class__s      b/var/www/html/venv/lib/python3.12/site-packages/transformers/models/deberta/modeling_tf_deberta.pyr+   zTFDebertaContextPooler.__init__9   sO    "6"\\''(A(A'P
-f.C.C)T    c                    |d d df   }| j                  ||      }| j                  |      } t        | j                  j                        |      }|S )Nr   training)r(   r%   r
   r1   pooler_hidden_act)r3   hidden_statesr:   context_tokenpooled_outputs        r6   callzTFDebertaContextPooler.call?   sT     &ad+]XF

=1H)$++*G*GHWr7   c                .    | j                   j                  S N)r1   hidden_sizer3   s    r6   
output_dimz!TFDebertaContextPooler.output_dimH   s    {{&&&r7   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr%   r(   )
builtgetattrtf
name_scoper%   r'   buildr1   r.   r(   r3   input_shapes     r6   rJ   zTFDebertaContextPooler.buildL   s    ::
4$'3tzz/ O

  $dkk.L.L!MNO4D)5t||001 )""4() ) 6O O) )s   3C"<C."C+.C7r1   r!   Fr:   bool)returnintrA   )	__name__
__module____qualname__r+   r?   propertyrD   rJ   __classcell__r5   s   @r6   r#   r#   8   s&     ' '	)r7   r#   c                  ,     e Zd ZdZd fd	ZddZ xZS )TFDebertaXSoftmaxa>  
    Masked Softmax which is optimized for saving memory

    Args:
        input (`tf.Tensor`): The input tensor that will apply softmax.
        mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
        dim (int): The dimension that will apply softmax
    c                2    t        |   di | || _        y Nr)   )r*   r+   axis)r3   r]   r4   r5   s      r6   r+   zTFDebertaXSoftmax.__init__b   s    "6"	r7   c                   t        j                  t        j                  |t         j                              }t        j                  |t        j                  t        d      | j                        |      }t        t        j                  |t         j                        | j                        }t        j                  |d|      }|S )Nz-infdtype        )
rH   logical_notcastrP   wherefloatcompute_dtyper   float32r]   )r3   inputsmaskrmaskoutputs        r6   r?   zTFDebertaXSoftmax.callf   s}    rwwtRWW56%vd>P>P!QSYZbjj A499M%f-r7   ))rh   	tf.Tensorri   rm   )rS   rT   rU   __doc__r+   r?   rW   rX   s   @r6   rZ   rZ   X   s    r7   rZ   c                  P     e Zd ZdZ fdZej                  d        ZdddZ xZ	S )r/   z
    Optimized dropout module for stabilizing the training

    Args:
        drop_prob (float): the dropout probabilities
    c                2    t        |   di | || _        y r\   )r*   r+   	drop_prob)r3   rq   r4   r5   s      r6   r+   zTFDebertaStableDropout.__init__v   s    "6""r7   c                    t        j                  dt         j                  j                  j                  j                  d j                  z
        j                  t        |            z
  t         j                        t        j                  dd j                  z
  z   j                         j                  dkD  r9t        j                  t        j                  d j                        |      z  } fd}||fS )	z~
        Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
        r    g      ?)probs)sample_shaper_   r   ra   c                    j                   dkD  r9t        j                  t        j                  dj                        |       z  S | S )Nr   ra   r_   )rq   rH   rd   rc   rf   )upstreamri   scaler3   s    r6   gradz-TFDebertaStableDropout.xdropout.<locals>.grad   s>    ~~!xxbggc9K9K&LhWZ___r7   )rH   rc   compatv1distributions	Bernoullirq   sampler   rP   convert_to_tensorrf   rd   )r3   rh   rx   ri   rw   s   `  @@r6   xdropoutzTFDebertaStableDropout.xdropoutz   s    
 wwiill((22t~~9M2NUUcmntcuUvwGG

 $$SA,>%?tGYGYZ>>AXXdBGGCt7I7I$JFSV[[F	  t|r7   c                ,    |r| j                  |      S |S rA   )r   )r3   rh   r:   s      r6   r?   zTFDebertaStableDropout.call   s    ==((r7   rN   )rh   rm   r:   rm   )
rS   rT   rU   rn   r+   rH   custom_gradientr   r?   rW   rX   s   @r6   r/   r/   n   s1    #  * r7   r/   c                  6     e Zd ZdZd fd	Z fdZddZ xZS )TFDebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).c                @    t        |   di | || _        || _        y r\   )r*   r+   sizeeps)r3   r   r   r4   r5   s       r6   r+   zTFDebertaLayerNorm.__init__   s!    "6"	r7   c                    | j                  | j                  gt        j                         d      | _        | j                  | j                  gt        j
                         d      | _        t        | !  |      S )Nweight)shapeinitializerr'   bias)	
add_weightr   rH   ones_initializergammazeros_initializerbetar*   rJ   )r3   rL   r5   s     r6   rJ   zTFDebertaLayerNorm.build   s^    __DII;BDWDWDY`h_i
OO499+2CWCWCY`fOg	w}[))r7   c                .   t        j                  |dgd      }t        j                  t        j                  ||z
        dgd      }t         j                  j	                  || j
                  z         }| j                  ||z
  z  |z  | j                  z   S )Nrl   T)r]   keepdims)rH   reduce_meansquaremathsqrtr   r   r   )r3   xmeanvariancestds        r6   r?   zTFDebertaLayerNorm.call   ss    ~~ardT:>>"))AH"5RD4Pggll8dhh./zzQX&,tyy88r7   )g-q=)r   rm   rQ   rm   rS   rT   rU   rn   r+   rJ   r?   rW   rX   s   @r6   r   r      s    L
*
9r7   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFDebertaSelfOutputc                *   t        |   di | t        j                  j	                  |j
                  d      | _        t        j                  j                  |j                  d      | _	        t        |j                  d      | _        || _        y )Nr%   r&   	LayerNormepsilonr'   r(   r)   )r*   r+   r   r,   r-   rB   r%   LayerNormalizationlayer_norm_epsr   r/   hidden_dropout_probr(   r1   r2   s      r6   r+   zTFDebertaSelfOutput.__init__   sq    "6"\\''(:(:'I
88AVAV]h8i-f.H.HyYr7   c                v    | j                  |      }| j                  ||      }| j                  ||z         }|S )Nr9   r%   r(   r   r3   r<   input_tensorr:   s       r6   r?   zTFDebertaSelfOutput.call   s;    

=1]XF}|'CDr7   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY wNTr%   r   r(   )rF   rG   rH   rI   r%   r'   rJ   r1   rB   r   r(   rK   s     r6   rJ   zTFDebertaSelfOutput.build   s)   ::
4$'3tzz/ H

  $dkk.E.E!FGH4d+7t~~223 L$$dD$++2I2I%JKL4D)5t||001 )""4() ) 6H HL L) )$   3E<3E-E+EE(+E4rM   rN   rO   rA   rS   rT   rU   r+   r?   rJ   rW   rX   s   @r6   r   r      s    )r7   r   c                  Z     e Zd Zd fdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFDebertaAttentionc                z    t        |   di | t        |d      | _        t	        |d      | _        || _        y )Nr3   r&   rk   r)   )r*   r+   "TFDebertaDisentangledSelfAttentionr3   r   dense_outputr1   r2   s      r6   r+   zTFDebertaAttention.__init__   s7    "6"6vFK	/XFr7   c           	     ~    | j                  |||||||      }||}| j                  |d   ||      }	|	f|dd  z   }
|
S )Nr<   attention_maskquery_statesrelative_posrel_embeddingsoutput_attentionsr:   r   r<   r   r:   r    )r3   r   )r3   r   r   r   r   r   r   r:   self_outputsattention_outputrk   s              r6   r?   zTFDebertaAttention.call   su     yy&)%%)/ ! 
 'L,,&q/x - 
 #$|AB'77r7   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr3   r   )rF   rG   rH   rI   r3   r'   rJ   r   rK   s     r6   rJ   zTFDebertaAttention.build   s    ::
4&2tyy~~. &		%&4.:t00556 .!!''-. . ;& &. .   C%CCC rM   NNNFF)r   rm   r   rm   r   rm   r   rm   r   rm   r   rP   r:   rP   rQ   Tuple[tf.Tensor]rA   r   rX   s   @r6   r   r      sq     #'"&$("' "  	
   "    
:	.r7   r   c                  0     e Zd Zd fdZddZddZ xZS )TFDebertaIntermediatec                T   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        |j                  t              r"t        |j                        | _        || _        y |j                  | _        || _        y )Nr%   unitskernel_initializerr'   r)   )r*   r+   r   r,   r-   intermediate_sizer   initializer_ranger%   
isinstance
hidden_actstrr
   intermediate_act_fnr1   r2   s      r6   r+   zTFDebertaIntermediate.__init__   s    "6"\\''**vOgOg?hov ( 

 f''-'89J9J'KD$  (.'8'8D$r7   c                L    | j                  |      }| j                  |      }|S Nrh   )r%   r   r3   r<   s     r6   r?   zTFDebertaIntermediate.call  s(    

-
800?r7   c                (   | j                   ry d| _         t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   y xY w)NTr%   )	rF   rG   rH   rI   r%   r'   rJ   r1   rB   rK   s     r6   rJ   zTFDebertaIntermediate.build  s}    ::
4$'3tzz/ H

  $dkk.E.E!FGH H 4H Hs   3BBrM   r<   rm   rQ   rm   rA   r   rX   s   @r6   r   r      s    Hr7   r   c                  2     e Zd Zd fdZdddZddZ xZS )TFDebertaOutputc                R   t        |   di | t        j                  j	                  |j
                  t        |j                        d      | _        t        j                  j                  |j                  d      | _        t        |j                  d      | _        || _        y )Nr%   r   r   r   r(   r&   r)   )r*   r+   r   r,   r-   rB   r   r   r%   r   r   r   r/   r   r(   r1   r2   s      r6   r+   zTFDebertaOutput.__init__  s    "6"\\''$$IaIa9bip ( 

 88AVAV]h8i-f.H.HyYr7   c                x    | j                  |      }| j                  ||      }| j                  ||z         }|S )Nr   r9   r   r   s       r6   r?   zTFDebertaOutput.call  s=    

-
8]XF}|'CDr7   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY wr   )rF   rG   rH   rI   r%   r'   rJ   r1   r   r   rB   r(   rK   s     r6   rJ   zTFDebertaOutput.build&  s)   ::
4$'3tzz/ N

  $dkk.K.K!LMN4d+7t~~223 L$$dD$++2I2I%JKL4D)5t||001 )""4() ) 6N NL L) )r   rM   rN   )r<   rm   r   rm   r:   rP   rQ   rm   rA   r   rX   s   @r6   r   r     s    )r7   r   c                  Z     e Zd Zd fdZ	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZ xZS )TFDebertaLayerc                    t        |   di | t        |d      | _        t	        |d      | _        t        |d      | _        y )N	attentionr&   intermediaterk   r)   )r*   r+   r   r   r   r   r   bert_outputr2   s      r6   r+   zTFDebertaLayer.__init__6  s?    "6"+FE1&~N*6Ar7   c           	         | j                  |||||||      }|d   }	| j                  |	      }
| j                  |
|	|      }|f|dd  z   }|S )N)r   r   r   r   r   r   r:   r   r<   r   r    )r   r   r   )r3   r<   r   r   r   r   r   r:   attention_outputsr   intermediate_outputlayer_outputoutputss                r6   r?   zTFDebertaLayer.call=  s     !NN&)%%)/ + 
 -Q/"//>N/O''-<LW_ ( 
  /$5ab$99r7   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   xY w# 1 sw Y   qxY w# 1 sw Y   y xY w)NTr   r   r   )	rF   rG   rH   rI   r   r'   rJ   r   r   rK   s     r6   rJ   zTFDebertaLayer.buildY  s	   ::
4d+7t~~223 +$$T*+4.:t00556 .!!''-.4-9t//445 -  &&t,- - :+ +. .- -s$   D%%D1?D=%D.1D:=ErM   r   r<   rm   r   rm   r   rm   r   rm   r   rm   r   rP   r:   rP   rQ   r   rA   r   rX   s   @r6   r   r   5  sr    B #'"&$("'  "  	
   "    
8-r7   r   c                  t     e Zd Zd fdZddZd Zd Zd	dZ	 	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZ	S )TFDebertaEncoderc                L   t        |   di | t        |j                        D cg c]  }t	        |d|        c}| _        t        |dd      | _        || _        | j                  r4t        |dd      | _	        | j                  dk  r|j                  | _	        y y y c c}w )	Nzlayer_._r&   relative_attentionFmax_relative_positionsrl   r    r)   )r*   r+   rangenum_hidden_layersr   layerrG   r   r1   r   max_position_embeddings)r3   r1   r4   ir5   s       r6   r+   zTFDebertaEncoder.__init__i  s    "6"KPQWQiQiKjkanVHQC.Ak
")&2F"N""*1&:RTV*WD'**Q..4.L.L+ / # ls   B!c                   | j                   ry d| _         | j                  rY| j                  d| j                  dz  | j                  j
                  gt        | j                  j                              | _        t        | dd       K| j                  D ];  }t        j                  |j                        5  |j                  d        d d d        = y y # 1 sw Y   IxY w)NTzrel_embeddings.weight   r'   r   r   r   )rF   r   r   r   r1   rB   r   r   r   rG   r   rH   rI   r'   rJ   )r3   rL   r   s      r6   rJ   zTFDebertaEncoder.buildt  s    ::
"""&//,22Q68O8OP+DKK,I,IJ #2 #D
 4$'3 &]]5::. &KK%& && 4& &s   5CC	c                >    | j                   r| j                  }|S d }|S rA   )r   r   )r3   r   s     r6   get_rel_embeddingz"TFDebertaEncoder.get_rel_embedding  s*    040G0G,, NRr7   c                   t        t        |            dk  r}t        j                  t        j                  |d      d      }|t        j                  t        j                  |d      d      z  }t        j
                  |t        j                        }|S t        t        |            dk(  rt        j                  |d      }|S )Nr   r    rl   r	   )lenr   rH   expand_dimssqueezerc   uint8)r3   r   extended_attention_masks      r6   get_attention_maskz#TFDebertaEncoder.get_attention_mask  s    z.)*a/&(nnR^^NTU5VXY&Z#4r~~bjjQhjlFmoq7rrNWW^RXX>N  N+,1^^NA>Nr7   c                    | j                   r8|6|t        |      d   nt        |      d   }t        |t        |      d         }|S )Nr   )r   r   build_relative_position)r3   r<   r   r   qs        r6   get_rel_poszTFDebertaEncoder.get_rel_pos  sO    ""|';0<0H
<(,jYfNghjNkA21j6OPR6STLr7   c	           
        |rdnd }	|rdnd }
| j                  |      }| j                  |||      }t        |t              r|d   }n|}| j	                         }t        | j                        D ]i  \  }}|r|	|fz   }	 ||||||||      }|d   }|8|}t        |t              r(|dz   t        | j                        k  r||dz      nd }n|}|sa|
|d   fz   }
k |r|	|fz   }	|st        d ||	|
fD              S t        ||	|
      S )Nr)   r   r   r    c              3  &   K   | ]	  }||  y wrA   r)   ).0vs     r6   	<genexpr>z(TFDebertaEncoder.call.<locals>.<genexpr>  s     hqZ[Zghs   last_hidden_stater<   
attentions)
r   r   r   r   r   	enumerater   r   tupler   )r3   r<   r   r   r   r   output_hidden_statesreturn_dictr:   all_hidden_statesall_attentionsnext_kvr   r   layer_modulelayer_outputss                   r6   r?   zTFDebertaEncoder.call  sT    #7BD0d00@''|\RmX.#A&G#G//1(4 	FOA|#$58H$H!(%-))-"3!M *!,M',mX667!ec$**o6MmAE2SWG' !/=3C2E!E/	F4   1]4D Dh]4E~$Vhhh +;LYg
 	
r7   rM   rA   )NN)NNFFTF)r<   rm   r   rm   r   rm   r   rm   r   rP   r
  rP   r  rP   r:   rP   rQ   *Union[TFBaseModelOutput, Tuple[tf.Tensor]])
rS   rT   rU   r+   rJ   r   r   r   r?   rW   rX   s   @r6   r   r   h  s    	M& #'"&"'%* :
 :
 ":
  	:

  :
  :
 #:
 :
 :
 
4:
r7   r   c                   t        j                  | t         j                        }t        j                  |t         j                        }|dddf   t        j                  t        j                  |ddg      | dg      z
  }|d| ddf   }t        j
                  |d      }t        j                  |t         j                        S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `tf.Tensor`: A tensor with shape [1, query_size, key_size]

    r_   Nr    rl   r   r]   )rH   r   int32tilereshaper   rc   int64)
query_sizekey_sizeq_idsk_idsrel_pos_idss        r6   r   r     s      HHZrxx0EHHXRXX.E4.2772::eaW+E
TU#WWKkzk1n-K..15K77;))r7   c                    t        |      d   t        |      d   t        |      d   t        |      d   g}t        j                  | |      S )Nr   r    r   rl   r   rH   broadcast_to)c2p_posquery_layerr   shapess       r6   c2p_dynamic_expandr#    sP    ;";";"< $	F ??7F++r7   c                    t        |      d   t        |      d   t        |      d   t        |      d   g}t        j                  | |      S )Nr   r    r   r  )r   r!  	key_layerr"  s       r6   p2c_dynamic_expandr&    sP    ;";"9b!9b!	F ??7F++r7   c                    t        |      d d t        |       d   t        |      d   gz   }t        j                  | |      S )Nr   r   r  )	pos_indexp2c_attr%  r"  s       r6   pos_dynamic_expandr*     sC     !$
9(=b(A:iCXY[C\']]F??9f--r7   c                z   |dk  rt        j                  |       |z   }|t        j                  |       dz
  k7  rt        j                  |       dz
  |z
  }t        j                  t        j                  t        j                  |             |d      }t        j                  | |      } t        j                  ||      }nd}t        j
                  | dt        j                  |       d   f      }t        j
                  |dt        j                  |      d   f      }t        j                  ||d      }t        j
                  |t        j                  |            }|dk7  rVt        j                  t        j                  t        j                  |             | d      }t        j                  ||      }|S )Nr   r    r  permrl   )
batch_dims)rH   rankrollr   	transposer  r   gather)r   indicesgather_axispre_rollpermutationflat_xflat_indicesgathereds           r6   torch_gatherr:    s=   Qggaj;.bggaj1n$771:>K/ggbhhrwwqz2H1ELL-,,w[9ZZBB01F::gBHHW,=b,A'BCLyy!<Hzz(BHHW$56H1}ggbhhrwwqz2XIAF<<{;Or7   c                  l     e Zd ZdZd fdZddZd	dZ	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd Z xZ	S )r   a  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                   t        |   di | |j                  |j                  z  dk7  r&t	        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  j                  | j                  dz  t        |j                        dd      | _        |j                  |j                  ng | _        t        |d	d      | _        t        |d
d      | _        | j"                  rt        j                  j                  | j                  t        |j                        dd      | _        t        j                  j                  | j                  t        |j                        dd      | _        t)        d      | _        | j                   rt        |dd      | _        | j,                  dk  r|j.                  | _        t1        |j2                  d      | _        d| j                  v rEt        j                  j                  | j                  t        |j                        dd      | _        d| j                  v rDt        j                  j                  | j                  t        |j                        d      | _        t1        |j:                  d      | _        || _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r	   in_projFr   r'   use_biasr   talking_headhead_logits_projhead_weights_projrl   r  r   r    pos_dropoutr&   c2ppos_projp2c
pos_q_proj)r   r'   r(   r)   ) r*   r+   rB   num_attention_heads
ValueErrorrR   attention_head_sizeall_head_sizer   r,   r-   r   r   r>  pos_att_typerG   r   rA  rB  rC  rZ   softmaxr   r   r/   r   rD  rF  rH  attention_probs_dropout_probr(   r1   r2   s      r6   r+   z+TFDebertaDisentangledSelfAttention.__init__(  s   "6" : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PP||))".v/G/GH	 * 
 4:3F3F3RF//XZ")&2F"N#FNEB$)LL$6$6((#263K3K#L'	 %7 %D! &+\\%7%7((#263K3K#L(	 &8 &D" )b1""*1&:RTV*WD'**Q..4.L.L+5f6P6PWdeD))) % 2 2&&'6v7O7O'P#"	 !3 ! )))"',,"4"4&&?6KcKc;dkw #5 # .f.Q.QXabr7   c                J   | j                   ry d| _         | j                  d| j                  t        j                  j                               | _        | j                  d| j                  t        j                  j                               | _        t        | dd       dt        j                  | j                  j                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       Mt        j                  | j                   j                        5  | j                   j                  d        d d d        t        | dd       Mt        j                  | j"                  j                        5  | j"                  j                  d        d d d        t        | dd       Mt        j                  | j$                  j                        5  | j$                  j                  d        d d d        t        | d	d       Mt        j                  | j&                  j                        5  | j&                  j                  d        d d d        t        | d
d       bt        j                  | j(                  j                        5  | j(                  j                  | j                  j                  g       d d d        t        | dd       ct        j                  | j*                  j                        5  | j*                  j                  | j                  j                  g       d d d        y y # 1 sw Y   RxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   kxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTq_biasr   v_biasr>  r(   rB  rC  rD  rF  rH  )rF   r   rL  r   initializersZerosrQ  rR  rG   rH   rI   r>  r'   rJ   r1   rB   r(   rB  rC  rD  rF  rH  rK   s     r6   rJ   z(TFDebertaDisentangledSelfAttention.builda  s   ::
oo$"4"45CUCUC[C[C] & 
 oo$"4"45CUCUC[C[C] & 
 4D)5t||001 J""D$0G0G#HIJ4D)5t||001 )""4()4+T2>t4499: 2%%++D124,d3?t55::; 3&&,,T234-9t//445 -  &&t,-4T*6t}}112 ?##T[[%<%<$=>?4t,8t334 A%%t{{'>'>&?@A A 9#J J) )2 23 3- -? ?A AsT   	3M:MM&.M3N "1N1NMM#&M03M= N
NN"c                    t        |      d d | j                  dgz   }t        j                  ||      }t        j                  |g d      S )Nrl   tensorr   r   r   r    r	   r,  )r   rI  rH   r  r1  )r3   rW  r   s      r6   transpose_for_scoresz7TFDebertaDisentangledSelfAttention.transpose_for_scores  sF    6"3B'4+C+CR*HH67 ||F66r7   c           	        |>| j                  |      }t        j                  | j                  |      dd      \  }	}
}nd }t        j                  t        j                  | j                   j
                  d         | j                  dz  d      }t        j                  | j                  d      }t        j                  d      D ]  }t        j                  | j                  | j                        }t        j                  | j                        D ]  }|j                  |||dz  |z            } |j                  ||j                               } dgdz  } ||d   |d   |      } ||d   |d   |      } ||d	   |d	   |      }| j                  |      }	| j                  |      }
| j                  |      }|	| j                  | j                  ddddf         z   }	|| j                  | j                  ddddf         z   }d}dt        | j                        z   }t!        j"                  t%        |	      d   |z        }|	|z  }	t        j&                  |	t        j                  |
g d
            }| j(                  r(| j+                  ||      }| j-                  |	|
|||      }|||z   }| j.                  r=t        j                  | j1                  t        j                  |g d            g d      }| j3                  ||      }| j5                  ||      }| j.                  r=t        j                  | j7                  t        j                  |g d            g d      }t        j&                  ||      }t        j                  |g d      }t%        |      }|dd |d   |d   z  gz   }t        j8                  ||      }|r||f}|S |f}|S )a  
        Call the module

        Args:
            hidden_states (`tf.Tensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`tf.Tensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            return_att (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`tf.Tensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`tf.Tensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`tf.Tensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr	   rl   )num_or_size_splitsr]   c                j    t        j                  || d      }||t        j                  |      z  }|S )NT)transpose_b)rH   matmulr1  )wbr   outs       r6   linearz7TFDebertaDisentangledSelfAttention.call.<locals>.linear  s0    ii1$7=2<<?*C
r7   r   )r`   r   r    r   r   r    r	   r   r9   )r   r   r	   r    )r   r	   r    r   rX  r   )r>  rH   splitrY  r1  r   rI  TensorArrayr`   r   writeconcatrQ  rR  r   rM  r   r   r   r^  r   rD  disentangled_att_biasrA  rB  rN  r(   rC  r  )r3   r<   r   r   r   r   r   r:   qpr!  r%  value_layerrb  wsqkvwkqkvw_insider   qkvbr   r  rel_attscale_factorrw   attention_scoresattention_probscontext_layercontext_layer_shapenew_context_layer_shaper   s                                 r6   r?   z'TFDebertaDisentangledSelfAttention.call  s   N m,B24(())"-!"3/KK
 T\\0034IaIadeIelmB >>

;DXXa[ ; nn4::DD\D\]$":":; FA"-"3"3Ar!a%!)}"EKFzz![%7%7%9:	;
 6A:DtAwQ6AtAwQ7AtAwQ7A33A6K11!4I33A6K!D$=$=dkk$PTVW->X$YY!D$=$=dkk$PTVW->X$YY3t0011		*[1"5DE!E)99[",,y,2WX""!--nx-PN00iWegstG/'9!||%%bll3C\&RSUa  ,,'7H,,,J ll&&r||O\'RSUaO 		/;?]LA(7
 #6cr":>QRT>UXklnXo>o=p"p

=2IJ6G=/2 O\M]r7   c           
     0   |&t        |      d   }t        |t        |      d         }t        |      }t        |      dk(  r+t        j                  t        j                  |d      d      }nJt        |      dk(  rt        j                  |d      }n%t        |      dk7  rt        dt        |             t        j                  t        j                  t        j                  t        |      d   t        |      d         | j                        t        j                        }t        j                  || j                  |z
  | j                  |z   d d f   d      }d}	d| j                  v r| j                  |      }
| j                  |
      }
t        j                  |t        j                  |
g d	            }t        j                   ||z   d|dz  dz
        }t#        |t%        |||      d
      }|	|z  }	d| j                  v r| j'                  |      }| j                  |      }|t        j(                  j+                  t        j                  t        |      d
   |z  | j,                              z  }t        |      d   t        |      d   k7  r%t        t        |      d   t        |      d         }n|}t        j                   | |z   d|dz  dz
        }t        j                  |t        j                  |g d	            }t        j                  t#        |t/        |||      d
      g d	      }t        |      d   t        |      d   k7  r;t        j                  |d d d d d d df   d
      }t#        |t1        |||      d      }|	|z  }	|	S )Nr   r   r   r	   r       z2Relative position ids must be of dim 2 or 3 or 4. rE  rc  rl   rG  r_   )r   r   r   rH   r   rJ  rc   minimummaximumr   r  rM  rF  rY  r^  r1  clip_by_valuer:  r#  rH  r   r   rf   r&  r*  )r3   r!  r%  r   r   rq  r   shape_list_posatt_spanscorepos_key_layerc2p_attr   pos_query_layerr_posp2c_posr)  r(  s                     r6   rh  z8TFDebertaDisentangledSelfAttention.disentangled_att_bias  s@   ;'+A21j6KB6OPL#L1~!#>>"..q*I1ML A%>>,:L A%QRUVdReQfghh77JJ

:k226
98Mb8QRTXToTo HH	
 466ADD_D_bjDjjlmmnpq
  D%%% MM.9M 55mDMiiR\\--VWG&&|h'>8a<RSCSTG"7,>wUa,bdfgGWE D%%%"oon=O"77HOrww||
?3B7,FdN`N`a  O +&r*j.CB.GG/
90Eb0I:V_K`acKde$&&v'8!X\A=MNGii	2<<+VWGllW&8+y&Y[]^`lG +&r*j.CB.GGNN<1a
+CRH	&w0B9gW`0acefWEr7   rM   rA   )rW  rm   rQ   rm   r   r   )
rS   rT   rU   rn   r+   rJ   rY  r?   rh  rW   rX   s   @r6   r   r     s    7rA@7 #'"&$("'m m "m  	m
  m "m  m m 
m^7r7   r   c                  Z     e Zd ZdZ fdZddZ	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )TFDebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                H   t        |   di | || _        t        |d|j                        | _        |j                  | _        |j                  | _        t        |dd      | _        |j                  | _        | j
                  |j                  k7  rEt        j                  j                  |j                  t        |j                        dd      | _        t        j                  j                  |j                  d      | _        t#        |j$                  d	
      | _        y )Nembedding_sizeposition_biased_inputT
embed_projFr?  r   r   r(   r&   r)   )r*   r+   r1   rG   rB   r  r   r  r   r   r,   r-   r   r  r   r   r   r/   r   r(   r2   s      r6   r+   zTFDebertaEmbeddings.__init__5  s    "6"%f.>@R@RS!--'-'E'E$%,V5Ld%S"!'!9!9&"4"44#ll00""#263K3K#L!	 1 DO 88AVAV]h8i-f.H.HyYr7   c                   t        j                  d      5  | j                  d| j                  j                  | j
                  gt        | j                              | _        d d d        t        j                  d      5  | j                  j                  dkD  rM| j                  d| j                  j                  | j
                  gt        | j                              | _
        nd | _
        d d d        t        j                  d      5  | j                  rC| j                  d| j                  | j                  gt        | j                              | _        nd | _        d d d        | j                  ry d| _        t!        | d	d       dt        j                  | j"                  j$                        5  | j"                  j'                  d d | j                  j                  g       d d d        t!        | d
d       Mt        j                  | j(                  j$                        5  | j(                  j'                  d        d d d        t!        | dd       [t        j                  | j*                  j$                        5  | j*                  j'                  d d | j
                  g       d d d        y y # 1 sw Y   QxY w# 1 sw Y   xY w# 1 sw Y   lxY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)Nword_embeddingsr   r   token_type_embeddingsr   
embeddingsposition_embeddingsTr   r(   r  )rH   rI   r   r1   
vocab_sizer  r   r   r   type_vocab_sizer  r  r   rB   r  rF   rG   r   r'   rJ   r(   r  rK   s     r6   rJ   zTFDebertaEmbeddings.buildH  s|   ]],- 	//{{--t/B/BC+D,B,BC * DK	 ]]23 	2{{**Q.-1__%;;668K8KL /0F0F G .= .* .2*	2 ]]01 	0))+/??%779I9IJ /0F0F G ,; ,( ,0(	0 ::
4d+7t~~223 L$$dD$++2I2I%JKL4D)5t||001 )""4()4t,8t334 I%%tT43F3F&GHI I 9I	 		2 	2	0 	0L L) )I IsJ   AJ2 A.J?AK43K%K%?)K12J<?K	KK"%K.1K:c                .   ||t        d      |At        || j                  j                         t	        j
                  | j                  |      }t        |      dd }|t	        j                  |d      }|/t	        j                  t	        j                  d|d         d      }|}| j                  r&t	        j
                  | j                  |      }	||	z  }| j                  j                  dkD  r&t	        j
                  | j                  |      }
||
z  }| j                  | j                   k7  r| j#                  |      }| j%                  |      }|t'        t        |            t'        t        |            k7  ryt'        t        |            d	k(  r,t	        j(                  t	        j(                  |d
      d
      }t	        j*                  t	        j                  |d      | j,                        }||z  }| j/                  ||      }|S )z
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        Nz5Need to provide either `input_ids` or `input_embeds`.)paramsr3  rl   r   dimsvalue)startlimitr  rx  r    r   r_   r9   )rJ  r   r1   r  rH   r2  r   r   fillr   r   r  r  r  r  r  rB   r  r   r   r   rc   rf   r(   )r3   	input_idsposition_idstoken_type_idsinputs_embedsri   r:   rL   final_embeddingsposition_embedstoken_type_embedss              r6   r?   zTFDebertaEmbeddings.callq  s    !6TUU *9dkk6L6LMIIT[[)LM /4!WW+Q?N>>"((+b/*RYZ[L(%% iit/G/GQ]^O/;;&&* "		1K1KUc d 11$"2"22#/?@>>*:;:d#$J7G,H(IIz$'(A-::bjjA&>QGDwwr~~d;4CUCUV/$6<<(88<Lr7   rA   )NNNNNF)r  rm   r  rm   r  rm   r  rm   ri   rm   r:   rP   rQ   rm   r   rX   s   @r6   r  r  2  sp    QZ&'IV  $"&$(#'5 5   5  "	5 
 !5  5  5  
5 r7   r  c                  0     e Zd Zd fdZddZddZ xZS ) TFDebertaPredictionHeadTransformc                   t        |   di | t        |d|j                        | _        t
        j                  j                  | j                  t        |j                        d      | _
        t        |j                  t              rt        |j                        | _        n|j                  | _        t
        j                  j!                  |j"                  d      | _        || _        y )Nr  r%   r   r   r   r)   )r*   r+   rG   rB   r  r   r,   r-   r   r   r%   r   r   r   r
   transform_act_fnr   r   r   r1   r2   s      r6   r+   z)TFDebertaPredictionHeadTransform.__init__  s    "6"%f.>@R@RS\\''%%.v/G/GH ( 

 f''-$5f6G6G$HD!$*$5$5D!88AVAV]h8ir7   c                n    | j                  |      }| j                  |      }| j                  |      }|S r   )r%   r  r   r   s     r6   r?   z%TFDebertaPredictionHeadTransform.call  s6    

-
8--m<}5r7   c                   | j                   ry d| _         t        | dd       dt        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   rxY w# 1 sw Y   y xY w)NTr%   r   )rF   rG   rH   rI   r%   r'   rJ   r1   rB   r   r  rK   s     r6   rJ   z&TFDebertaPredictionHeadTransform.build  s    ::
4$'3tzz/ H

  $dkk.E.E!FGH4d+7t~~223 H$$dD$2E2E%FGH H 8H HH Hs   3C/<)C;/C8;DrM   r   rA   r   rX   s   @r6   r  r    s    $	Hr7   r  c                  P     e Zd Zd fdZd	dZd
dZddZddZddZddZ	 xZ
S )TFDebertaLMPredictionHeadc                    t        |   di | || _        t        |d|j                        | _        t        |d      | _        || _        y )Nr  	transformr&   r)   )	r*   r+   r1   rG   rB   r  r  r  input_embeddingsr3   r1   r  r4   r5   s       r6   r+   z"TFDebertaLMPredictionHead.__init__  sJ    "6"%f.>@R@RS9&{S !1r7   c                X   | j                  | j                  j                  fddd      | _        | j                  ry d| _        t        | dd       Nt        j                  | j                  j                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NzerosTr   )r   r   	trainabler'   r  )r   r1   r  r   rF   rG   rH   rI   r  r'   rJ   rK   s     r6   rJ   zTFDebertaLMPredictionHead.build  s    OO4;;+A+A*CQXdhouOv	::
4d+7t~~223 +$$T*+ + 8+ +s   :B  B)c                    | j                   S rA   )r  rC   s    r6   get_output_embeddingsz/TFDebertaLMPredictionHead.get_output_embeddings  s    $$$r7   c                `    || j                   _        t        |      d   | j                   _        y Nr   )r  r   r   r  r3   r  s     r6   set_output_embeddingsz/TFDebertaLMPredictionHead.set_output_embeddings  s(    ',$+5e+<Q+?(r7   c                    d| j                   iS )Nr   )r   rC   s    r6   get_biasz"TFDebertaLMPredictionHead.get_bias  s    		""r7   c                X    |d   | _         t        |d         d   | j                  _        y )Nr   r   )r   r   r1   r  r  s     r6   set_biasz"TFDebertaLMPredictionHead.set_bias  s'    &M	!+E&M!:1!=r7   c                   | j                  |      }t        |      d   }t        j                  |d| j                  g      }t        j
                  || j                  j                  d      }t        j                  |d|| j                  j                  g      }t        j                  j                  || j                        }|S )Nr   r    rl   rV  T)ar`  r]  )r  r   )r  r   rH   r  r  r^  r  r   r1   r  nnbias_addr   )r3   r<   
seq_lengths      r6   r?   zTFDebertaLMPredictionHead.call  s    ]C.q1


-DDWDW?XY		MT5J5J5Q5Q_cd

-JPTP[P[PfPf?gh]Kr7   r1   r!   r  keras.layers.LayerrA   rQ   r  r  ztf.Variable)rQ   zDict[str, tf.Variable]r   )rS   rT   rU   r+   rJ   r  r  r  r  r?   rW   rX   s   @r6   r  r    s'    
1+%@#>r7   r  c                  0     e Zd Zd fdZddZddZ xZS )TFDebertaOnlyMLMHeadc                J    t        |   di | t        ||d      | _        y )Npredictionsr&   r)   )r*   r+   r  r  r  s       r6   r+   zTFDebertaOnlyMLMHead.__init__   s&    "6"4V=MTabr7   c                *    | j                  |      }|S )Nr   )r  )r3   sequence_outputprediction_scoress      r6   r?   zTFDebertaOnlyMLMHead.call  s     ,,?,K  r7   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr  )rF   rG   rH   rI   r  r'   rJ   rK   s     r6   rJ   zTFDebertaOnlyMLMHead.build	  sm    ::
4-9t//445 -  &&t,- - :- -   A11A:r  )r  rm   rQ   rm   rA   r   rX   s   @r6   r  r    s    c!
-r7   r  c                       e Zd ZeZd fdZddZd	dZd Ze		 	 	 	 	 	 	 	 	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Z
ddZ xZS )TFDebertaMainLayerc                z    t        |   di | || _        t        |d      | _        t        |d      | _        y )Nr  r&   encoderr)   )r*   r+   r1   r  r  r   r  r2   s      r6   r+   zTFDebertaMainLayer.__init__  s6    "6"-f<H'Y?r7   c                    | j                   S rA   )r  rC   s    r6   get_input_embeddingsz'TFDebertaMainLayer.get_input_embeddings  s    r7   c                `    || j                   _        t        |      d   | j                   _        y r  )r  r   r   r  r  s     r6   set_input_embeddingsz'TFDebertaMainLayer.set_input_embeddings!  s$    !&%/%6q%9"r7   c                    t         )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        )NotImplementedError)r3   heads_to_prunes     r6   _prune_headszTFDebertaMainLayer._prune_heads%  s
    
 "!r7   c
                   ||t        d      |t        |      }
n|t        |      d d }
nt        d      |t        j                  |
d      }|t        j                  |
d      }| j	                  ||||||	      }| j                  ||||||	      }|d   }|s	|f|dd  z   S t        ||j                  |j                  	      S )
NzDYou cannot specify both input_ids and inputs_embeds at the same timerl   z5You have to specify either input_ids or inputs_embedsr    r  r   )r  r  r  r  ri   r:   )r<   r   r   r
  r  r:   r  )	rJ  r   rH   r  r  r  r   r<   r  )r3   r  r   r  r  r  r   r
  r  r:   rL   embedding_outputencoder_outputsr  s                 r6   r?   zTFDebertaMainLayer.call,  s     ]%>cdd"$Y/K&$]3CR8KTUU!WW+Q?N!WW+Q?N??%)' + 
 ,,*)/!5# ' 
 *!,#%(;;; -)77&11
 	
r7   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr  r  )rF   rG   rH   rI   r  r'   rJ   r  rK   s     r6   rJ   zTFDebertaMainLayer.builde  s    ::
4t,8t334 ,%%d+,4D)5t||001 )""4() ) 6, ,) )r   rM   r  r  	NNNNNNNNF)r  TFModelInputType | Noner   np.ndarray | tf.Tensor | Noner  r  r  r  r  r  r   Optional[bool]r
  r  r  r  r:   rP   rQ   r  rA   )rS   rT   rU   r!   config_classr+   r  r  r  r   r?   rJ   rW   rX   s   @r6   r  r    s     L@:"  .28<8<6:7;,0/3&*6
*6
 66
 6	6

 46
 56
 *6
 -6
 $6
 6
 
46
 6
p	)r7   r  c                      e Zd ZdZeZdZy)TFDebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertaN)rS   rT   rU   rn   r!   r  base_model_prefixr)   r7   r6   r  r  q  s    
 !L!r7   r  a1
  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a	  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput``] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFDebertaModelc                P    t        |   |g|i | t        |d      | _        y )Nr  r&   )r*   r+   r  r  r3   r1   rh   r4   r5   s       r6   r+   zTFDebertaModel.__init__  s(    3&3F3)&yAr7   batch_size, sequence_length
checkpointoutput_typer  c
                :    | j                  |||||||||		      }
|
S )N	r  r   r  r  r  r   r
  r  r:   )r  )r3   r  r   r  r  r  r   r
  r  r:   r   s              r6   r?   zTFDebertaModel.call  s9    & ,,))%'/!5#  

 r7   c                    | j                   ry d| _         t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   y xY w)NTr  )rF   rG   rH   rI   r  r'   rJ   rK   s     r6   rJ   zTFDebertaModel.build  si    ::
4D)5t||001 )""4() ) 6) )r  rM   r  )r  r  r   r  r  r  r  r  r  r  r   r  r
  r  r  r  r:   r  rQ   r  rA   )rS   rT   rU   r+   r   r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr?   rJ   rW   rX   s   @r6   r  r    s    
B
 *+C+J+JKh+ij&%$ .28<8<6:7;,0/3&*#(* 6 6	
 4 5 * - $ ! 
4 k 4)r7   r  z5DeBERTa Model with a `language modeling` head on top.c                       e Zd Zd fdZddZe eej                  d             e	e
ee      	 	 	 	 	 	 	 	 	 	 d		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
d                     ZddZ xZS )TFDebertaForMaskedLMc                    t        |   |g|i | |j                  rt        j	                  d       t        |d      | _        t        || j                  j                  d      | _	        y )NzpIf you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.r  r&   cls)r  r'   )
r*   r+   
is_decoderloggerwarningr  r  r  r  mlmr  s       r6   r+   zTFDebertaForMaskedLM.__init__  s_    3&3F3NN1
 *&yA'AXAX_der7   c                .    | j                   j                  S rA   )r  r  rC   s    r6   get_lm_headz TFDebertaForMaskedLM.get_lm_head  s    xx###r7   r  r  c                   | j                  |||||||||
	      }|d   }| j                  ||
      }|	dn| j                  |	|      }|s|f|dd z   }||f|z   S |S t        |||j                  |j
                        S )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        r  r   )r  r:   Nlabelslogitsr   lossr   r<   r  )r  r  hf_compute_lossr   r<   r  )r3   r  r   r  r  r  r   r
  r  r  r:   r   r  r  r  rk   s                   r6   r?   zTFDebertaForMaskedLM.call  s    4 ,,))%'/!5#  

 "!* HH_xHX~t4+?+?vVg+?+h')GABK7F)-)9TGf$EvE$!//))	
 	
r7   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Nt        j                  | j                  j
                        5  | j                  j                  d        d d d        y y # 1 sw Y   exY w# 1 sw Y   y xY w)NTr  r  )rF   rG   rH   rI   r  r'   rJ   r  rK   s     r6   rJ   zTFDebertaForMaskedLM.buildJ  s    ::
4D)5t||001 )""4()4%1txx}}- %t$% % 2) )% %r   rM   r  
NNNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r
  r  r  r  r  r  r:   r  rQ   z)Union[TFMaskedLMOutput, Tuple[tf.Tensor]]rA   )rS   rT   rU   r+   r  r   r   r  r  r   r  r   r  r?   rJ   rW   rX   s   @r6   r  r    s    
f$ *+C+J+JKh+ij&$$ .28<8<6:7;,0/3&*04#(+
*+
 6+
 6	+

 4+
 5+
 *+
 -+
 $+
 .+
 !+
 
3+
 k +
Z	%r7   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
"TFDebertaForSequenceClassificationc                   t        |   |g|i | |j                  | _        t        |d      | _        t        |d      | _        t        |dd       }|| j                  j                  n|}t        |d      | _        t        j                  j                  |j                  t        |j                         d      | _        | j                  j$                  | _        y )Nr  r&   poolercls_dropout
classifierr   )r*   r+   
num_labelsr  r  r#   r	  rG   r1   r   r/   r(   r   r,   r-   r   r   r  rD   )r3   r1   rh   r4   drop_outr5   s        r6   r+   z+TFDebertaForSequenceClassification.__init__^  s    3&3F3 ++)&yA,V(C6=$76>6F4;;22H-h]K,,,,##.v/G/GH - 

 ++00r7   r  r  c                L   | j                  |||||||||
	      }|d   }| j                  ||
      }| j                  ||
      }| j                  |      }|	dn| j	                  |	|      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r  r   r9   Nr  r    r  )r  r	  r(   r  r  r   r<   r  )r3   r  r   r  r  r  r   r
  r  r  r:   r   r  r>   r   r  rk   s                    r6   r?   z'TFDebertaForSequenceClassification.callp  s    4 ,,))%'/!5#  

 "!*OhG]XF/~t4+?+?vV\+?+]Y,F)-)9TGf$EvE)!//))	
 	
r7   c                z   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       [t        j                  | j                  j
                        5  | j                  j                  d d | j                  g       d d d        y y # 1 sw Y   'xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   y xY w)NTr  r	  r(   r  )rF   rG   rH   rI   r  r'   rJ   r	  r(   r  rD   rK   s     r6   rJ   z(TFDebertaForSequenceClassification.build  s]   ::
4D)5t||001 )""4()44(4t{{//0 (!!$'(4D)5t||001 )""4()4t,8t334 E%%tT4??&CDE E 9) )( () )E Es0   F%F?F%)F1FF"%F.1F:rM   r  )r  r  r   r  r  r  r  r  r  r  r   r  r
  r  r  r  r  r  r:   r  rQ   z3Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]rA   )rS   rT   rU   r+   r   r   r  r  r   r  r   r  r?   rJ   rW   rX   s   @r6   r  r  V  s    1$ *+C+J+JKh+ij&.$ .28<8<6:7;,0/3&*04#(.
*.
 6.
 6	.

 4.
 5.
 *.
 -.
 $.
 ..
 !.
 
=.
 k .
`Er7   r  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFDebertaForTokenClassificationc                f   t        |   |g|i | |j                  | _        t        |d      | _        t
        j                  j                  |j                        | _	        t
        j                  j                  |j                  t        |j                        d      | _        || _        y )Nr  r&   )rater  r   )r*   r+   r  r  r  r   r,   Dropoutr   r(   r-   r   r   r  r1   r  s       r6   r+   z(TFDebertaForTokenClassification.__init__  s    3&3F3 ++)&yA||++1K1K+L,,,,##H`H`8aht - 
 r7   r  r  c                (   | j                  |||||||||
	      }|d   }| j                  ||
      }| j                  |      }|	dn| j                  |	|      }|s|f|dd z   }||f|z   S |S t	        |||j
                  |j                        S )	z
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   r9   r   Nr  r    r  )r  r(   r  r  r   r<   r  )r3   r  r   r  r  r  r   r
  r  r  r:   r   r  r   r  rk   s                   r6   r?   z$TFDebertaForTokenClassification.call  s    0 ,,))%'/!5#  

 "!*,,,J8~t4+?+?vV\+?+]Y,F)-)9TGf$EvE&!//))	
 	
r7   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr  r  )
rF   rG   rH   rI   r  r'   rJ   r  r1   rB   rK   s     r6   rJ   z%TFDebertaForTokenClassification.build       ::
4D)5t||001 )""4()4t,8t334 M%%tT4;;3J3J&KLM M 9) )M M   C"%3C."C+.C7rM   r  )r  r  r   r  r  r  r  r  r  r  r   r  r
  r  r  r  r  r  r:   r  rQ   z0Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]rA   )rS   rT   rU   r+   r   r   r  r  r   r  r   r  r?   rJ   rW   rX   s   @r6   r  r    s    
 *+C+J+JKh+ij&+$ .28<8<6:7;,0/3&*04#(*
**
 6*
 6	*

 4*
 5*
 **
 -*
 $*
 .*
 !*
 
:*
 k *
X	Mr7   r  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Zd fdZe eej                  d             ee	e
e      	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd                     Zd	dZ xZS )
TFDebertaForQuestionAnsweringc                   t        |   |g|i | |j                  | _        t        |d      | _        t
        j                  j                  |j                  t        |j                        d      | _
        || _        y )Nr  r&   
qa_outputsr   )r*   r+   r  r  r  r   r,   r-   r   r   r  r1   r  s       r6   r+   z&TFDebertaForQuestionAnswering.__init__  sr    3&3F3 ++)&yA,,,,##H`H`8aht - 
 r7   r  r  c                   | j                  |||||||||	      }|d   }| j                  |      }t        j                  |dd      \  }}t        j                  |d      }t        j                  |d      }d}|	 |
d	|	i}|
|d
<   | j                  |||f      }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                        S )a  
        start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        r  r   r   r   rl   )r  r[  r]   )inputr]   Nstart_positionend_positionr  )r  start_logits
end_logitsr<   r  )	r  r  rH   rd  r   r  r   r<   r  )r3   r  r   r  r  r  r   r
  r  start_positionsend_positionsr:   r   r  r   r!  r"  r  r  rk   s                       r6   r?   z"TFDebertaForQuestionAnswering.call  s   > ,,))%'/!5#  

 "!*8#%88&QUW#X jzz2>ZZjr:
&=+D&8F%2F>"''v|Z>X'YD"J/'!"+=F)-)9TGf$EvE-%!!//))
 	
r7   c                   | j                   ry d| _         t        | dd       Mt        j                  | j                  j
                        5  | j                  j                  d        d d d        t        | dd       et        j                  | j                  j
                        5  | j                  j                  d d | j                  j                  g       d d d        y y # 1 sw Y   |xY w# 1 sw Y   y xY w)NTr  r  )
rF   rG   rH   rI   r  r'   rJ   r  r1   rB   rK   s     r6   rJ   z#TFDebertaForQuestionAnswering.builda  r  r  rM   )NNNNNNNNNNF)r  r  r   r  r  r  r  r  r  r  r   r  r
  r  r  r  r#  r  r$  r  r:   r  rQ   z7Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]rA   )rS   rT   rU   r+   r   r   r  r  r   r  r   r  r?   rJ   rW   rX   s   @r6   r  r    s    	 *+C+J+JKh+ij&2$ .28<8<6:7;,0/3&*9=7;#(9
*9
 69
 6	9

 49
 59
 *9
 -9
 $9
 79
 59
 !9
 
A9
 k 9
v	Mr7   r  )Orn   
__future__r   r   typingr   r   r   r   r   numpynp
tensorflowrH   activations_tfr
   modeling_tf_outputsr   r   r   r   r   modeling_tf_utilsr   r   r   r   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   configuration_debertar!   
get_loggerrS   r  r  r  r,   Layerr#   rZ   r/   r   r   r   r   r   r   r   r   r#  r&  r*  r:  r   r  r  r  r  r  r  DEBERTA_START_DOCSTRINGr  r  r  r  r  r  r)   r7   r6   <module>r4     s    "  9 9   / 
 
 
 S R u u 0 
		H	% ". )U\\// )@** ,%U\\// %P9++ 9()%,,,, ):-.++ -.`HELL.. H:)ell(( )B0-U\\'' 0-fi
u||)) i
X*0,,.
0R);); Rjt %,,,, t n#Hu||'9'9 #HL- 2 2 -`-5<<-- -([)++ [)|"0 "( T) X g-)- -)	-)` QSjkM%35Q M% lM%`  YE)AC_ YEYEx  IM&>@Y IMIMX  WM$<>U WMWMr7   