
    sg                        d Z ddlmZmZmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZ  ej:                  e      ZdZ dZ!dZ"dZ#dZ$dZ%dZ&dZ'dZ(dZ) G d dejT                        Z+ G d dejT                        Z,ejZ                  j\                  d        Z/ejZ                  j\                  d        Z0ejZ                  j\                  d        Z1ejZ                  j\                  d        Z2ejZ                  j\                  dejf                  d e4fd!       Z5ejZ                  j\                  dejf                  d"ejf                  fd#       Z6ejZ                  j\                  dejf                  d"ejf                  d$e4fd%       Z7ejZ                  j\                  dejf                  d"ejf                  fd&       Z8 G d' d(ejT                        Z9 G d) d*ejT                        Z: G d+ d,ejT                        Z; G d- d.ejT                        Z< G d/ d0ejT                        Z= G d1 d2ejT                        Z> G d3 d4e      Z? G d5 d6e      Z@d7ZAd8ZB ed9eA       G d: d;e@             ZC G d< d=ejT                        ZD G d> d?ejT                        ZE G d@ dAejT                        ZF G dB dCejT                        ZG G dD dEejT                        ZH edFeA       G dG dHe@             ZI G dI dJejT                        ZJ edKeA       G dL dMe@             ZK edNeA       G dO dPe@             ZL edQeA       G dR dSe@             ZMy)TzPyTorch DeBERTa model.    )OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   zmicrosoft/deberta-basez!lsanochkin/deberta-large-feedbackz' Paris'z0.54z#Palak/microsoft_deberta-large_squadz' a nice puppet'gQ?      c                   *     e Zd ZdZd fd	Zd Z xZS )DebertaLayerNormzBLayerNorm module in the TF style (epsilon inside the square root).c                     t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y N)
super__init__r   	Parametertorchonesweightzerosbiasvariance_epsilon)selfsizeeps	__class__s      _/var/www/html/venv/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.pyr   zDebertaLayerNorm.__init__9   sH    ll5::d#34LLT!23	 #    c                 X   |j                   }|j                         }|j                  dd      }||z
  j                  d      j                  dd      }||z
  t	        j
                  || j                  z         z  }|j                  |      }| j                  |z  | j                  z   }|S )NT)keepdim   )
dtypefloatmeanpowr!   sqrtr&   tor#   r%   )r'   hidden_states
input_typer3   varianceys         r+   forwardzDebertaLayerNorm.forward?   s    "((
%++-!!"d!3!D(--a055b$5G&-HtG\G\<\1]]%((4KK-'$))3r,   )g-q=__name__
__module____qualname____doc__r   r;   __classcell__r*   s   @r+   r   r   6   s    L$r,   r   c                   $     e Zd Z fdZd Z xZS )DebertaSelfOutputc                    t         |           t        j                  |j                  |j                        | _        t        |j                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r   Linearhidden_sizedenser   layer_norm_eps	LayerNormDropouthidden_dropout_probdropoutr'   configr*   s     r+   r   zDebertaSelfOutput.__init__K   s\    YYv1163E3EF
)&*<*<f>S>STzz&"<"<=r,   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rH   rM   rJ   r'   r7   input_tensors      r+   r;   zDebertaSelfOutput.forwardQ   7    

=1]3}|'CDr,   r=   r>   r?   r   r;   rA   rB   s   @r+   rD   rD   J   s    >r,   rD   c                    | j                  d      }|j                  d      }t        j                  |t        j                  | j                        }t        j                  |t        j                  |j                        }|dddf   |j                  dd      j                  |d      z
  }|d|ddf   }|j                  d      }|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r1   deviceNr   r.   r   )r(   r!   arangelongrY   viewrepeat	unsqueeze)query_layer	key_layer
query_sizekey_sizeq_idsk_idsrel_pos_idss          r+   build_relative_positionrf   X   s    $ !!"%J~~b!HLL5::k>P>PQELLI<L<LME4.5::a#4#;#;J#JJKkzk1n-K''*Kr,   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   r0   r.   expandr(   )c2p_posr_   relative_poss      r+   c2p_dynamic_expandrl   u   sI    >>;++A.0@0@0C[EUEUVWEXZfZkZklnZopqqr,   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   rW   rh   )rj   r_   r`   s      r+   p2c_dynamic_expandrn   z   sG    >>;++A.0@0@0CY^^TVEWYbYgYghjYklmmr,   c                     | j                  |j                         d d | j                  d      |j                  d      fz         S )Nr0   rW   rh   )	pos_indexp2c_attr`   s      r+   pos_dynamic_expandrr      s=    GLLN2A.)..2DinnUWFX1YYZZr,   r_   scale_factorc                     t        j                  t        j                  | j                  d      t         j                        |z        S )Nr.   r1   )r!   r5   tensorr(   r2   )r_   rs   s     r+   scaled_size_sqrtrw      s0    ::ell;#3#3B#7u{{KlZ[[r,   r`   c                 d    | j                  d      |j                  d      k7  rt        | |      S |S NrW   )r(   rf   )r_   r`   rk   s      r+   
build_rposrz      s1    y~~b11&{I>>r,   max_relative_positionsc           
          t        j                  t        t        | j	                  d      |j	                  d            |            S ry   )r!   rv   minmaxr(   )r_   r`   r{   s      r+   compute_attention_spanr      s4    <<C 0 0 4innR6HIKabccr,   c           	          |j                  d      |j                  d      k7  rA|d d d d d d df   j                  d      }t        j                  | dt	        || |            S | S )NrW   r   r.   r0   dimindex)r(   r^   r!   gatherrr   )rq   r_   r`   rk   rp   s        r+   uneven_size_correctedr      s_    y~~b11 Aq!,66r:	||G2DYPWYb2cddr,   c                   p    e Zd ZdZ fdZd Z	 	 	 	 ddej                  dej                  dede	ej                     de	ej                     d	e	ej                     d
e
ej                  e	ej                     f   fdZdej                  dej                  dej                  d	ej                  def
dZ xZS )DisentangledSelfAttentiona  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                 Z   t         |           |j                  |j                  z  dk7  r&t	        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                  dz  d      | _
        t        j                  t        j                  | j                  t        j                              | _        t        j                  t        j                  | j                  t        j                              | _        |j"                  |j"                  ng | _        t%        |d	d      | _        t%        |d
d      | _        | j(                  rct        j                  |j                  |j                  d      | _        t        j                  |j                  |j                  d      | _        nd | _        d | _        | j&                  rt%        |dd      | _        | j.                  dk  r|j0                  | _        t        j2                  |j4                        | _        d| j"                  v r1t        j                  |j                  | j                  d      | _        d| j"                  v r/t        j                  |j                  | j                        | _        t        j2                  |j<                        | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   Fr%   ru   relative_attentiontalking_headr{   r.   r   c2pp2c) r   r   rG   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rF   in_projr    r!   r$   r2   q_biasv_biaspos_att_typegetattrr   r   head_logits_projhead_weights_projr{   max_position_embeddingsrK   rL   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probrM   rN   s     r+   r   z"DisentangledSelfAttention.__init__   sm    : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PPyy!3!3T5G5G!5KRWXll5;;0B0B5;;#WXll5;;0B0B5;;#WX393F3F3RF//XZ")&2F"N#FNEB$&IIf.H.H&JdJdkp$qD!%'YYv/I/I6KeKelq%rD"$(D!%)D"""*1&:RTV*WD'**Q..4.L.L+!zz&*D*DED))) "		&*<*<d>P>PW\ ])))"$))F,>,>@R@R"Szz&"E"EFr,   c                     |j                         d d | j                  dfz   }|j                  |      }|j                  dddd      S )Nr.   r   r0   r   r
   )r(   r   r\   permute)r'   xnew_x_shapes      r+   transpose_for_scoresz.DisentangledSelfAttention.transpose_for_scores   sF    ffhsmt'?'?&DDFF;yyAq!$$r,   r7   attention_maskoutput_attentionsquery_statesrk   rel_embeddingsreturnc                 p   |9| j                  |      }| j                  |      j                  dd      \  }}	}
n| j                   j                  j                  | j                  dz  d      }t        d      D cg c]C  }t        j                  t        | j                        D cg c]  }||dz  |z       c}d      E }}}t        j                  |d   |j                         j                  |d   j                              }t        j                  |d   |j                         j                  |d   j                              }t        j                  |d   |j                         j                  |d   j                              }|||fD cg c]  }| j                  |       c}\  }}	}
|| j                  | j                  ddddf         z   }|
| j                  | j                  ddddf         z   }
d}dt        | j                        z   }t!        ||      }||j                  |j                        z  }t        j                  ||	j#                  dd	            }| j$                  r*|(|&| j'                  |      }| j)                  ||	|||      }|||z   }| j*                  5| j+                  |j-                  dddd            j-                  dddd      }|j/                         }|j1                  | t        j2                  |j                        j4                        }t6        j8                  j;                  |d      }|j1                  |d       | j=                  |      }| j>                  5| j?                  |j-                  dddd            j-                  dddd      }t        j                  ||
      }|j-                  dddd      jA                         }|jC                         dd	 d
z   }|jE                  |      }|s|dfS ||fS c c}w c c}}w c c}w )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr
   r.   r   r   ru   r   r0   rW   )r.   )#r   r   chunkr#   r   ranger!   catmatmultr6   r1   r   r   lenr   rw   	transposer   r   disentangled_att_biasr   r   boolmasked_fillfinfor}   r   
functionalsoftmaxrM   r   
contiguousr(   r\   )r'   r7   r   r   r   rk   r   qpr_   r`   value_layerwskiqkvwqvr   rel_attrs   scaleattention_scoresattention_probscontext_layernew_context_layer_shapes                            r+   r;   z!DisentangledSelfAttention.forward   s   L m,B262K2KB2O2U2UVW]_2U2`/KK$$**4+C+Ca+GQ*OBhmnohpqcdEIIeD<T<T6UVr!a%!)}V\]^qDqT!Wlnn&6&9&9Q&9&NOAT!Wmoo&7&:&:a&:&OPAT!Wmoo&7&:&:a&:&OPAZ[]^`aYb2cTU43L3LQ3O2c/KK!D$=$=dkk$PTVW->X$YY!D$=$=dkk$PTVW->X$YY3t0011 l;!EHH;3D3DH$EE <<Y5H5HR5PQ""~'AlF^!--n=N00iWegstG/'9   ,#445E5M5MaQRTUWX5YZbbcdfgijlmn',,.+77.8I5;;WbWhWhKiKmKmn--//0@b/I##NA6,,7!!-"44_5L5LQPQSTVW5XYaabcefhiklmO_kB%--aAq9DDF"/"4"4"6s";e"C%**+BC !4((//W Wq 3ds   >+P-)P(;P-P3(P-r_   r`   rs   c           	      $   |t        |||j                        }|j                         dk(  r!|j                  d      j                  d      }nT|j                         dk(  r|j                  d      }n/|j                         dk7  rt	        d|j                                t        ||| j                        }|j                         }|| j                  |z
  | j                  |z   d d f   j                  d      }d}d| j                  v r| j                  |      }| j                  |      }t        j                  ||j                  dd	            }	t        j                  ||z   d|dz  dz
        }
t        j                  |	dt!        |
||      
      }	||	z  }d| j                  v r| j#                  |      }| j                  |      }|t%        ||      z  }t'        |||      }t        j                  | |z   d|dz  dz
        }t        j                  ||j                  dd	      j)                  |j*                              }t        j                  |dt-        |||      
      j                  dd	      }t/        ||||      }||z  }|S )Nr0   r   r
   r      z2Relative position ids must be of dim 2 or 3 or 4. r   r.   rW   r   r   ru   )rf   rY   r   r^   r   r   r{   r[   r   r   r   r!   r   r   clampr   rl   r   rw   rz   r6   r1   rn   r   )r'   r_   r`   rk   r   rs   att_spanscorepos_key_layerc2p_attrj   pos_query_layerr_posp2c_posrq   s                  r+   r   z/DisentangledSelfAttention.disentangled_att_bias3  s    2;	;K]K]^L"'11!4>>qAL1$'11!4L1$QR^RbRbRdQefgg)+y$B]B]^#((*'''(2T5P5PS[5[[]^^

)A, 	  D%%% MM.9M 55mDMll;0G0GB0OPGkk,"91hlQ>NOGll7:LWVaco:pqGWE D%%%"oon=O"77HO/NNOE
 kk5&8"3Q1q8HIGll9o.G.GB.O.R.RYbYhYh.R.ijGllR'9';PY'ZiB  ,G[)\ZGWEr,   FNNN)r=   r>   r?   r@   r   r   r!   Tensorr   r   r   r;   r   r   rA   rB   s   @r+   r   r      s    $GL% #(/3/315V0||V0 V0  	V0
 u||,V0 u||,V0 !.V0 
u||Xell33	4V0p6\\6 <<6 ll	6
 6 6r,   r   c                   *     e Zd ZdZ fdZddZ xZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        |dd      }t        |d|j                        | _        t        j                  |j                  | j                  |      | _        t        |dd      | _	        | j                  sd | _
        n/t        j                  |j                  | j                        | _
        |j                  dkD  r0t        j                  |j                  | j                        | _        nd | _        | j                  |j                  k7  r2t        j                  | j                  |j                  d      | _        nd | _        t!        |j                  |j"                        | _        t        j&                  |j(                        | _        || _        | j/                  d	t1        j2                  |j                        j5                  d
      d       y )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr   position_ids)r   r.   )
persistent)r   r   r   rG   r   r   	Embedding
vocab_sizeword_embeddingsr   position_embeddingsr   type_vocab_sizetoken_type_embeddingsrF   
embed_projr   rI   rJ   rK   rL   rM   rO   register_bufferr!   rZ   ri   )r'   rO   r   r*   s      r+   r   zDebertaEmbeddings.__init__o  sy   v~q9%f.>@R@RS!||F,=,=t?R?R`lm%,V5Ld%S"))'+D$')||F4R4RTXTgTg'hD$!!A%)+f6L6LdNaNa)bD&)-D&&"4"44 ii(;(;V=O=OV[\DO"DO)&*<*<f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
r,   c                    ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                   | j                  |j	                               }nt        j                  |      }|}	| j                  r|	|z  }	| j                  | j                  |      }
|	|
z  }	| j                  | j                  |	      }	| j                  |	      }	||j                         |	j                         k7  rD|j                         dk(  r |j                  d      j                  d      }|j                  d      }|j!                  |	j"                        }|	|z  }	| j%                  |	      }	|	S )Nr.   r   rX   r   r0   )r(   r   r!   r$   r[   rY   r   r   
zeros_liker   r   r   rJ   r   squeezer^   r6   r1   rM   )r'   	input_idstoken_type_idsr   maskinputs_embedsinput_shape
seq_lengthr   
embeddingsr   s              r+   r;   zDebertaEmbeddings.forward  s    #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M##/"&":":<;L;L;N"O"'"2"2="A"
%%--J%%1$($>$>~$N!//J??&4J^^J/
xxzZ^^--88:?<<?2215D~~a(77:++,D#d*J\\*-
r,   )NNNNNr<   rB   s   @r+   r   r   l  s    Q
>,r,   r   c                   p     e Zd Z fdZ	 	 	 	 ddedeej                  eej                     f   fdZ	 xZ
S )DebertaAttentionc                 p    t         |           t        |      | _        t	        |      | _        || _        y r   )r   r   r   r'   rD   outputrO   rN   s     r+   r   zDebertaAttention.__init__  s-    -f5	'/r,   r   r   c                 v    | j                  ||||||      \  }}||}| j                  ||      }	|r|	|fS |	d fS )N)r   rk   r   )r'   r   )
r'   r7   r   r   r   rk   r   self_output
att_matrixattention_outputs
             r+   r;   zDebertaAttention.forward  se     #'))%%) #, #
Z (L;;{LA$j11$d++r,   r   r=   r>   r?   r   r   r   r!   r   r   r;   rA   rB   s   @r+   r   r     sF     #(,  	, 
u||Xell33	4,r,   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DebertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   rF   rG   intermediate_sizerH   
isinstance
hidden_actstrr   intermediate_act_fnrN   s     r+   r   zDebertaIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r,   r7   r   c                 J    | j                  |      }| j                  |      }|S r   )rH   r   r'   r7   s     r+   r;   zDebertaIntermediate.forward  s&    

=100?r,   r=   r>   r?   r   r!   r   r;   rA   rB   s   @r+   r   r     s#    9U\\ ell r,   r   c                   $     e Zd Z fdZd Z xZS )DebertaOutputc                     t         |           t        j                  |j                  |j
                        | _        t        |j
                  |j                        | _	        t        j                  |j                        | _        || _        y r   )r   r   r   rF   r   rG   rH   r   rI   rJ   rK   rL   rM   rO   rN   s     r+   r   zDebertaOutput.__init__  sc    YYv779K9KL
)&*<*<f>S>STzz&"<"<=r,   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rQ   rR   s      r+   r;   zDebertaOutput.forward  rT   r,   rU   rB   s   @r+   r  r    s    r,   r  c                   p     e Zd Z fdZ	 	 	 	 ddedeej                  eej                     f   fdZ	 xZ
S )DebertaLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r   )r   r   r   	attentionr   intermediater  r   rN   s     r+   r   zDebertaLayer.__init__  s3    )&1/7#F+r,   r   r   c                     | j                  ||||||      \  }}| j                  |      }	| j                  |	|      }
|r|
|fS |
d fS )Nr   r   rk   r   )r
  r  r   )r'   r7   r   r   rk   r   r   r   r   intermediate_outputlayer_outputs              r+   r;   zDebertaLayer.forward  sn     (,~~/%%) (6 (
$* #//0@A{{#68HI *-- $''r,   )NNNFr   rB   s   @r+   r  r    sF    , "'(  ( 
u||Xell33	4(r,   r  c                        e Zd ZdZ fdZd Zd ZddZ	 	 	 	 	 ddej                  dej                  de
d	e
d
e
f
dZ xZS )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    t         |   |       t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        t        |dd      | _	        | j                  rdt        |dd      | _
        | j                  dk  r|j                  | _
        t        j                  | j                  dz  |j                        | _        d| _        y c c}w )Nr   Fr{   r.   r   r0   )r   r   r   
ModuleListr   num_hidden_layersr  layerr   r   r{   r   r   rG   r   gradient_checkpointing)r'   rO   _r*   s      r+   r   zDebertaEncoder.__init__"  s     ]]%H`H`Ba#bQL$8#bc
")&2F"N""*1&:RTV*WD'**Q..4.L.L+"$,,t/J/JQ/NPVPbPb"cD&+# $cs   Cc                 R    | j                   r| j                  j                  }|S d }|S r   )r   r   r#   )r'   r   s     r+   get_rel_embeddingz DebertaEncoder.get_rel_embedding-  s0    7;7N7N,,33 UYr,   c                     |j                         dk  rE|j                  d      j                  d      }||j                  d      j                  d      z  }|S |j                         dk(  r|j                  d      }|S )Nr0   r   rW   r.   r
   )r   r^   r   )r'   r   extended_attention_masks      r+   get_attention_maskz!DebertaEncoder.get_attention_mask1  s    1$&4&>&>q&A&K&KA&N#47N7V7VWY7Z7d7deg7hhN  !Q&+55a8Nr,   c                 Z    | j                   r||t        ||      }|S t        ||      }|S r   )r   rf   )r'   r7   r   rk   s       r+   get_rel_poszDebertaEncoder.get_rel_pos:  s>    ""|';'6|]S   7}mTr,   r7   r   output_hidden_statesr   return_dictc           
         | j                  |      }| j                  |||      }|r|fnd }|rdnd }	|}
| j                         }t        | j                        D ]k  \  }}| j
                  r1| j                  r%| j                  |j                  |
|||||      \  }}n ||
|||||      \  }}|r||fz   }||}n|}
|sf|	|fz   }	m |st        d |||	fD              S t        |||	      S )N )r   rk   r   r   c              3   &   K   | ]	  }||  y wr   r"  ).0r   s     r+   	<genexpr>z)DebertaEncoder.forward.<locals>.<genexpr>v  s     hqZ[Zghs   last_hidden_stater7   
attentions)r  r  r  	enumerater  r  training_gradient_checkpointing_func__call__tupler   )r'   r7   r   r  r   r   rk   r   all_hidden_statesall_attentionsnext_kvr   r   layer_moduleatt_ms                  r+   r;   zDebertaEncoder.forwardB  s7    00@''|\ROcM;Kim0d//1(4 	;OA|**t}}'+'H'H ))"  "%($u (4"!-!-#1&7($u $$58H$H!',' !/5(!:=	;@ h]4E~$Vhhh+;LYg
 	
r,   )NN)TFNNT)r=   r>   r?   r@   r   r  r  r  r!   r   r   r;   rA   rB   s   @r+   r  r    sh    B	, &*"' 7
||7
 7
 #	7

  7
 7
r,   r  c                   (    e Zd ZdZeZdZdgZdZd Z	y)DebertaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    debertar   Tc                 :   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyy)zInitialize the weights.g        )r3   stdN)r   r   rF   r#   datanormal_rO   initializer_ranger%   zero_r   r   )r'   modules     r+   _init_weightsz$DebertaPreTrainedModel._init_weights  s    fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> . .r,   N)
r=   r>   r?   r@   r   config_classbase_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr=  r"  r,   r+   r4  r4  |  s(    
 !L!*?)@&&*#?r,   r4  a  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.


    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                   \    e Zd Z fdZd Zd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 	 ddeej                      deej                      d	eej                      d
eej                      deej                      dee   dee   dee   deeef   fd              Z xZS )DebertaModelc                     t         |   |       t        |      | _        t	        |      | _        d| _        || _        | j                          y Nr   )	r   r   r   r   r  encoderz_stepsrO   	post_initrN   s     r+   r   zDebertaModel.__init__  s@     +F3%f-r,   c                 .    | j                   j                  S r   r   r   r'   s    r+   get_input_embeddingsz!DebertaModel.get_input_embeddings  s    ...r,   c                 &    || j                   _        y r   rJ  r'   new_embeddingss     r+   set_input_embeddingsz!DebertaModel.set_input_embeddings  s    *8'r,   c                     t        d      )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        z7The prune function is not implemented in DeBERTa model.)NotImplementedError)r'   heads_to_prunes     r+   _prune_headszDebertaModel._prune_heads  s    
 ""[\\r,   batch_size, sequence_length
checkpointoutput_typer>  r   r   r   r   r   r   r  r   r   c	           	      |   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }	n!||j                         d d }	nt	        d      ||j                  n|j                  }
|t        j                  |	|
      }|&t        j                  |	t        j                  |
      }| j                  |||||      }| j                  ||d||      }|d	   }| j                  d	kD  r|d
   }t        | j                        D cg c]  }| j                  j                   d    }}|d   }| j                  j#                         }| j                  j%                  |      }| j                  j'                  |      }|d	d  D ]!  } |||d|||      }|j)                  |       # |d   }|s|f||rd	d  z   S dd  z   S t+        ||r|j,                  nd |j.                        S c c}w )NzDYou cannot specify both input_ids and inputs_embeds at the same timer.   z5You have to specify either input_ids or inputs_embeds)rY   rX   )r   r   r   r   r   T)r  r   r   r   rW   Fr  r0   r&  )rO   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr(   rY   r!   r"   r$   r[   r   rF  rG  r   r  r  r  r  appendr   r7   r(  )r'   r   r   r   r   r   r   r  r   r   rY   embedding_outputencoder_outputsencoded_layersr7   r  layersr   r   rel_posr  sequence_outputs                         r+   r;   zDebertaModel.forward  sz   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN??)%' + 
 ,,!%/# ' 
 )+<<!*2.M6;DLL6IJdll((,JFJ)"-L!\\;;=N!\\<<^LNll../?@G 	4$!"&+!-!(#1  %%l3	4 ),#%>R8\(]]]XY8\(]]]-;O/77UY&11
 	
+ Ks    H9)NNNNNNNN)r=   r>   r?   r   rL  rP  rT  r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r!   r   r   r   r   r;   rA   rB   s   @r+   rC  rC    s   
/9] ++C+J+JKh+ij&#$ -11515/304,0/3&*N
ELL)N
 !.N
 !.	N

 u||,N
  -N
 $D>N
 'tnN
 d^N
 
uo%	&N
 kN
r,   rC  c                   $     e Zd Z fdZd Z xZS )$LegacyDebertaPredictionHeadTransformc                    t         |           t        |d|j                        | _        t        j                  |j                  | j                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  | j                  |j                        | _        y )Nr   )r)   )r   r   r   rG   r   r   rF   rH   r   r   r   r   transform_act_fnrJ   rI   rN   s     r+   r   z-LegacyDebertaPredictionHeadTransform.__init__F  s    %f.>@R@RSYYv1143F3FG
f''-$*6+<+<$=D!$*$5$5D!d&9&9v?T?TUr,   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rH   rj  rJ   r  s     r+   r;   z,LegacyDebertaPredictionHeadTransform.forwardQ  s4    

=1--m<}5r,   rU   rB   s   @r+   rh  rh  E  s    	Vr,   rh  c                   *     e Zd Z fdZd Zd Z xZS )LegacyDebertaLMPredictionHeadc                    t         |           t        |      | _        t	        |d|j
                        | _        t        j                  | j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        | j                  | j                  _        y )Nr   Fr   )r   r   rh  	transformr   rG   r   r   rF   r   decoderr    r!   r$   r%   rN   s     r+   r   z&LegacyDebertaLMPredictionHead.__init__Y  s    =fE%f.>@R@RS yy!4!4f6G6GeTLLV->->!?@	 !IIr,   c                 :    | j                   | j                  _         y r   )r%   rp  rK  s    r+   _tie_weightsz*LegacyDebertaLMPredictionHead._tie_weightsg  s     IIr,   c                 J    | j                  |      }| j                  |      }|S r   )ro  rp  r  s     r+   r;   z%LegacyDebertaLMPredictionHead.forwardj  s$    }5]3r,   )r=   r>   r?   r   rr  r;   rA   rB   s   @r+   rm  rm  X  s    &&r,   rm  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )LegacyDebertaOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   rm  predictionsrN   s     r+   r   z!LegacyDebertaOnlyMLMHead.__init__r  s    8@r,   rb  r   c                 (    | j                  |      }|S r   )rw  )r'   rb  prediction_scoress      r+   r;   z LegacyDebertaOnlyMLMHead.forwardv  s     ,,_=  r,   r  rB   s   @r+   ru  ru  q  s$    A!u|| ! !r,   ru  c                   (     e Zd ZdZ fdZd Z xZS )DebertaLMPredictionHeadzMhttps://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270c                    t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                  d      | _        t        j                  t        j                  |j                               | _        y )NT)r)   elementwise_affine)r   r   r   rF   rG   rH   r   r   r   r   rj  rJ   rI   r    r!   r$   r   r%   rN   s     r+   r   z DebertaLMPredictionHead.__init__~  s    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>ShlmLLV->->!?@	r,   c                     | j                  |      }| j                  |      }| j                  |      }t        j                  ||j
                  j                               | j                  z   }|S r   )rH   rj  rJ   r!   r   r#   r   r%   )r'   r7   r   s      r+   r;   zDebertaLMPredictionHead.forward  sd    

=1--m<
 ]O4J4J4L4L4NORVR[R[[r,   r<   rB   s   @r+   r{  r{  {  s    WAr,   r{  c                   $     e Zd Z fdZd Z xZS )DebertaOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r{  lm_headrN   s     r+   r   zDebertaOnlyMLMHead.__init__  s    .v6r,   c                 *    | j                  ||      }|S r   )r  )r'   rb  r   ry  s       r+   r;   zDebertaOnlyMLMHead.forward  s     LL/J  r,   rU   rB   s   @r+   r  r    s    7
!r,   r  z5DeBERTa Model with a `language modeling` head on top.c                       e Zd ZddgZ fdZd Zd Z eej                  d             e
eeedee      	 	 	 	 	 	 	 	 	 dd	eej$                     d
eej$                     deej$                     deej$                     deej$                     deej$                     dee   dee   dee   deeef   fd              Z xZS )DebertaForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       |j                  | _        t        |      | _        | j                  rt        |      | _        nddg| _        t        |      | _	        | j                          y )Nzlm_predictions.lm_head.weightz)deberta.embeddings.word_embeddings.weight)r   r   legacyrC  r5  ru  cls_tied_weights_keysr  lm_predictionsrH  rN   s     r+   r   zDebertaForMaskedLM.__init__  sa     mm#F+;;/7DH'FHs&tD#"4V"<D 	r,   c                     | j                   r | j                  j                  j                  S | j                  j
                  j                  S r   )r  r  rw  rp  r  r  rH   rK  s    r+   get_output_embeddingsz(DebertaForMaskedLM.get_output_embeddings  s7    ;;88''///&&..444r,   c                    | j                   rA|| j                  j                  _        |j                  | j                  j                  _        y || j
                  j                  _        |j                  | j
                  j                  _        y r   )r  r  rw  rp  r%   r  r  rH   rN  s     r+   set_output_embeddingsz(DebertaForMaskedLM.set_output_embeddings  sa    ;;+9DHH  ((6(;(;DHH  %0>D''-/=/B/BD'',r,   rU  z[MASK])rW  rX  r>  r   expected_outputexpected_lossr   r   r   r   r   labelsr   r  r   r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  r| j	                  |      }n0| j                  || j                  j                  j                        }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   r   r   r  r   r   r.   r   losslogitsr7   r(  )rO   rZ  r5  r  r  r  r   r   r   r\   r   r   r7   r(  )r'   r   r   r   r   r   r  r   r  r   outputsrb  ry  masked_lm_lossloss_fctr   s                   r+   r;   zDebertaForMaskedLM.forward  s!   8 &1%<k$++B]B],,))%'/!5#  	
 "!*;; $ 9 $ 3 3OT\\E\E\ElEl m')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r,   	NNNNNNNNN)r=   r>   r?   r  r   r  r  r   rc  rd  r   _CHECKPOINT_FOR_MASKED_LMr   rf  _MASKED_LM_EXPECTED_OUTPUT_MASKED_LM_EXPECTED_LOSSr   r!   r   r   r   r   r;   rA   rB   s   @r+   r  r    s0   :<Z[5C ++C+J+JKh+ij,"$2. -11515/304)-,0/3&*4
ELL)4
 !.4
 !.	4

 u||,4
  -4
 &4
 $D>4
 'tn4
 d^4
 
un$	%4
 k4
r,   r  c                   4     e Zd Z fdZd Zed        Z xZS )ContextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        || _	        y r   )
r   r   r   rF   pooler_hidden_sizerH   rK   pooler_dropoutrM   rO   rN   s     r+   r   zContextPooler.__init__  sI    YYv88&:S:ST
zz&"7"78r,   c                     |d d df   }| j                  |      }| j                  |      }t        | j                  j                     |      }|S rE  )rM   rH   r   rO   pooler_hidden_act)r'   r7   context_tokenpooled_outputs       r+   r;   zContextPooler.forward  sM     &ad+]3

=1t{{<<=mLr,   c                 .    | j                   j                  S r   )rO   rG   rK  s    r+   
output_dimzContextPooler.output_dim  s    {{&&&r,   )r=   r>   r?   r   r;   propertyr  rA   rB   s   @r+   r  r     s!     ' 'r,   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   v    e Zd Z fdZd Zd Z eej                  d             e	e
ee      	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   deeef   fd              Z xZS ) DebertaForSequenceClassificationc                    t         |   |       t        |dd      }|| _        t	        |      | _        t        |      | _        | j                  j                  }t        j                  ||      | _        t        |dd       }|| j                  j                  n|}t        j                  |      | _        | j!                          y )N
num_labelsr0   cls_dropout)r   r   r   r  rC  r5  r  poolerr  r   rF   
classifierrO   rL   rK   rM   rH  )r'   rO   r  r  drop_outr*   s        r+   r   z)DebertaForSequenceClassification.__init__  s     V\15
$#F+#F+[[++
))J
;6=$76>6F4;;22Hzz(+ 	r,   c                 6    | j                   j                         S r   )r5  rL  rK  s    r+   rL  z5DebertaForSequenceClassification.get_input_embeddings0  s    ||0022r,   c                 :    | j                   j                  |       y r   )r5  rP  rN  s     r+   rP  z5DebertaForSequenceClassification.set_input_embeddings3  s    )).9r,   rU  rV  r   r   r   r   r   r  r   r  r   r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }| j                  |      }d}|| j                   j                  | j                  dk(  rXt        j                         }|j                  d      j                  |j                        } |||j                  d            }n_|j                         dk(  s|j                  d      dk(  r|dk\  j                         }|j!                         }|j                  d      dkD  rt#        j$                  |d|j'                  |j                  d      |j                  d                  }t#        j$                  |d|j                  d            }t)               } ||j                  d| j                        j+                         |j                  d            }nIt#        j,                  d      j                  |      }n#t        j.                  d      } ||      |z  j1                  d      j3                          }n| j                   j                  dk(  rIt               }| j                  dk(  r& ||j5                         |j5                               }n |||      }n| j                   j                  dk(  r=t)               } ||j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt7               } |||      }|	s|f|
dd z   }||f|z   S |S t9        |||
j:                  |
j<                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r   r   r  r   r   r   r.   
regressionsingle_label_classificationmulti_label_classificationr  )rO   rZ  r5  r  rM   r  problem_typer  r   r	   r\   r6   r1   r   r(   nonzeror[   r!   r   ri   r   r2   rv   
LogSoftmaxsumr3   r   r   r   r7   r(  )r'   r   r   r   r   r   r  r   r  r   r  encoder_layerr  r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxr   s                        r+   r;   z(DebertaForSequenceClassification.forward6  s   0 &1%<k$++B]B],,))%'/!5#  	
  
M2]3/{{''/??a' jjlG#[[_//=F"66;;r?;DZZ\Q&&++b/Q*>#)Q;"7"7"9K#[[]F"''*Q.)."A{'9'9+:J:J1:Mv{{[\~'^* "'fa9I9I"9M!N#3#5'(;(;B(P(V(V(XZ`ZeZefhZij$||A11&9"$--"3K)&1F:??CIIKKD))\9"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'fG4I4IV]VhVh
 	
r,   r  )r=   r>   r?   r   rL  rP  r   rc  rd  r   re  r   rf  r   r!   r   r   r   r   r;   rA   rB   s   @r+   r  r    s'   $3: ++C+J+JKh+ij&,$ -11515/304)-,0/3&*M
ELL)M
 !.M
 !.	M

 u||,M
  -M
 &M
 $D>M
 'tnM
 d^M
 
u..	/M
 kM
r,   r  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   j    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   dee   deee	f   fd              Z xZS )DebertaForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   )r   r   r  rC  r5  r   rK   rL   rM   rF   rG   r  rH  rN   s     r+   r   z&DebertaForTokenClassification.__init__  si      ++#F+zz&"<"<=))F$6$68I8IJ 	r,   rU  rV  r   r   r   r   r   r  r   r  r   r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r.   r   r  )rO   rZ  r5  rM   r  r   r\   r  r   r7   r(  )r'   r   r   r   r   r   r  r   r  r   r  rb  r  r  r  r   s                   r+   r;   z%DebertaForTokenClassification.forward  s    , &1%<k$++B]B],,))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$fG4I4IV]VhVh
 	
r,   r  )r=   r>   r?   r   r   rc  rd  r   re  r   rf  r   r!   r   r   r   r   r;   rA   rB   s   @r+   r  r    s   	 ++C+J+JKh+ij&)$ -11515/304)-,0/3&*-
ELL)-
 !.-
 !.	-

 u||,-
  --
 &-
 $D>-
 'tn-
 d^-
 
u++	,-
 k-
r,   r  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                       e Zd Z fdZ eej                  d             eee	e
eeee      	 	 	 	 	 	 	 	 	 	 ddeej"                     deej"                     deej"                     deej"                     deej"                     d	eej"                     d
eej"                     dee   dee   dee   deee	f   fd              Z xZS )DebertaForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r   r   r  rC  r5  r   rF   rG   
qa_outputsrH  rN   s     r+   r   z$DebertaForQuestionAnswering.__init__  sS      ++#F+))F$6$68I8IJ 	r,   rU  )rW  rX  r>  r  r  qa_target_start_indexqa_target_end_indexr   r   r   r   r   start_positionsend_positionsr   r  r   r   c           
      &   |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        Nr  r   r   r.   r   )ignore_indexr0   )r  start_logits
end_logitsr7   r(  )rO   rZ  r5  r  splitr   r   r   r(   r   r   r   r7   r(  )r'   r   r   r   r   r   r  r  r   r  r   r  rb  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r+   r;   z#DebertaForQuestionAnswering.forward  s   B &1%<k$++B]B],,))%'/!5#  	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r,   )
NNNNNNNNNN)r=   r>   r?   r   r   rc  rd  r   _CHECKPOINT_FOR_QAr   rf  _QA_EXPECTED_OUTPUT_QA_EXPECTED_LOSS_QA_TARGET_START_INDEX_QA_TARGET_END_INDEXr   r!   r   r   r   r   r;   rA   rB   s   @r+   r  r    s@    ++C+J+JKh+ij%0$+'40 -11515/3042604,0/3&*F
ELL)F
 !.F
 !.	F

 u||,F
  -F
 "%,,/F
  -F
 $D>F
 'tnF
 d^F
 
u22	3F
 kF
r,   r  )Nr@   typingr   r   r   r!   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   configuration_debertar   
get_loggerr=   loggerrf  re  r  r  r  r  r  r  r  r  Moduler   rD   jitscriptrf   rl   rn   rr   r   r   rw   rz   r   r   r   r   r   r   r  r  r  r4  DEBERTA_START_DOCSTRINGrc  rC  rh  rm  ru  r{  r  r  r  r  r  r  r"  r,   r+   <module>r     s    ) )    A A !  . u u 0 
		H	%!.  @ ' !  ; (    ryy (		   8 r r n n [ [ \%,, \c \ \ ELL U\\   d d dgj d d    D		 DNN		 Nb,ryy ,F")) BII (299 (BZ
_ Z
z?_ ?2 ") X gl
) l
	l
^299 &BII 2!ryy !bii 6! ! QSjk[
/ [
 l[
|'BII ',  l
'= l
l
^  ?
$: ?
?
D  [
"8 [
[
r,   