
    sg                        d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e&jR                  e*      Z+dZ,dZ-dZ.da/d Z0d Z1dPdZ2dPdZ3dPdZ4d Z5 G d dejl                  jn                        Z8 G d dejl                  jn                        Z9 G d d      Z:dQdZ;d  Z<	 	 	 dRd!Z= G d" d#e
j|                        Z? G d$ d%e
j|                        Z@ G d& d'e
j|                        ZA G d( d)e
j|                        ZB G d* d+e
j|                        ZC G d, d-e
j|                        ZD G d. d/e
j|                        ZE G d0 d1e
j|                        ZF G d2 d3e
j|                        ZG G d4 d5e
j|                        ZH G d6 d7e
j|                        ZI G d8 d9e      ZJd:ZKd;ZL e"d<eK       G d= d>eJ             ZM e"d?eK       G d@ dAeJ             ZN G dB dCe
j|                        ZO e"dDeK       G dE dFeJ             ZP e"dGeK       G dH dIeJ             ZQ e"dJeK       G dK dLeJ             ZR e"dMeK       G dN dOeJ             ZSy)SzPyTorch MRA model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigzuw-madison/mra-base-512-4r   AutoTokenizerc                      t        t              j                         j                  j                  j                  dz  dz  fd}  | g d      }t	        d|d      ay )Nkernelsmrac                 4    | D cg c]  }|z  	 c}S c c}w N )filesfile
src_folders     W/var/www/html/venv/lib/python3.12/site-packages/transformers/models/mra/modeling_mra.pyappend_rootz&load_cuda_kernels.<locals>.append_root?   s    .34d
T!444s   )zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verbose)r   __file__resolveparentr   mra_cuda_kernel)r+   	src_filesr)   s     @r*   load_cuda_kernelsr3   ;   sQ    h'')0077>>JURJ5 WXI=)TBO    c                 N   t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d      dk7  rt        d      | j                  d      dk7  rt        d      | j                  d	
      j                  j                  dd	      }|j                         }|j                         }|j                         }t        j                  ||||      \  }}|j                  dd	      dddddddf   }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr1   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r*   
sparse_maxrN   G   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::wP_an!oH'11"b9!Qa-H%%%r4   c                    t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d   |j                  d   k7  rt        d      | j                  \  }}||z  }t	        j
                  |j                  d      t        j                  |j                        }| j                  |||      } | |dddf   ||z  j                         ddf   } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r7   z$mask must be a 2-dimensional tensor.r8   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r>   r?   r@   shapetorcharangelongrR   reshape)maskrH   
block_size
batch_sizeseq_len	num_block	batch_idxs          r*   sparse_maskr^   c   s     499;1?@@
7<<>aBCCzz!}a((]^^**J:%IW\\!_EJJw~~VI<<
Iz:D	!T'"Wy%8$>$>$@!CDDKr4   c                 j   | j                         \  }}}|j                         \  }}}||z  dk7  rt        d      ||z  dk7  rt        d      | j                  |||z  ||      j                  dd      } |j                  |||z  ||      j                  dd      }t	        | j                               dk7  rt        d      t	        |j                               dk7  rt        d      t	        |j                               d	k7  rt        d
      | j                  d      dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | ||j                               S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r=   r:   r6   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r7   r8   r   r9   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r?   r@   rW   rC   r>   rD   rE   r1   mm_to_sparse)	dense_query	dense_keyrH   rY   rZ   
query_sizer<   _key_sizes	            r*   rb   rb   z   s    #."2"2"4J
C ~~'AxJ!#opp*!kll%%j*
2JJX[\ffgikmnK!!*h*.DjRUV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''YNNr4   c                 B   |j                         \  }}}||z  dk7  rt        d      | j                  d      |k7  rt        d      | j                  d      |k7  rt        d      |j                  |||z  ||      j                  dd      }t	        | j                               d	k7  rt        d
      t	        |j                               d	k7  rt        d      t	        |j                               dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | |||      }|j                  dd      j                  |||z  |      }|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   r`   r7   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r=   r:   r6   ,sparse_query must be a 4-dimensional tensor.ra   r8   r9   z8The size of the third dimension of dense_key must be 32.)	r?   r@   rW   rC   r>   rD   rE   r1   sparse_dense_mm)	sparse_queryrH   rd   rI   rY   rZ   rg   r<   dense_qk_prods	            r*   rj   rj      s    !* 0J#*!kllz)lmmz)kll!!*h*.DjRUV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L'9VefM!++B3;;JZdHdfijMr4   c                 `    | |z  |z  t        j                  | |d      z   j                         S )Nfloorrounding_mode)rT   divrV   )rH   dim_1_blockdim_2_blocks      r*   transpose_indicesrt      s.    {"k1EIIg{bi4jjpprrr4   c                   >    e Zd Zed        Zed        Zedd       Zy)MraSampledDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S r%   )rb   save_for_backwardrY   )ctxrc   rd   rH   rY   rG   s         r*   forwardzMraSampledDenseMatMul.forward   s1    %k9gzRk9g>#r4   c                    | j                   \  }}}| j                  }|j                  d      |z  }|j                  d      |z  }t        |||      }t	        |j                  dd      |||      }	t	        ||||      }
|
|	d d fS Nr   r=   r:   )saved_tensorsrY   r?   rt   rj   rC   )ry   gradrc   rd   rH   rY   rI   rJ   	indices_Tgrad_key
grad_querys              r*   backwardzMraSampledDenseMatMul.backward   s    *-*;*;'Y^^
%**1-;!q)Z7%gN	"4>>"b#99kS`a$T7IO
8T4//r4   c                 2    t         j                  | |||      S r%   )rv   apply)rc   rd   rH   rY   s       r*   operator_callz#MraSampledDenseMatMul.operator_call   s    $**;	7JWWr4   Nr9   __name__
__module____qualname__staticmethodrz   r   r   r&   r4   r*   rv   rv      s>      0 0 X Xr4   rv   c                   <    e Zd Zed        Zed        Zed        Zy)MraSparseDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S r%   )rj   rx   rI   )ry   rk   rH   rd   rI   rG   s         r*   rz   zMraSparseDenseMatMul.forward   s2    (w	?[lGY?-r4   c                     | j                   \  }}}| j                  }|j                  d      |j                  d      z  }t        |||      }t	        |j                  dd      |||      }t        |||      }	|	d |d fS r|   )r}   rI   r?   rt   rj   rC   rb   )
ry   r~   rk   rH   rd   rI   rJ   r   r   r   s
             r*   r   zMraSparseDenseMatMul.backward   s    +.+<+<(gy--!q)\->->r-BB%gN	"<#9#9"b#A9dTab!$	7;
44//r4   c                 2    t         j                  | |||      S r%   )r   r   )rk   rH   rd   rI   s       r*   r   z"MraSparseDenseMatMul.operator_call   s    #)),O\\r4   Nr   r&   r4   r*   r   r      s>      0 0 ] ]r4   r   c                       e Zd Zed        Zy)MraReduceSumc                 B   | j                         \  }}}}t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                         \  }}}}|j                         \  }}| j                  d      j	                  ||z  |      } t        j                  |j                  d      t
        j                  |j                        }t        j                  ||d	      j                         |d d d f   |z  z   j	                  ||z        }	t        j                  ||z  |f| j                  | j                        }
|
j                  d|	|       j	                  |||      }|j	                  |||z        }|S )
Nr6   ri   r7   r8   r;   r   rP   rn   ro   )r?   r>   r@   sumrW   rT   rU   rV   rR   rq   zerosrQ   	index_add)rk   rH   rI   rJ   rZ   r\   rY   rf   r]   global_idxestempoutputs               r*   r   zMraReduceSum.operator_call   sy   /;/@/@/B,
Iz1|  "#q(KLLw||~!#FGG*//11j! '
I#''A'.66zI7MzZLLa

7>>Z	IIg}GDIIKiXY[_X_N`crNrr
'*y(
) 	 {{/):6l>P>PYeYlYl
 <>FFzSbdno
Oj,HIr4   N)r   r   r   r   r   r&   r4   r*   r   r      s     r4   r   c                 &   | j                         \  }}}||z  }d}	||j                  |||      j                  d      }
| j                  ||||      j                  d      |
dddddf   dz   z  }|j                  ||||      j                  d      |
dddddf   dz   z  }||j                  ||||      j                  d      |
dddddf   dz   z  }	n|t        j                  ||t        j
                  | j                        z  }
| j                  ||||      j                  d      }|j                  ||||      j                  d      }|$|j                  ||||      j                  d      }	t        j                  ||j                  dd            t        j                  |      z  }|j                  dd      j                  }|0|d	|
dddddf   |
dddddf   z  d
k  j                         z  z
  }||
||	fS )z/
    Compute low resolution approximation.
    Nr=   r;   r:   ư>rP   T)r<   keepdims     @g      ?)r?   rW   r   rT   onesfloatrR   meanmatmulrC   mathsqrtrA   rB   )querykeyrY   rX   valuerZ   r[   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r*   get_low_resolution_logitr     sN    %*JJL!J:-Ill:/@*MQQVXQYMM*.?XVZZ_aZb1d
#d*
	 ++j*;ZRVV[]V^1d
#d*
 j2CZQYZ^^ce^fAq$J'$.I !5::j:KSXS^S^glgsgs#ttMM*.?XV[[`b[c	++j*;ZRWW\^W_j2CZQYZ__df_gI <<	73D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JIUUr4   c                    | j                   \  }}}|dkD  rf|dz  }t        j                  ||| j                        }	t        j                  t        j
                  |	|       |      }
| |
dddddf   dz  z   } |dkD  r:| ddd|ddf   dz   | ddd|ddf<   | ddddd|f   dz   | ddddd|f<   t        j                  | j                  |d      |ddd	
      }|j                  }|dk(  rE|j                  j                  d      j                  }| |ddddf   k\  j                         }||fS |dk(  rd}||fS t        | d      )zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r7   rR   )diagonalNg     @r=   TF)r<   largestsortedfullr;   sparsez# is not a valid approx_model value.)rS   rT   r   rR   triltriutopkrW   rH   rB   minr   r@   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrZ   total_blocks_per_rowrf   offset	temp_maskdiagonal_mask
top_k_valsrH   	thresholdhigh_resolution_masks                  r*   get_block_idxesr   B  s    +?*D*D'J$a&*0A5JJ35IRfRmRmn	

5::i6'#JU[\3mD!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4	!T4-8P PWWY ((( 
	 # ((( K=(KLMMr4   c	                    t         #t        j                  |       j                         S | j	                         \  }	}
}}|	|
z  }||z  dk7  rt        d      ||z  }| j                  |||      } |j                  |||      }|j                  |||      }|-| |dddddf   z  } ||dddddf   z  }||dddddf   z  }|dk(  rt        | ||||      \  }}}}nA|dk(  r1t        j                         5  t        | |||      \  }}}}ddd       nt        d      t        j                         5  z
  }t        |||||      \  }}ddd       t        j                  | ||      t        j                  |      z  }t        ||||      \  }}||z
  }|"|dd	t!        ||      dddddddf   z
  z  z
  }t        j"                  |      }t$        j                  ||||      }t&        j                  ||||      }|dk(  ryt        j"                  z
  dz  z
        dddddf   z  }t        j(                  |      dddddddf   j+                  d	d	|d	      j                  |||      }|j-                  d
      dddddf   j+                  d	d	|      j                  ||      }|j+                  d	d	|      j                  ||      |z
  } || |z  } t        j"                  | | dk  j/                         z        }!||!dddddf   z  }||!z  }t        j"                  |  | dkD  j/                         z        }"||"dddddf   z  }||"z  }||z   |dddddf   |dddddf   z   dz   z  }#n#|dk(  r||dddddf   dz   z  }#nt        d      ||#|dddddf   z  }#|#j                  |	|
||      }#|#S # 1 sw Y   xY w# 1 sw Y   xY w)z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rY   r   r   r=   r;   r   z-config.approx_mode must be "full" or "sparse")r1   rT   
zeros_likerequires_grad_r?   r@   rW   r   no_grad	Exceptionr   rv   r   r   r   rN   r^   expr   r   r   repeatr   r   )$r   r   r   rX   r   r   rY   r   r   rZ   num_headr[   r   
meta_batchr   r   r   r   r   rf   low_resolution_logit_normalizedrH   r   high_resolution_logitrL   rM   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r*   mra2_attentionr   h  s    &5577.3jjl+J'8h&Jq OPP:-MM*gx8E
++j'8
4CMM*gx8EQ4Z((DAt$$Q4Z((fUm3
D%V
Rk+G 
	 ]]_ 	QisJRN +/KQ	 	
 @AA	 
*>A]*]'(7+(+)
%%
 2??sG
 @ 		( ",,A7L]_p!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu.?  ".!;!;g'8:K" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcelm 	" 6<<Q:NVVWacjknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*hRMS	 	
 
s   7O
3O
OO!c                   *     e Zd ZdZ fdZddZ xZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 p   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  dz   |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      dz          t+        |dd      | _        | j#                  dt%        j.                  | j0                  j3                         t$        j4                  | j0                  j6                  	      d
       y )N)padding_idxr7   epsposition_ids)r   r=   position_embedding_typeabsolutetoken_type_idsrP   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrT   rU   expandgetattrr   r   r   r?   rV   rR   selfconfig	__class__s     r*   r   zMraEmbeddings.__init__  s?   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mn'.v7PR\']$KK))..0

4K\K\KcKcd 	 	
r4   c                 T   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  dk(  r| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr=   r   r   r   rP   r   )r?   r   hasattrr   r   rT   r   rV   rR   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r*   rz   zMraEmbeddings.forward  s=    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r4   )NNNNr   r   r   __doc__r   rz   __classcell__r   s   @r*   r   r     s    Q
( r4   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )MraSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      t        d u}t               rt               r|s	 t                |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j*                  |j,                        | _        ||n|j0                  | _        |j2                  dz  |j4                  z  | _        t9        | j6                  t        |j2                  dz  dz              | _        |j:                  | _        |j<                  | _        |j>                  | _        y # t        $ r#}t        j                  d|        Y d }~d }~ww xY w)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r9   r7   ) r   r   r   num_attention_headsr   r@   r1   r   r   r3   r   loggerwarningrE   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   r   block_per_rowr\   r   r   r   r   )r   r   r   kernel_loadeder   s        r*   r   zMraSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 (t3"$);)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$ !88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,+  n!hijhklmmns   =
H, ,	I5IIc                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr=   r   r7   r   r   )r?   r  r  viewpermute)r   layernew_layer_shapes      r*   transpose_for_scoresz%MraSelfAttention.transpose_for_scores9  sO    **,s+t/G/GIaIa.bb

O,}}Q1a((r4   c           
         | j                  |      }| j                  | j                  |            }| j                  | j                  |            }| j                  |      }|j	                         \  }}}	}
d|dz  z   }|j                         j                  d|d      j                  ||z  |	      j                         }d}|
|k  r|||	||
z
  f}t        j                  |t        j                  ||j                        gd      }t        j                  |t        j                  ||j                        gd      }t        j                  |t        j                  ||j                        gd      }t        |j                         |j                         |j                         |j                         | j                  | j                   | j"                  | j$                        }|
|k  r|d d d d d d d |
f   }|j                  |||	|
      }|j'                  d	d
dd      j)                         }|j	                         d d | j*                  fz   } |j,                  | }|f}|S )N      ?r   r   r9   r   r=   r;   )r   r   r   r   r7   r   r:   )r   r   r   r   r?   squeezer   rW   rE   rT   catr   rR   r   r   r\   r   r   r   r  rD   r  r  )r   hidden_statesattention_maskmixed_query_layer	key_layervalue_layerquery_layerrZ   	num_headsr[   r   gpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                   r*   rz   zMraSelfAttention.forward>  sD    JJ}5--dhh}.EF	//

=0IJ//0AB3>3C3C3E0
Iw ~77""$++Ay!<DDZR[E[]deiik 	 m#!9g}x7OOH))[%++h{OaOa2b$ciklK		9ekk(9K[K[.\"]cefI))[%++h{OaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 m#)!Q9H9*<=M%--j)WhW%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD "r4   r%   )r   r   r   r   r   rz   r
  r  s   @r*   r  r    s    !VF)
0r4   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )MraSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r  r   denser   r   r   r   r   r   s     r*   r   zMraSelfOutput.__init__s  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r4   r%  input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r%   r4  r   r   r   r%  r5  s      r*   rz   zMraSelfOutput.forwardy  7    

=1]3}|'CDr4   r   r   r   r   rT   Tensorrz   r
  r  s   @r*   r1  r1  r  1    >U\\  RWR^R^ r4   r1  c                   .     e Zd Zd fd	Zd ZddZ xZS )MraAttentionc                     t         |           t        ||      | _        t	        |      | _        t               | _        y )N)r   )r   r   r  r   r1  r   setpruned_heads)r   r   r   r   s      r*   r   zMraAttention.__init__  s3    $VE\]	#F+Er4   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r;   )r>   r   r   r  r  rB  r   r   r   r   r   r4  r  union)r   headsindexs      r*   prune_headszMraAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r4   c                 f    | j                  ||      }| j                  |d   |      }|f|dd  z   }|S Nr   r   )r   r   )r   r%  r&  self_outputsattention_outputr/  s         r*   rz   zMraAttention.forward  s@    yy?;;|AF#%QR(88r4   r%   )r   r   r   r   rG  rz   r
  r  s   @r*   r?  r?    s    ";$r4   r?  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r%   )r   r   r   r  r   intermediate_sizer4  
isinstance
hidden_actstrr   intermediate_act_fnr   s     r*   r   zMraIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r4   r%  r6  c                 J    | j                  |      }| j                  |      }|S r%   )r4  rS  r   r%  s     r*   rz   zMraIntermediate.forward  s&    

=100?r4   r;  r  s   @r*   rM  rM    s#    9U\\ ell r4   rM  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	MraOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r3  )r   r   r   r  rO  r   r4  r   r   r   r   r   r   s     r*   r   zMraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r4   r%  r5  r6  c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r%   r8  r9  s      r*   rz   zMraOutput.forward  r:  r4   r;  r  s   @r*   rW  rW    r=  r4   rW  c                   ,     e Zd Z fdZddZd Z xZS )MraLayerc                     t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        t        |      | _        t        |      | _
        y Nr   )r   r   chunk_size_feed_forwardseq_len_dimr?  	attentionadd_cross_attentionrM  intermediaterW  r   r   s     r*   r   zMraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r4   c                     | j                  ||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S rI  )r`  r   feed_forward_chunkr^  r_  )r   r%  r&  self_attention_outputsrK  r/  layer_outputs          r*   rz   zMraLayer.forward  sc    !%~!N1!4(,0##T%A%A4CSCSUe
  /G+r4   c                 L    | j                  |      }| j                  ||      }|S r%   )rb  r   )r   rK  intermediate_outputrf  s       r*   rd  zMraLayer.feed_forward_chunk  s,    "//0@A{{#68HIr4   r%   )r   r   r   r   rz   rd  r
  r  s   @r*   r[  r[    s    (r4   r[  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
MraEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersr[  r  gradient_checkpointing)r   r   rf   r   s      r*   r   zMraEncoder.__init__  sN    ]]eFD\D\>]#^HV$4#^_
&+# $_s   A#c                 6   |rdnd }t        | j                        D ]Q  \  }}|r||fz   }| j                  r*| j                  r| j	                  |j
                  ||      }	n	 |||      }	|	d   }S |r||fz   }|st        d ||fD              S t        ||      S )Nr&   r   c              3   &   K   | ]	  }||  y wr%   r&   ).0vs     r*   	<genexpr>z%MraEncoder.forward.<locals>.<genexpr>   s     Xq!-Xs   )last_hidden_stater%  )	enumerater  ro  training_gradient_checkpointing_func__call__tupler   )
r   r%  r&  	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss
             r*   rz   zMraEncoder.forward  s     #7BD(4 	-OA|#$58H$H!**t}} $ A A ))!"! !-]N K)!,M	-   1]4D DX]4E$FXXX1++
 	
r4   )NNFT)r   r   r   r   rz   r
  r  s   @r*   rj  rj    s    , "!
r4   rj  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r3  )r   r   r   r  r   r4  rP  rQ  rR  r   transform_act_fnr   r   r   s     r*   r   z#MraPredictionHeadTransform.__init__	  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr4   r%  r6  c                 l    | j                  |      }| j                  |      }| j                  |      }|S r%   )r4  r  r   rU  s     r*   rz   z"MraPredictionHeadTransform.forward  s4    

=1--m<}5r4   r;  r  s   @r*   r  r    s$    UU\\ ell r4   r  c                   *     e Zd Z fdZd Zd Z xZS )MraLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)bias)r   r   r  	transformr   r  r   r   decoder	ParameterrT   r   r  r   s     r*   r   zMraLMPredictionHead.__init__  sm    3F; yy!3!3V5F5FUSLLV->->!?@	 !IIr4   c                 :    | j                   | j                  _         y r%   )r  r  r   s    r*   _tie_weightsz MraLMPredictionHead._tie_weights(  s     IIr4   c                 J    | j                  |      }| j                  |      }|S r%   )r  r  rU  s     r*   rz   zMraLMPredictionHead.forward+  s$    }5]3r4   )r   r   r   r   r  rz   r
  r  s   @r*   r  r    s    &&r4   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraOnlyMLMHeadc                 B    t         |           t        |      | _        y r%   )r   r   r  predictionsr   s     r*   r   zMraOnlyMLMHead.__init__3  s    .v6r4   sequence_outputr6  c                 (    | j                  |      }|S r%   )r  )r   r  prediction_scoress      r*   rz   zMraOnlyMLMHead.forward7  s     ,,_=  r4   r;  r  s   @r*   r  r  2  s#    7!u|| ! !r4   r  c                   "    e Zd ZdZeZdZdZd Zy)MraPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    r#   Tc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )r   stdNr"  )rP  r   r  weightdatanormal_r   initializer_ranger  zero_r   r   r   fill_)r   modules     r*   _init_weightsz MraPreTrainedModel._init_weightsG  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r4   N)	r   r   r   r	  r   config_classbase_model_prefixsupports_gradient_checkpointingr  r&   r4   r*   r  r  =  s    
 L&*#*r4   r  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MraConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
ak	  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z]The bare MRA Model transformer outputting raw hidden-states without any specific head on top.c                   p    e Zd Z fdZd Zd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 	 ddeej                      deej                      d	eej                      d
eej                      deej                      deej                      dee   dee   deeef   fd              Z xZS )MraModelc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r%   )r   r   r   r   r  rj  encoder	post_initr   s     r*   r   zMraModel.__init__  s;     '/!&) 	r4   c                 .    | j                   j                  S r%   r  r   r  s    r*   get_input_embeddingszMraModel.get_input_embeddings  s    ...r4   c                 &    || j                   _        y r%   r  )r   r   s     r*   set_input_embeddingszMraModel.set_input_embeddings  s    */'r4   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r`  rG  )r   heads_to_pruner  rE  s       r*   _prune_headszMraModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr4   batch_size, sequence_length
checkpointoutput_typer  r  r&  r   r   r{  r  r|  r}  r6  c	                    ||n| j                   j                  }||n| j                   j                  }||t        d      |#| j	                  ||       |j                         }	n!||j                         d d }	nt        d      |	\  }
}||j                  n|j                  }|t        j                  |
|f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  |
|      }|}n&t        j                  |	t        j                  |      }| j                  ||	      }| j!                  || j                   j"                        }| j                  ||||      }| j%                  |||||      }|d	   }|s	|f|d
d  z   S t'        ||j(                  |j*                  |j,                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer=   z5You have to specify either input_ids or inputs_embedsr   r   rP   )r  r   r   r  )r&  r{  r|  r}  r   r   )ru  r%  
attentionscross_attentions)r   r|  use_return_dictr@   %warn_if_padding_and_no_attention_maskr?   rR   rT   r   r   r  r   r   r   rV   get_extended_attention_maskget_head_maskrn  r  r   r%  r  r  )r   r  r&  r   r   r{  r  r|  r}  r  rZ   r  rR   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r*   rz   zMraModel.forward  s   $ %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 ,,2!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r4   )NNNNNNNN)r   r   r   r   r  r  r  r   MRA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rT   r<  boolr   r   rz   r
  r  s   @r*   r  r    s   
/0C ++?+F+FGd+ef&6$ -11515/3,004/3&*J
ELL)J
 !.J
 !.	J

 u||,J
 ELL)J
  -J
 'tnJ
 d^J
 
u88	9J
 gJ
r4   r  z1MRA Model with a `language modeling` head on top.c                       e Zd ZddgZ fdZd Zd Z eej                  d             e
eee      	 	 	 	 	 	 	 	 	 ddeej                      d	eej                      d
eej                      deej                      deej                      deej                      deej                      dee   dee   deeef   fd              Z xZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r%   )r   r   r  r#   r  clsr  r   s     r*   r   zMraForMaskedLM.__init__  s4     F#!&) 	r4   c                 B    | j                   j                  j                  S r%   )r  r  r  r  s    r*   get_output_embeddingsz$MraForMaskedLM.get_output_embeddings  s    xx##+++r4   c                     || j                   j                  _        |j                  | j                   j                  _        y r%   )r  r  r  r  )r   new_embeddingss     r*   set_output_embeddingsz$MraForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r4   r  r  r  r&  r   r   r{  r  labelsr|  r}  r6  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr&  r   r   r{  r  r|  r}  r   r=   r   losslogitsr%  r  )
r   r  r#   r  r	   r  r   r   r%  r  )r   r  r&  r   r   r{  r  r  r|  r}  r/  r  r  masked_lm_lossloss_fctr   s                   r*   rz   zMraForMaskedLM.forward  s    0 &1%<k$++B]B](())%'!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r4   	NNNNNNNNN)r   r   r   _tied_weights_keysr   r  r  r   r  r  r   r  r   r  r   rT   r<  r  r   r   rz   r
  r  s   @r*   r  r    s+   :<Z[,8 ++?+F+FGd+ef&"$ -11515/3,004)-/3&*0
ELL)0
 !.0
 !.	0

 u||,0
 ELL)0
  -0
 &0
 'tn0
 d^0
 
un$	%0
 g0
r4   r  c                   (     e Zd ZdZ fdZd Z xZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                 4   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        || _        y r%   )r   r   r   r  r   r4  r   r   r   
num_labelsout_projr   r   s     r*   r   zMraClassificationHead.__init__S  sg    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr4   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r   r4  r   r   rQ  r  )r   featureskwargsxs       r*   rz   zMraClassificationHead.forward[  se    Q1WLLOJJqM4;;))*1-LLOMM!r4   r  r  s   @r*   r  r  P  s    7r4   r  zMRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.c                   ~    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   deee	f   fd              Z xZS )MraForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        |      | _        | j                          y r%   )r   r   r  r  r#   r  
classifierr  r   s     r*   r   z%MraForSequenceClassification.__init__k  sA      ++F#/7 	r4   r  r  r  r&  r   r   r{  r  r  r|  r}  r6  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|	s|f|
dd z   }||f|z   S |S t        |||
j                   |
j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr=   r  )r   r  r#   r  problem_typer  rQ   rT   rV   rE   r
   r#  r	   r  r   r   r%  r  )r   r  r&  r   r   r{  r  r  r|  r}  r/  r  r  r  r  r   s                   r*   rz   z$MraForSequenceClassification.forwardt  s   0 &1%<k$++B]B](())%'!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r4   r  )r   r   r   r   r   r  r  r   r  r   r  r   rT   r<  r  r   r   rz   r
  r  s   @r*   r  r  e  s"    ++?+F+FGd+ef&,$ -11515/3,004)-/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
  -A
 &A
 'tnA
 d^A
 
u..	/A
 gA
r4   r  zMRA Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.c                   ~    e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   deee	f   fd              Z xZS )MraForMultipleChoicec                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  d      | _        | j                          y r]  )
r   r   r  r#   r   r  r   pre_classifierr  r  r   s     r*   r   zMraForMultipleChoice.__init__  s_     F# ii(:(:F<N<NO))F$6$6: 	r4   z(batch_size, num_choices, sequence_lengthr  r  r&  r   r   r{  r  r  r|  r}  r6  c
           
         |	|	n| j                   j                  }	||j                  d   n|j                  d   }
|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }|dddf   }| j                  |      } t        j                         |      }| j                  |      }|j                  d|
      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r=   r:   r  r   r  )r   r  rS   r  r?   r#   r  r   ReLUr  r	   r   r%  r  )r   r  r&  r   r   r{  r  r  r|  r}  num_choicesr/  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r*   rz   zMraForMultipleChoice.forward  s   0 &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  	
 qz$QT*++M:!	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r4   r  )r   r   r   r   r   r  r  r   r  r   r  r   rT   r<  r  r   r   rz   r
  r  s   @r*   r  r    s"    ++?+F+FGq+rs&-$ -11515/3,004)-/3&*@
ELL)@
 !.@
 !.	@

 u||,@
 ELL)@
  -@
 &@
 'tn@
 d^@
 
u//	0@
 t@


@add_start_docstrings(
    """MRA Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
    MRA_START_DOCSTRING,
)
class MraForTokenClassification(MraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mra = MraModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep the active part of the loss: padded positions are mapped to ignore_index.
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
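

# Illustrative usage sketch (not part of the upstream module): token classification with padding.
# The helper name and config values are assumptions mirroring the sketch above; the point is that
# positions with attention_mask == 0 are mapped to the loss function's ignore_index, so they do not
# contribute to the loss. Meaningful attention again requires the compiled MRA kernel.
def _token_classification_usage_sketch():
    config = MraConfig(
        vocab_size=1000,
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=128,
        max_position_embeddings=64,
        num_labels=5,
    )
    model = MraForTokenClassification(config).eval()

    input_ids = torch.randint(0, config.vocab_size, (2, 32))
    attention_mask = torch.ones_like(input_ids)
    attention_mask[:, 24:] = 0  # pretend the last 8 positions are padding
    labels = torch.randint(0, config.num_labels, (2, 32))

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    # Per-token logits have shape (batch_size, seq_len, num_labels); padded tokens are excluded from the loss.
    print(outputs.logits.shape, outputs.loss.item())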


@add_start_docstrings(
    """MRA Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
    MRA_START_DOCSTRING,
)
class MraForQuestionAnswering(MraPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        config.num_labels = 2
        self.num_labels = config.num_labels

        self.mra = MraModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MRA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.mra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions fall outside our model inputs; we ignore these terms.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
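

# Illustrative usage sketch (not part of the upstream module): the extractive QA head. The helper
# name, config values, and the start/end positions are arbitrary assumptions; the example only shows
# how the two logit tensors are produced and how a predicted span would be read off with argmax.
# As above, meaningful attention outputs require the compiled MRA CUDA kernel.
def _question_answering_usage_sketch():
    config = MraConfig(
        vocab_size=1000,
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=128,
        max_position_embeddings=64,
    )
    model = MraForQuestionAnswering(config).eval()

    input_ids = torch.randint(0, config.vocab_size, (2, 32))
    attention_mask = torch.ones_like(input_ids)
    start_positions = torch.tensor([3, 7])
    end_positions = torch.tensor([5, 9])

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
    # Each logit tensor has shape (batch_size, seq_len); the loss averages the start and end cross-entropies.
    start_pred = outputs.start_logits.argmax(dim=-1)
    end_pred = outputs.end_logits.argmax(dim=-1)
    print(outputs.loss.item(), start_pred.tolist(), end_pred.tolist())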