
    sg                     0   d Z ddlZddlmZmZmZmZ ddlZddlm	c m
Z ddlZddlm	Z	 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZ  ej>                  e       Z!dZ"dZ# G d de	jH                        Z% G d de	jH                        Z& G d de	jH                        Z' G d de	jH                        Z( G d de	jH                        Z) G d de	jH                        Z* G d de	jH                        Z+ G d de	jH                        Z, G d  d!e	jH                        Z- G d" d#e	jH                        Z. G d$ d%e	jH                        Z/ G d& d'e      Z0d(Z1d)Z2 ed*e1       G d+ d,e0             Z3 ed-e1       G d. d/e0e             Z4y)0zPyTorch CPMAnt    N)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)GenerationMixin)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )CpmAntConfigzopenbmb/cpm-ant-10br   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )CpmAntLayerNormzv
    We use Root Mean Square (RMS) Layer Normalization, please see https://arxiv.org/abs/1910.07467 for details."
    configc                     t         |           |j                  | _        |j                  | _        t        j                  t        j                  |j                              | _	        y N)
super__init__epshidden_sizedim_normr   	Parametertorchemptyweightselfr   	__class__s     ]/var/www/html/venv/lib/python3.12/site-packages/transformers/models/cpmant/modeling_cpmant.pyr   zCpmAntLayerNorm.__init__-   sE    ::**ll5;;v/A/A#BC    hidden_statesc                 p   |j                  d      | j                  k7  rt        d      |j                  }|j	                  t
        j                        j                  d      j                  dd      }|t        j                  || j                  z         z  j	                  |      | j                  z  }|S )f
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        z'hidden_states.size(-1) != self.dim_norm   T)dimkeepdim)sizer   AssertionErrordtypetor    float32powmeanrsqrtr   r"   )r$   r(   	old_dtypevariances       r&   forwardzCpmAntLayerNorm.forward4   s    
 b!T]]2 !JKK!''	 ##EMM266q9>>2t>T&X5H)IIMMiX[_[f[ffr'   )
__name__
__module____qualname____doc__r   r   r    Tensorr9   __classcell__r%   s   @r&   r   r   (   s&    D| D
U\\ 
r'   r   c                        e Zd Zdef fdZ	 	 	 ddej                  dej                  dej                  dej                  dee	   dee
ej                  ej                  f      d	ee	   fd
Z xZS )CpmAntAttentionr   c                 H   t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j
                  | j                  z  d      | _	        t        j                  | j                  | j
                  | j                  z  d      | _
        t        j                  | j                  | j
                  | j                  z  d      | _        t        j                  | j
                  | j                  z  | j                  d      | _        t        j                  j                  d      | _        |j                   0t        j                  j#                  |j                         | _        y d | _        y )NFbiasr+   r-   )p)r   r   r   	dim_modelnum_attention_heads	num_headsdim_headr   Linear	project_q	project_k	project_vattention_outr    Softmaxsoftmax	dropout_pDropoutdropoutr#   s     r&   r   zCpmAntAttention.__init__B   s   ++334>>4>>DMM3QX]^4>>4>>DMM3QX]^4>>4>>DMM3QX]^YYt~~'Et~~\abxx''B'/' 88++f.>.>+?DLDLr'   hidden_q	hidden_kvattention_maskposition_biasoutput_attentionspast_key_values	use_cachec           	         |j                  d      }|j                  d      }	|j                  d      }
| j                  |      }| j                  |      }| j                  |      }|j	                  ||	| j
                  | j                        j                  dddd      }|j	                  ||
| j
                  | j                        j                  dddd      }|j	                  ||
| j
                  | j                        j                  dddd      }|It        j                  |d   |gd      }t        j                  |d   |gd      }|j                  d      }
t        j                  ||j                  dd            t        j                  | j                        z  }||z   }t        j                  ||j	                  |d|	|
      t        j                  d	      k(  t        j                   t#        d
      |j$                  |j&                              }| j)                  |      }t        j                  ||j	                  |d|	|
      t        j                  d	      k(  t        j                   d|j$                  |j&                              }|r|}nd}| j*                  | j+                  |      }t        j                  ||      }|j	                  || j
                  |	| j                        j                  dddd      }|j-                         j	                  ||	| j
                  | j                  z        }| j/                  |      }d}|r||f}|||fS )a  
        Args:
            hidden_q (`torch.Tensor`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            hidden_kv (`torch.Tensor` of shape `(batch, len_k, dim_model)`)):
                Tensor *key_value* and *query* of shape `(batch, len_k, dim_model)`
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor]`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        r   r   r,   r	   NrF   r+   Fz-inf)devicer1   )r/   rM   rN   rO   viewrJ   rK   permuter    catmatmul	transposemathsqrtmasked_filltensorscalar_tensorfloatr_   r1   rR   rU   
contiguousrP   )r$   rV   rW   rX   rY   rZ   r[   r\   
batch_sizelen_qlen_kquerykeyvaluescoreattn_weightss                   r&   r9   zCpmAntAttention.forwardU   s   8 ]]1%
a q!x(nnY'y)

:udnndmmLTTUVXY[\^_`hhz5$..$--HPPQRTUWXZ[\

:udnndmmLTTUVXY[\^_`&))_Q/52>CIIq159rBEHHRLE UCMM"b$9:TYYt}}=UU%!!
Aue<U@SSfell%++V

 U#!!
Aue<U@SS%,,ekkJ

  LL<<#LL'E UE*

:t~~udmmLTTUVXY[\^_`  "''
E4>>DMM;YZ""5)"ElOlO33r'   )FNN)r:   r;   r<   r   r   r    r>   
BoolTensorr   boolr   r9   r?   r@   s   @r&   rB   rB   A   s     |  2 -2GK$(Q4,,Q4 <<Q4 ((	Q4
 ||Q4 $D>Q4 "%ell(B"CDQ4 D>Q4r'   rB   c                        e Zd Zdef fdZ	 	 	 	 d
dej                  dej                  deej                     dee   dee	ej                  ej                  f      dee   fd	Z
 xZS )CpmAntSelfAttentionBlockr   c                     t         |           t        |      | _        t	        |      | _        |j                  r/t        j                  j                  |j                        | _
        y d | _
        y r   )r   r   r   layernorm_before_attentionrB   self_attentionrS   r    r   rT   rU   r#   s     r&   r   z!CpmAntSelfAttentionBlock.__init__   sT    *9&*A'-f5 88++F,<,<=DLDLr'   r(   rX   rY   rZ   r[   r\   c           	          | j                  |      }| j                  |||||||      }|\  }}}	| j                  | j                  |      }||z   }|||	fS )a  
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Input of transformer block(self-attention block). It can be the raw embedding of a batch of sequences.
            attention_mask (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Avoid invalid areas to participate in the calculation of self-attention.
            position_bias (`torch.Tensor` of shape `(batch, len_seq, len_seq)`):
                Provide positional information to self-attention block.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*):
                Cached past key and value projection states.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )ry   rz   rU   )
r$   r(   rX   rY   rZ   r[   r\   outputsrs   current_key_values
             r&   r9   z CpmAntSelfAttentionBlock.forward   su    2 11-@%%Wnm=NP_aj
 4;00<<#ll7+G%/l,===r'   NFNNr:   r;   r<   r   r   r    r>   r   ru   r   r9   r?   r@   s   @r&   rw   rw      s     |   15,1GK$($>||$> $>  -	$>
 $D>$> "%ell(B"CD$> D>$>r'   rw   c                   D     e Zd Zdef fdZdej                  fdZ xZS )CpmAntDenseGatedACTr   c                 ,   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  j                         | _
        y NFrD   )r   r   r   rL   r   dim_ffw_0w_1r    GELUactr#   s     r&   r   zCpmAntDenseGatedACT.__init__   s[    99V//UK99V//UK88==?r'   r(   c                 r    | j                  | j                  |            }| j                  |      }||z  }|S )zTransform an input tensor from one feature space to another via a nonlinear operation

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, seq_len, dim_in)`)
        )r   r   r   )r$   r(   
gate_scores      r&   r9   zCpmAntDenseGatedACT.forward   s9     XXdhh}56
/"]2r'   	r:   r;   r<   r   r   r    r>   r9   r?   r@   s   @r&   r   r      s    #| #
U\\ 
r'   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )CpmAntFeedForwardr   c                 (   t         |           t        |      | _        |j                  /t
        j                  j                  |j                        | _        nd | _        t        j                  |j                  |j                  d      | _        y r   )r   r   r   w_inrS   r    r   rT   rU   rL   r   r   w_outr#   s     r&   r   zCpmAntFeedForward.__init__   sg    '/	' 88++F,<,<=DLDLYYv}}f.@.@uM
r'   r(   c                     | j                  |      }| j                  | j                  |      }| j                  |      }|S )r*   )r   rU   r   r$   r(   s     r&   r9   zCpmAntFeedForward.forward   s>    
 		-0<<# LL7M

=1r'   r   r@   s   @r&   r   r      s!    N| NU\\ r'   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )CpmAntFFNBlockr   c                     t         |           t        |      | _        t	        |      | _        |j                  r/t        j                  j                  |j                        | _
        y d | _
        y r   )r   r   r   layernorm_before_ffnr   ffnrS   r    r   rT   rU   r#   s     r&   r   zCpmAntFFNBlock.__init__	  sS    $3F$;!$V, 88++F,<,<=DLDLr'   r(   c                     | j                  |      }| j                  |      }| j                  | j                  |      }||z   }|S )z
        Args:
            hidden_states (`torch.Tensor` of shape `(batch, len_seq, dim_model)`):
                Hidden states before feed forward layer.
        )r   r   rU   )r$   r(   
ln_outputsr|   s       r&   r9   zCpmAntFFNBlock.forward  sJ     ..}=
((:&<<#ll7+G%/r'   r   r@   s   @r&   r   r     s      |  ||r'   r   c                        e Zd Zdef fdZ	 	 	 	 d
dej                  dej                  deej                     dee   dee	ej                  ej                  f      dee   fd	Z
 xZS )CpmAntTransformerBlockr   c                 b    t         |           t        |      | _        t	        |      | _        y r   )r   r   rw   self_attr   r   r#   s     r&   r   zCpmAntTransformerBlock.__init__$  s&    08!&)r'   r(   rX   rY   rZ   r[   r\   c                 h    | j                  ||||||      }|\  }}}| j                  |      }|||fS )a  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        )rX   rY   rZ   r[   r\   )r   r   )	r$   r(   rX   rY   rZ   r[   r\   rs   r}   s	            r&   r9   zCpmAntTransformerBlock.forward)  sU    2 )'/+ & 
 :G6|%6/l,===r'   r~   r   r@   s   @r&   r   r   #  s    *| * 15,1GK$(&>||&> &>  -	&>
 $D>&> "%ell(B"CD&> D>&>r'   r   c                        e Zd Zdef fdZ	 	 	 	 ddej                  dej                  dej                  dee   dee   dee	ej                  ej                  f      d	ee   fd
Z
 xZS )CpmAntEncoderr   c                     t         |           |j                  | _        t	        j
                  t        | j                        D cg c]  }t        |       c}      | _        t        |      | _
        y c c}w r   )r   r   num_hidden_layers
num_layersr   
ModuleListranger   layersr   output_layernorm)r$   r   ithr%   s      r&   r   zCpmAntEncoder.__init__S  s[     22mmuUYUdUdOe$f%;F%C$fg / 7 %gs   A6r(   rX   rY   rZ   output_hidden_statesr[   r\   c           	         |rdnd}|rdnd}	|rdnd}
t        | j                        D ]9  \  }}|r||fz  } ||||||r||   nd|      }|\  }}}|r|	|fz  }	|4|
|fz   }
; | j                  |      }|r||fz  }||
||	fS )a%  
        Args:
            hidden_states (`torch.Tensor`):
                Input to the layer of shape `(batch, seq_len, dim_model)`
            attention_mask (`torch.Tensor`):
                Avoid invalid areas to participate in the calculation of shape `(batch, seq_len, seq_len)`
            position_bias (`torch.Tensor`):
                Provides position information to attention mechanism of shape `(num_heads, seq_len, seq_len)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            past_key_values (`Tuple[torch.Tensor, torch.Tensor])`, *optional*):
                Cached past key and value projection states
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
         N)rZ   r[   r\   )	enumerater   r   )r$   r(   rX   rY   rZ   r   r[   r\   all_hidden_statesall_self_attnscurrent_key_valuesilayerlayer_outputsrs   r}   s                   r&   r9   zCpmAntEncoder.forwardZ  s    8 #7BD0d#,R$!$++. 	OHAu#!m%55!!"36E 24#M >K:M<): </1 ,%7;L:N%N"	O" --m<-!1102C^SSr'   )NNNNr   r@   s   @r&   r   r   R  s    8| 8 -1/3GK$(6T||6T 6T ||	6T
 $D>6T 'tn6T "%ell(B"CD6T D>6Tr'   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CpmAntIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   rL   r   intermediate_sizedense
isinstance
hidden_actstrr
   intermediate_act_fnr#   s     r&   r   zCpmAntIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r'   r(   returnc                 J    | j                  |      }| j                  |      }|S r   )r   r   r   s     r&   r9   zCpmAntIntermediate.forward  s&    

=100?r'   r:   r;   r<   r   r    r>   r9   r?   r@   s   @r&   r   r     s#    9U\\ ell r'   r   c                        e Zd Zdef fdZdej                  dej                  dej                  dej                  fdZd Zd
d	Z	 xZ
S )CpmAntSegmentPositionEmbeddingr   c                 b   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        t        j                  t        j                  |j                  |j                  z  |j                  z   |j                              | _        y r   )r   r   rI   rJ   position_bias_num_bucketsnum_bucketsposition_bias_max_distancemax_distancesegment_typesnum_segmentsr   r   r    r!   relative_attention_biasr#   s     r&   r   z'CpmAntSegmentPositionEmbedding.__init__  s    33!;;"=="00')||KK$$v';';;f>^>^^**(
$r'   key_pos	query_poskey_segmentquery_segmentc           	      0   t        j                         5  |j                  d      }|j                  d      }|j                  d      }|j                  d      |j                  d      k7  r0t        d|j                  d       d|j                  d       d      ||j                  d      k7  s||j                  d      k7  r!t        d| d|j                  d       d      ||j                  d      k7  r!t        d| d|j	                  d       d      |j                  |d|      }|j                  ||d      }|j                  |d|      }|j                  ||d      }| j                  ||      }|| j                  z   }| j                  t        j                  |t         j                  |j                  	      d d d f   t        j                  |t         j                  |j                  	      d d d f   z
  | j                  | j                  
      }	t        j                  ||k(  |	d d d d d f   |      }d d d        t        j                  | j                         }
|
j#                  dddd      j%                         }
|
S # 1 sw Y   MxY w)Nr   r   z>key_pos.size(0) should be equal to query_pos.size(0), but got z and !z7keylen should be equal to key_segment.size(1), but got z;querylen should be equal to query_segment.size(1), but got r+   r1   r_   )r   r   r	   r,   )r    no_gradr/   r0   szier`   !_segment_relative_position_bucketr   _position_bucketarangeint32r_   r   whereF	embeddingr   ra   rk   )r$   r   r   r   r   batchkeylenquerylenrelative_position_bucketabsolute_position_bucketembedss              r&   r9   z&CpmAntSegmentPositionEmbedding.forward  s    ]]_ %	LLOE\\!_F ~~a(H||A).."33$TU\UaUabcUdTeejktkykyz{k|j}}~  ))!,,M<N<Nq<Q0Q$MfXUZ[f[k[klm[nZoopq  =--a00$QRZQ[[`anasastuav`wwxy  ll5"f5G!uh;I%**5"f=K)..uhCM'+'M'Mm]h'i$'?$BRBR'R$ (,'<'<V5;;?W?^?^_`dfg`gh,,xu{{C[CbCbcdegkdklm ,,!..	 (= ($ (-{{-(q!4(($C%	P 5t7S7ST1a+668W%	 %	s   H+JJc                 &    || j                   z  |z   S r   )r   )r$   r   r   s      r&   r   z@CpmAntSegmentPositionEmbedding._segment_relative_position_bucket  s    t000;>>r'   c                 .   d}|dz  }|dkD  j                  t        j                        |z  }t        j                  |      }|dz  }||k  }|t        j                  |j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  ||j                  t        j                        |      z  }|S )Nr   r,   r   )
r2   r    r   abslogrj   re   min	full_liker   )r$   relative_positionr   r   relative_buckets	max_exactis_smallrelative_postion_if_larges           r&   r   z/CpmAntSegmentPositionEmbedding._position_bucket  s   -155ekkB[P!II&781$	$y0$-II'--/);<hh|i/01Y&( "U[[/	%!
 %*II%OO5{QG%
! 	EKK2C2F2Fu{{2SUnoor'   )       )r:   r;   r<   r   r   r    r>   r9   r   r   r?   r@   s   @r&   r   r     sU    
| 
22 <<2 \\	2
 ||2h? r'   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )CpmAntOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y )N)r   )r   r   r   rL   r   r   r   	LayerNormlayer_norm_epsrT   hidden_dropout_probrU   r#   s     r&   r   zCpmAntOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r'   r(   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   )r   rU   r   )r$   r(   r   s      r&   r9   zCpmAntOutput.forward  s7    

=1]3}|'CDr'   r   r@   s   @r&   r   r      s1    >U\\  RWR^R^ r'   r   c                       e Zd ZdZeZdZd Zy)CpmAntPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    cpmantc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r&|j                  j                  j                  d       yt        |t              r<|j                   j                  j                  d| j                  j                         yy)zInitialize the weightsg        )r5   stdNg      ?)r   r   rL   r"   datanormal_r   init_stdrE   zero_	Embeddingpadding_idxr   fill_r   r   r   )r$   modules     r&   _init_weightsz#CpmAntPreTrainedModel._init_weights  s[   fbii(MM&&CT[[5I5I&J{{&  &&( '-MM&&CT[[5I5I&J!!-""6#5#56<<> .-KK""$MM$$S)0MM$$S) >?**//77SdkkFZFZ7[ @r'   N)r:   r;   r<   r=   r   config_classbase_model_prefixr  r   r'   r&   r   r     s    
  L \r'   r   aB  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters
        config ([`~CpmAntConfig`]): Model configuration class with all the parameters of the
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zTThe bare CPMAnt Model outputting raw hidden-states without any specific head on top.c                       e Zd Zdef fdZd Zd Zd Z ee	       e
eee      	 	 	 	 	 	 ddeej                      dee   d	ee   d
eeeej                            dee   dee   deeej                      ef   fd              Z xZS )CpmAntModelr   c                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                  |j                  z  z   |j                        | _        t        |      | _        |j                  | _        |j                  | _	        | j                          y r   )r   r   r   encoderr   r   r   r   segment_embedding
vocab_sizeprompt_typesprompt_lengthinput_embeddingr   rY   	post_initr#   s     r&   r   zCpmAntModel.__init__R  s     $V,!#f.B.BFDVDV!W!|| 3 3f6J6J JJFL^L^ 
 <FC#11 ++r'   c                     | j                   S r   r  r$   s    r&   get_input_embeddingsz CpmAntModel.get_input_embeddings_  s    ###r'   c                     || _         y r   r  )r$   
embeddingskwargss      r&   set_input_embeddingsz CpmAntModel.set_input_embeddingsb  s
    )r'   c                 *   |j                  d      }|j                  d      }|j                  }t        j                  ||      t        j                  ||      j	                  dd      k  }|d d d d d f   |d d d d d f   j                         |j	                  d||      z  z  }	|	|d d d d d f   |d d d d d f   k(  z  }	t        j                  t        t        || j                  z
              d d d   |      d d d f   j                  |d      |d d d f   k  }
t        j                  t        j                  || j                  |      j                         |
fd      }
|
j	                  ||d      |
j	                  |d|      z  |	z  }	|	S )Nr   r   )r_   r+   rF   )r/   r_   r    r   r`   logical_notrh   listr   r  repeatrb   onesru   )r$   	input_idsspancontextlengthr   seqlenr_   directional_mask_2drX   mask_1ds              r&   _prepare_attention_maskz#CpmAntModel._prepare_attention_maske  s   q!"!!#ll6&AU\\RXagEhEmEmnprsEtt D!,Aq$J++-0C0H0HFTZ0[[
 (44
+;tAq$J?O+OP LLeFT-?-?$?@A$B$GPVWX\^_X_`gghmopqQWo 	 ))UZZt/A/A&QVVXZabhij eVQ7',,uaQW:XX[iir'   
checkpointoutput_typer  r  rZ   r   r[   r\   return_dictr   c           	         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j
                  t        j                  k7  r|j                  t        j                        }|j
                  |j                  }	}t        j                  |dk7  dd      j                  ||	      }
|
dk7  j                  d      j                  ||	      }t        j                  t        j                  | j                  dz  | j                  z   | j                  dz  | j                  z   ||	      j!                  |j#                  d      d      |fd      }|j#                         \  }}t        j                  t        j$                  || j                  ||	      |
fd      }
t        j&                  ||fd||	      }t        j                  |||	      j!                  |d      }t        j&                  ||fd||	      }|]d}t)        d g| j*                  j,                  z        }|j/                         }| j1                  |      }| j3                  |
      }||z   }nH|d   d   j#                  d      }| j3                  |
      }| j1                  |      |d d dd d d f   z   }| j5                  ||||      }| j7                  |||
|
      }|d d |d d d f   }|d d d d |d d d f   }|d d |d d d f   }| j+                  |||||||      \  }}}}|dk(  rw|d d | j                  d d d f   }|4d	}|D ]+  }||d d d d | j                  d | j                  d f   fz  }- |}|'d	}|D ]  }||d d | j                  d d d f   fz  }  |}|st)        d
 ||||fD              S t9        ||||      S )Nr   r,   r   r+   r	   r   rF   r^   r   c              3   &   K   | ]	  }||  y wr   r   ).0vs     r&   	<genexpr>z&CpmAntModel.forward.<locals>.<genexpr>  s      efers   )last_hidden_stater[   r(   
attentions)r   rZ   r   use_return_dictr\   r1   r    r   r2   r_   r   sumrb   r   r  r	  r  r/   zerosfulltupler  r   rk   r  r  r"  rY   r   )r$   r  rZ   r   r[   r\   r&  r  r1   r_   segmentr  r   
seq_lengthr  positionr  past_lengthr(   segment_statesrX   rY   present_key_valuesr   all_attentionsnew_attentions	attentionnew_hidden_stateshidden_states                                r&   r9   zCpmAntModel.forwardw  s.     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!*!6IDKK<Q<Q	 ??ekk)!U[[1I!)9)9v++i1na366U66RQ,##B'**v*FII&&*T__<&&*T__<!	
 &*A. 
	 &NN,z))U[[0B0B%X^_ahiopq**eZ0!5P<<
%GNNuVWXzz5*-qfM"K#TFT\\-D-D$DEO!,,.I 00;M!33G<N)N:M)!,Q/44R8K!33G<N 00;nQPRPSUVY>WWM55iwPVW**8XwP';<(:;%aKL!&;<%aq&89OS|| P
L)+<n !)!T-?-?-A1*DEM)!#!/ eI"yAt7I7I7KTM_M_Ma1a'b&ddNe!/ ,$&!$5 UL%,q$:L:L:NPQ7Q*R)TT%U$5! )+=?PR`a   '+.+%	
 	
r'   )NNNNNN)r:   r;   r<   r   r   r  r  r"  r   CPMANT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r    r>   ru   r   r   r9   r?   r@   s   @r&   r  r  M  s    
| $*$ ++BC&+$ -1,0/3@D$(&*^
ELL)^
 $D>^
 'tn	^

 "%ell(;"<=^
 D>^
 d^^
 
uU\\"$;;	<^
 D^
r'   r  zy
    The CPMAnt Model with a language modeling head on top (linear layer with weights tied to the input embeddings).
    c                   f    e Zd ZdgZdef fdZ ee       ee	e
e      	 	 	 	 	 	 	 	 ddeej                     deeeej                  ej                  f         dee   dee   d	ee   d
eej                     dee   deej                     deee
f   fd              Zd Zd Zd Zd Zd Z xZS )CpmAntForCausalLMzlm_head.weightr   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  |j                  |j                  z  z   d      | _
        | j                          y r   )r   r   r  r   r   rL   r   r	  r
  r  lm_headr  r#   s     r&   r   zCpmAntForCausalLM.__init__  sd     !&) yy 1 1F4G4G&J^J^4^ ^ej
 	r'   r#  r  r[   r\   rZ   r   labelsr&  rX   r   c	                    ||n| j                   j                  }| j                  ||||||      }
|r|
j                  n|
d   }| j	                  |      }d}|At               } ||j                  d|j                  d            |j                  d            }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                  |
j                        S )u;
  
        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, seq_len)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`CPMAntTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                CPMAnt will process attention mask automatically, this parameter is a dummy parameter for
                text-generation pipeline.

        Example:

        Text Generation with CpmAntForCausalLM.
        ```python
        >>> from transformers import CPMAntTokenizer, CpmAntForCausalLM

        >>> texts = "今天天气不错，"
        >>> model = CpmAntForCausalLM.from_pretrained("openbmb/cpm-ant-10b")
        >>> tokenizer = CPMAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
        >>> input_ids = tokenizer(texts, return_tensors="pt")
        >>> outputs = model.generate(**input_ids)
        >>> output_texts = tokenizer.batch_decode(outputs)
        >>> print(output_texts)
        ['今天天气不错，阳光明媚，我和妈妈一起去超市买东西。\n在超市里，我看到了一个很好玩的玩具，它的名字叫“机器人”。它有一个圆圆的脑袋，两只圆圆的眼睛，还有一个圆圆的']
        ```
        Nr   r+   r   )losslogitsr[   r(   r-  )r   r.  r   r,  rD  r   r`   r/   r   r[   r(   r-  )r$   r  r[   r\   rZ   r   rE  r&  rX   r  model_outputr(   rH  rG  	loss_funcoutputs                   r&   r9   zCpmAntForCausalLM.forward  s    z &1%<k$++B]B]{{(*>QZ\g
 ;F66<XY?m,(*IV[[V[[_=v{{2ODYab!11F)-)9TGf$EvE%(88&44#..
 	
r'   c                 .    | j                   j                  S r   r   r  r  s    r&   r  z&CpmAntForCausalLM.get_input_embeddingsH  s    {{***r'   c                 &    || j                   _        y r   rM  )r$   r  s     r&   r  z&CpmAntForCausalLM.set_input_embeddingsK  s    &0#r'   c                     | j                   S r   rD  r  s    r&   get_output_embeddingsz'CpmAntForCausalLM.get_output_embeddingsN  s    ||r'   c                     || _         y r   rP  )r$   new_embeddingss     r&   set_output_embeddingsz'CpmAntForCausalLM.set_output_embeddingsQ  s	    %r'   c                     |D cg c]  }|t        |      n| }}|D ]  }|d   |   |d<   |d   |   |d<    |S c c}w )Nr   r   )r  )r$   r[   beam_idxeachkey_value_layers        r&   _reorder_cachez CpmAntForCausalLM._reorder_cacheT  sh    P_`)94:tC``. 	>O!0!3H!=OA!0!3H!=OA	> 	 as   <)NNNNNNNN)r:   r;   r<   _tied_weights_keysr   r   r   r>  r   r?  r   r@  r   r    r>   r   r   ru   r   r9   r  r  rQ  rT  rY  r?   r@   s   @r&   rB  rB    s4    ++|  ++BC&*$ -1MQ$(,0/3)-&*15O
ELL)O
 "$uU\\5<<-G'H"IJO
 D>	O

 $D>O
 'tnO
 &O
 d^O
 !.O
 
u,,	-O
 DO
b+1&r'   rB  )5r=   re   typingr   r   r   r   r    torch.nn.functionalr   
functionalr   torch.utils.checkpointtorch.nnr   activationsr
   
generationr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   configuration_cpmantr   
get_loggerr:   loggerr?  r@  Moduler   rB   rw   r   r   r   r   r   r   r   r   r   CPMANT_START_DOCSTRINGr>  r  rB  r   r'   r&   <module>rj     s     / /      % ! ) O - u u . 
		H	%+  bii 2e4bii e4P.>ryy .>b")) (		 4RYY 6,>RYY ,>^>TBII >TD Y RYY Y z299 \O \8	  0 ZJ
' J
	J
Z  	u- uur'   