
    sg?                        d Z ddlZddlmZmZmZ ddlZddlZddlmZ ddl	m
Z
mZmZmZ ddl	mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#  e!jH                  e%      Z&dZ'dZ(d,dZ) G d dejT                        Z+ G d dejT                        Z, G d dejT                        Z- G d de      Z.dZ/dZ0 ede/       G d de.             Z1 ed e/       G d! d"e.e             Z2 ed#e/       G d$ d%e.             Z3 ed&e/       G d' d(e.             Z4 ed)e/       G d* d+e.             Z5y)-zPyTorch MPT model.    N)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)GenerationMixin)!_prepare_4d_causal_attention_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)logging   )	MptConfigzmosaicml/mpt-7br   c                 R   t        j                  d|z
  dt         j                  |      j                  ddd|      }dt	        j
                  t	        j                  |             z  }t        j                  d|dz   t         j                  |      j                         }|||z  z  }dt        j                  d|      z  }|j                  d|dd      }|| k7  r9t        j                  |ddddddf   |ddddddf   gd      ddd| df   }||z  }|j                  d      S )	a  
    Link to paper: https://arxiv.org/abs/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
    the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
    https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
    r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopess           W/var/www/html/venv/lib/python3.12/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensorr6   /   s    LL_,au{{6RWWXY[\^_apqE		$))I*> ??<</!35;;vV\\^D>$889D599Q%%F[[0!Q7Fy(vaAsl3VAssCK5HIqQRSU_V_U_adRdeFNE==    c            
            e Zd ZdZdef fdZ	 	 d	dej                  dej                  dee	ej                        deej                     fdZ
 xZS )
MptAttentionzyMulti-head self attention.
    Using torch or triton attention implemetation enables user to also use additive bias.
    configc                    t         |           |j                  | _        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  j                  | _        | j                  4dt        j                  | j                  | j                  z        z  | _        |j                  j                  | _        |j                  j                  | _        t        j                  | j                  d| j                  z  d      | _        t        j                  | j                  | j                  d      | _        y )Nr   r   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler&   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projselfr:   	__class__s     r5   r?   zMptAttention.__init__K   s    !--~~$00((DLL8#//==%!"TYYt/?/?$,,/N%O!OD$00;;**33IId..D4D4D0D5Q			$"2"2D4D4D5Qr7   hidden_statesposition_biaspast_key_valueattention_maskc                    |j                   d d \  }}| j                  |      }| j                  r(|j                  | j                   | j                        }|j	                  dd      \  }}	}
|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	|
j                  ||| j                  | j                        j                  dd      }
|Kt        |      dk7  r8t        j                  |d   |	gd      }	t        j                  |d   |
gd      }
|	|
f}n|	|
f}t        j                  ||	j                  dd            | j                  z  }||n||d   j                   d   z   }|t        |j                         dk7  r!t        d	t        |j                                |	j                   d   }t        d|j!                  d      |z
        }t        d|j!                  d      |z
        }|d d |d |d f   }||z   }|9|j#                  |t        j$                  |j&                        j(                        }t*        j,                  j/                  |j1                         d      j3                  |
j&                        }t*        j,                  j5                  || j6                  | j8                  
      }t        j                  ||
      }|j;                  dddd      j=                         j?                  ||d      }| jA                  |      }|||fS )Nr   )minmaxr   r    r   r   z6Expecting position_bias shape to be 3 dimensions, got ptraining)!shaperL   rJ   clampchunkreshaperA   rD   	transposelenr"   catmatmulrF   
ValueErrorrW   sizemasked_fillfinfor   rV   r   r   softmaxr*   todropoutrI   r\   permute
contiguousr%   rM   )rO   rQ   rR   rS   rT   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statesattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                      r5   forwardzMptAttention.forwardZ   s,    "/!4!4Ra!8
JIIm,	==!T]]NNI1:1J.j,#++J
DLLRVR_R_`jjklnop''
Jdmm\ffghjkl
#++J
DLLRVR_R_`jjklnop%>"a'"YYq(9:'FAN
$yy.*;\)JPQR(,7N(,7N <<j6J6J2r6RSVZVhVhh%3%;zn]^N_NeNefgNhAh$=&&'1, #YZ]^k^q^qZrYs!tuu#))"-J(+A}/A/A!/D|/S(T%&)!]-?-?-BZ-O&P#)!-F-GI`Ia*abM/-?%/;;NEKKXdXjXjLkLoLop }},,-=-C-C-E2,NQQR^RdRde}},,\T=P=P[_[h[h,ilLA'//1a;FFHMMjZdfhimmN3L.88r7   )NN)__name__
__module____qualname____doc__r   r?   r"   Tensorr   r   r|   __classcell__rP   s   @r5   r9   r9   F   sh    Ry R& 9=1559||59 ||59 !u||!45	59
 !.59r7   r9   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )MptMLPr:   c                 &   t         |           |j                  }t        j                  |d|z  d      | _        t        j                  d      | _        t        j                  d|z  |d      | _        |j                  j                  | _        y )N   Fr<   none)approximate)r>   r?   r@   r   rK   up_projGELUact	down_projrE   rH   hidden_dropoutrO   r:   r@   rP   s      r5   r?   zMptMLP.__init__   sm    ((yya+oEJ77v.1{?KeL$00;;r7   rQ   residualreturnc                     | j                  | j                  |            }| j                  |      }t        j                  || j
                  | j                        }||z   }|S )NrZ   )r   r   r   Frk   r   r\   )rO   rQ   r   intermediate_outputoutputs        r5   r|   zMptMLP.forward   sW    m!<="nn];.$2E2EPTP]P]^("r7   )	r}   r~   r   r   r?   r"   r   r|   r   r   s   @r5   r   r      s5    <y <U\\ U\\ ell r7   r   c                        e Zd Zdef fdZ	 	 	 d
dej                  dej                  dej                  deeej                  ej                  f      de	de	fd	Z
 xZS )MptBlockr:   c                    t         |           |j                  }t        ||j                        | _        d | j
                  _        |j                  | _        t        |      | _
        t        ||j                        | _        d | j                  _        t        |      | _        |j                  j                  | _        t#        j$                  | j                         | _        y )Neps)r>   r?   r@   r	   layer_norm_epsilonnorm_1r=   rA   r.   r9   attnnorm_2r   ffnrE   rH   dropout_rater   Dropoutresid_attn_dropoutr   s      r5   r?   zMptBlock.__init__   s    ((1J1JK (	1J1JK&>"..99"$**T->->"?r7   rQ   rR   rT   
layer_past	use_cacheoutput_attentionsc                     | j                  |      }|}| j                  ||||      \  }	}
}| j                  |	      |z   }| j                  |      }|}| j	                  ||      }|f}|r||fz  }|r||
fz  }|S )N)rR   rT   rS   )r   r   r   r   r   )rO   rQ   rR   rT   r   r   r   layernorm_outputr   attn_outputsry   rS   r   outputss                 r5   r|   zMptBlock.forward   s      ;;}5  6:YY')%	 6? 6
2lN //=H;;}5 ! *H5)((G&Gr7   )NFF)r}   r~   r   r   r?   r"   r   r   r   boolr|   r   r   s   @r5   r   r      s    @y @2 CG"'(||( ||( 	(
 U5<<#=>?( (  (r7   r   c                        e Zd ZeZdZdZdgZdgZ fdZ	de
j                  fdZedeeej                   ej                   f      d	eeej                   ej                   f      fd
       Z xZS )MptPreTrainedModeltransformerTr   z
lm_head.*.c                 $    t        |   |i | y N)r>   r?   )rO   inputskwargsrP   s      r5   r?   zMptPreTrainedModel.__init__   s    &+F+r7   modulec                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t              rV|j                  $|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weights.g        )meanstdNr   )
isinstancer   rK   weightdatanormal_r:   initializer_ranger=   zero_	Embeddingpadding_idxr	   fill_)rO   r   s     r5   _init_weightsz MptPreTrainedModel._init_weights   s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .	*{{&  &&(MM$$S) +r7   rS   r   c                 l    | d   d   j                   \  }}||z  t        fd| D              S )zw
        Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
        r   c              3   v   K   | ]0  }|d    j                        |d   j                        f 2 ywr   r   N)r`   ).0r   batch_size_times_num_headsrD   ro   s     r5   	<genexpr>z;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>  sK      

  1%%&@(JW1%%&@*hW
s   69)r]   tuple)rS   rn   r.   r   rD   ro   s      @@@r5   _convert_to_mpt_cachez(MptPreTrainedModel._convert_to_mpt_cache  sM     7EQ6G6J6P6P3
Ix%/)%;"  

 -
 
 	
r7   )r}   r~   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingr?   r   Moduler   staticmethodr   r"   r   r   r   r   s   @r5   r   r      s    L%&*##'4o#,*BII *" 
eELL%,,$>?@
	uU\\5<</0	1
 
r7   r   a*  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`):
            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
            their past given to this model should not be passed as `input_ids` as they have already been computed.

            Each element of `past_key_values` is a tuple (past_key, past_value):
            - past_key: [batch_size * num_heads, head_dim, kv_length]
            - past_value: [batch_size * num_heads, kv_length, head_dim]
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.

            If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
            `past_key_values`).
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
z]The bare Mpt Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zdef fdZd ZddZdej                  fdZ	 e
e       eeee      	 	 	 	 	 	 	 	 ddeej"                     d	eeeej                  ej                  f   d
f      deej                     deej"                     dee   dee   dee   dee   deeej                  d
f   ef   fd              Z xZS )MptModelr:   c                    t         |   |       |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        | j                  |j                        | _        d | j                   _        d| _        | j'                          y c c}w )Nr   F)r>   r?   r@   rA   r.   r   r   
vocab_sizewte
ModuleListrangen_layersr   blocksr	   r   norm_fr=   gradient_checkpointing	post_init)rO   r:   _rP   s      r5   r?   zMptModel.__init__\  s     !-- << 1 143C3CD mmuV__?U$V!Xf%5$VW   0 0f6O6OP&+# 	 %Ws   C5c                     | j                   S r   r   rO   s    r5   get_input_embeddingszMptModel.get_input_embeddingsr  s    xxr7   c                     t        ||||      S r   )r6   )rO   r.   r/   r0   r   s        r5   r6   zMptModel.build_mpt_alibi_tensoru  s    %i.RXYYr7   new_embeddingsc                     || _         y r   r   rO   r   s     r5   set_input_embeddingszMptModel.set_input_embeddingsx  s	    !r7   
checkpointoutput_typer   	input_idspast_key_values.rT   inputs_embedsr   r   output_hidden_statesreturn_dictr   c	           
      h   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t        d      ||j                  \  }	}
n||j                  \  }	}
}nt        d      |"t        d gt        | j                        z        }|| j                  |      }|}|rdnd }|rdnd }|rdnd }| j                  r%| j                  r|rt        j                  d       d}|
}d}|d   |d   d   j                  d   }||z   }|$t        j                   |	|f|j"                        }n|j%                  |j"                        }| j'                  | j(                  | j                   j*                  |j"                        }t-        ||	|
f||      }|j/                         }t1        | j                  |      D ]w  \  }}|r||fz   }| j                  r.| j                  r"| j3                  |j4                  ||||||      }n |||||||	      }|d   }|d
u r	||d   fz   }|sk|||rdnd   fz   }y | j7                  |      }|r||fz   }|st        d ||||fD              S t9        ||||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embeds zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   )r   rT   r   r   rR   Tr   c              3   &   K   | ]	  }||  y wr   r   )r   vs     r5   r   z#MptModel.forward.<locals>.<genexpr>  s     wqijivws   )last_hidden_stater   rQ   
attentions)r:   r   r   r   use_return_dictre   r]   r   rb   r   r   r   r\   loggerwarning_oncer"   onesr   rj   r6   r.   rB   r   r   zip_gradient_checkpointing_func__call__r   r   )rO   r   r   rT   r   r   r   r   r   rn   ro   r   rQ   presentsall_self_attentionsall_hidden_statesseq_length_with_pastpast_key_values_lengthr1   causal_maskblockr   r   s                          r5   r|   zMptModel.forward{  s%   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"%.__"J
&(5(;(;%J
ATUU"#TFS-=$=>O  HHY/M%"2$5b4"6BD&&4==##p "	  *!"1)%4Q%7%:%@%@%C"#7:P#P !"ZZ5I(JS`SgSghN+..}/C/CDN++DNNDKK<S<S\i\p\p+q7Z4mE[
 "&&(!$T[[/!B 	^E:#$58H$H!**t}};;NN!%  !)#.'&7"' $AJMD #wqzm3 &9W)QYZ=[<]&]#;	^@ M2 1]4D Dw]H>OQd$ewww8+$+*	
 	
r7      NNNNNNNNN)r}   r~   r   r   r?   r   r6   r"   r   r   r   MPT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   
LongTensorr   r   r   r|   r   r   s   @r5   r   r   W  sA   
y ,Z"5<< " ++?@&=$ 15SW1548$(,0/3&*m
E,,-m
 "%ellELL.H(I3(N"OPm
 !.	m

   0 01m
 D>m
 $D>m
 'tnm
 d^m
 
uU\\3&')RR	Sm
 Am
r7   r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   L    e Zd ZdgZdef fdZd Zdej                  fdZ	 e
e       eeee      	 	 	 	 	 	 	 	 	 ddeej"                     d	eeeej                  ej                  f   d
f      deej                     deej                     deej                     dee   dee   dee   dee   deeej                     ef   fd              Zdeeej                  ej                  f   d
f   dej"                  deeej                  ej                  f   d
f   fdZ xZS )MptForCausalLMzlm_head.weightr:   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr<   )
r>   r?   r   r   r   rK   r@   r   lm_headr   rN   s     r5   r?   zMptForCausalLM.__init__  sI     #F+yy!3!3V5F5FUS 	r7   c                     | j                   S r   r  r   s    r5   get_output_embeddingsz$MptForCausalLM.get_output_embeddings  s    ||r7   r   c                     || _         y r   r  r   s     r5   set_output_embeddingsz$MptForCausalLM.set_output_embeddings  s	    %r7   r   r   r   .rT   r   labelsr   r   r   r   r   c
           
      J   |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}||j	                  |j
                        }|dddddf   j                         }|dddf   j                         }|j                  \  }}}t               } ||j                  ||z  |      |j                  ||z              }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        Nr   rT   r   r   r   r   r   r   .rX   r   losslogitsr   rQ   r   )r:   r   r   r  rj   r   rm   r]   r   r%   r   r   rQ   r   )rO   r   r   rT   r   r  r   r   r   r   transformer_outputsrQ   	lm_logitsr  shift_logitsshift_labelsrn   ro   r   loss_fctr   s                        r5   r|   zMptForCausalLM.forward	  sa   0 &1%<k$++B]B]"..+)'/!5# / 	
 ,A.LL/	YYy//0F$S#2#q[1<<>L!#qr'?557L1=1C1C.J
J')H!!*z"9:FHYHYZdgqZqHrD \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r7   pastbeam_idxc           	          |D ci c]/  }|D ](  }|j                   |j                  |j                         * 1 c}}t        fd|D              }|S c c}}w )aL  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        c              3      K   | ]N  }|d    j                  d |d    j                           |d   j                  d |d    j                           f P ywr   )index_selectr   )r   r   device_to_beam_idxs     r5   r   z0MptForCausalLM._reorder_cache.<locals>.<genexpr>Y  se      

  1**1.@AAUAU.VW1**1.@AAUAU.VW
s   AA)r   rj   r   )rO   r  r  r   
past_statereordered_pastr#  s         @r5   _reorder_cachezMptForCausalLM._reorder_cacheK  sr     QU
BLgq
YcJx{{:+<+<==

  

 #
 
 
s   4A	NNNNNNNNN)r}   r~   r   _tied_weights_keysr   r?   r  r"   r   r  r   r  r   r  r   r  r   r  r   r   r   r|   r&  r   r   s   @r5   r
  r
    s    ++y &ELL & ++?@&5$ 15SW1504)-$(,0/3&*:
E,,-:
 "%ellELL.H(I3(N"OP:
 !.	:

  -:
 &:
 D>:
 $D>:
 'tn:
 d^:
 
uU\\"$EE	F:
 A:
x%ell :;S@AMRM]M]	uU\\5<</0#5	6r7   r
  a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       e Zd Zdef fdZ ee       eee	e
      	 	 	 	 	 	 	 	 	 ddeej                     deeeej                  ej                  f   df      deej                     deej                     d	eej                     d
ee   dee   dee   dee   deeej                     e	f   fd              Z xZS )MptForSequenceClassificationr:   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y r  )
r>   r?   
num_labelsr   r   r   rK   r@   scorer   rN   s     r5   r?   z%MptForSequenceClassification.__init__s  sV      ++#F+YYv1163D3D5Q
 	r7   r   r   r   .rT   r   r  r   r   r   r   r   c
           
      <   |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n|xt        j                  || j                   j
                        j                         j                  d      dz
  }||j                  d   z  }|j                  |j                        }n.d}t        j                  | j                  j                    d       |t        j"                  ||j                        |f   }d}|^| j                   j$                  | j&                  dk(  rd	| j                   _        nl| j&                  dkD  rL|j(                  t        j*                  k(  s|j(                  t        j                  k(  rd
| j                   _        nd| j                   _        | j                   j$                  d	k(  rIt-               }| j&                  dk(  r& ||j/                         |j/                               }nc |||      }nY| j                   j$                  d
k(  rt1               } |||      }n,| j                   j$                  dk(  rt3               } |||      }|	s|f|
dd z   }||f|z   S |S t5        |||
j6                  |
j8                  |
j:                        S )  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.rX   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr  )r:   r   r   r-  r]   pad_token_idre   r"   eqintargmaxrj   r   r   r   rP   r}   r#   problem_typer,  r   longr
   r-   r   r   r   r   rQ   r   )rO   r   r   rT   r   r  r   r   r   r   r  rQ   r  rn   sequence_lengthspooled_logitsr  r  r   s                      r5   r|   z$MptForSequenceClassification.forward|  s   0 &1%<k$++B]B]"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!$#(88It{{7O7O#P#T#T#V#]#]^`#ade#e #3ioob6I#I #3#6#6v}}#E #% ##~~../ 0^ ^
 u||Jv}}MO__`{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r7   r'  )r}   r~   r   r   r?   r   r  r   r  r   r  r   r"   r  r   r   r   r   r|   r   r   s   @r5   r*  r*  c  s6    y  ++?@&4$ 15SW1504)-$(,0/3&*Z
E,,-Z
 "%ellELL.H(I3(N"OPZ
 !.	Z

  -Z
 &Z
 D>Z
 $D>Z
 'tnZ
 d^Z
 
uU\\"$DD	EZ
 AZ
r7   r*  z
    MPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                       e Zd Zdef fdZ ee       eee	e
      	 	 	 	 	 	 	 	 	 ddeej                     deeeej                  ej                  f   df      deej                     deej                     d	eej                     d
ee   dee   dee   dee   deeej                     e	f   fd              Z xZS )MptForTokenClassificationr:   c                    t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n't        |d      r|j                  |j                  }nd}t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropoutr   g?)r>   r?   r,  r   r   hasattrr>  r   r   r   rk   rK   r@   
classifierr   )rO   r:   r>  rP   s      r5   r?   z"MptForTokenClassification.__init__  s      ++#F+6/0V5N5N5Z!'!:!:V-.63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	r7   r   r   r   .rT   r   r  r   r   r   r   r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|l|j                  |j                        }|j                  \  }}t               } ||j                  ||z  | j                        |j                  ||z              }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )r/  Nr  r   r   )r  r  rQ   r   )r:   r   r   rk   r@  rj   r   r]   r   r%   r,  r   rQ   r   )rO   r   r   rT   r   r  r   r   r   r   deprecated_argumentsr  rQ   r  r  rn   ro   r  r   s                      r5   r|   z!MptForTokenClassification.forward  s+   2 &1%<k$++B]B]"..+)'/!5# / 	
 ,A.]3/YYv}}-F%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r7   r'  )r}   r~   r   r   r?   r   r  r   r  r   r  r   r"   r  r   r   r   r   r|   r   r   s   @r5   r<  r<    s*   y " ++?@&)$ 15SW1504)-$(,0/3&*7
E,,-7
 "%ellELL.H(I3(N"OP7
 !.	7

  -7
 &7
 D>7
 $D>7
 'tn7
 d^7
 
uU\\"$99	:7
 A7
r7   r<  z
    The MPT Model transformer with a span classification head on top for extractive question-answering tasks like SQuAD
    (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                   .    e Zd Z fdZ eej                  d            	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     dee   d	ee   d
ee   deeef   fd       Z xZS )MptForQuestionAnsweringc                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y )Nr   )	r>   r?   r   r   r   rK   r@   
qa_outputsr   rN   s     r5   r?   z MptForQuestionAnswering.__init__@  sA     #F+))F$6$6: 	r7   zbatch_size, sequence_lengthr   rT   r   start_positionsend_positionsr   r   r   r   c	                 "   ||n| j                   j                  }| j                  ||||||      }	|	d   }
| j                  |
      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|	dd z   }||f|z   S |S t        ||||	j                  |	j                  	      S )
a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        N)rT   r   r   r   r   r   r   rX   r    )ignore_indexr   )r  start_logits
end_logitsrQ   r   )r:   r   r   rF  splitr-   rm   rb   rf   r^   r   r   rQ   r   )rO   r   rT   r   rG  rH  r   r   r   r   sequence_outputr  rK  rL  
total_lossignored_indexr  
start_lossend_lossr   s                       r5   r|   zMptForQuestionAnswering.forwardH  s   , &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r7   r  )r}   r~   r   r?   r   r  formatr   r"   r  FloatTensorr   r   r   r   r|   r   r   s   @r5   rD  rD  8  s     ++?+F+FGd+ef 156:596:48,0/3&*B
E,,-B
 !!2!23B
   1 12	B

 "%"2"23B
   0 01B
 $D>B
 'tnB
 d^B
 
u22	3B
 gB
r7   rD  r  )6r   r&   typingr   r   r   r"   torch.utils.checkpointr   torch.nnr   r   r	   r
   r   r   
file_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   configuration_mptr   
get_loggerr}   r   r  r  r6   r   r9   r   r   r   MPT_START_DOCSTRINGr  r   r
  r*  r<  rD  r   r7   r5   <module>ra     s     ) )    L L $ q q ) I  .  ( 
		H	%' .I9299 I9XRYY *=ryy =@,
 ,
^ / d cS
! S
	S
l  h' hhV  j
#5 j
j
Z  O
 2 O
O
d  L
0 L
L
r7   