
"""PyTorch BioGPT model."""

import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import (
    _prepare_4d_causal_attention_mask,
    _prepare_4d_causal_attention_mask_for_sdpa,
)
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    SequenceClassifierOutputWithPast,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_biogpt import BioGptConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "microsoft/biogpt"
_CONFIG_FOR_DOC = "BioGptConfig"


class BioGptLearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # BioGPT offsets the embedding ids by 2 to keep the first indices free for special tokens,
        # so the embedding table is enlarged accordingly.
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        attention_mask = attention_mask.long()

        # create positions depending on attention_mask
        positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

        # cut positions if `past_key_values_length` is > 0
        positions = positions[:, past_key_values_length:]

        return super().forward(positions + self.offset)
class BioGptScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embedding's forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale


class BioGptAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[BioGptConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        # `past_key_value[0].shape[2] == key_value_states.shape[1]` is checking that the
        # `sequence_length` of the `past_key_value` is the same as the provided
        # `key_value_states` to support prefix tuning
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient. In order to do so, attn_weights have to be reshaped twice and
            # have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value
class BioGptSdpaAttention(BioGptAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions or layer_head_mask is not None:
            logger.warning_once(
                "BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention`"
                " does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the"
                " manual attention implementation, but specifying the manual implementation will be required from"
                " Transformers version v5.0.0 onwards. This warning can be removed using the argument"
                ' `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                key_value_states=key_value_states,
                past_key_value=past_key_value,
                attention_mask=attention_mask,
                layer_head_mask=layer_head_mask,
                output_attentions=output_attentions,
            )

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states)
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        query_states = self._shape(query_states, tgt_len, bsz)

        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that
        # does not create a causal mask in case tgt_len == 1.
        is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, None, past_key_value


BIOGPT_ATTENTION_CLASSES = {
    "eager": BioGptAttention,
    "sdpa": BioGptSdpaAttention,
}
class BioGptDecoderLayer(nn.Module):
    def __init__(self, config: BioGptConfig):
        super().__init__()
        self.embed_dim = config.hidden_size

        self.self_attn = BIOGPT_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            is_decoder=True,
            is_causal=True,
        )
        self.dropout = config.hidden_dropout_prob
        self.activation_fn = ACT2FN[config.hidden_act]
        self.activation_dropout = config.activation_dropout

        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
        """
        residual = hidden_states

        hidden_states = self.self_attn_layer_norm(hidden_states)

        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (present_key_value,)

        return outputs
class BioGptPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BioGptConfig
    base_model_prefix = "biogpt"
    supports_gradient_checkpointing = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


BIOGPT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

BIOGPT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.",
    BIOGPT_START_DOCSTRING,
)
class BioGptModel(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.config = config
        self.layerdrop = config.layerdrop
        self.dropout = config.hidden_dropout_prob
        self.embed_dim = config.hidden_size
        self.padding_idx = config.pad_token_id
        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = BioGptScaledWordEmbedding(
            config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale
        )
        self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)

        self.layers = nn.ModuleList([BioGptDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.layer_norm = nn.LayerNorm(self.embed_dim)

        self.gradient_checkpointing = False
        self._use_sdpa = config._attn_implementation == "sdpa"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input)

        if attention_mask is None:
            attention_mask = torch.ones(
                (inputs_embeds.shape[0], inputs_embeds.shape[1] + past_key_values_length),
                dtype=torch.bool,
                device=inputs_embeds.device,
            )
        elif attention_mask.shape[1] != past_key_values_length + input_shape[1]:
            raise ValueError(
                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
                f"{past_key_values_length + input_shape[1]} (sum of the lengths of current and past inputs)"
            )

        # embed positions
        positions = self.embed_positions(attention_mask, past_key_values_length)

        if self._use_sdpa and not output_attentions and head_mask is None:
            # output_attentions=True and head_mask can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask, input_shape, inputs_embeds, past_key_values_length
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask, input_shape, inputs_embeds, past_key_values_length
            )

        hidden_states = inputs_embeds + positions

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        hidden_states = self.layer_norm(hidden_states)

        next_cache = next_decoder_cache if use_cache else None

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )


@add_start_docstrings(
    "BioGPT Model with a `language modeling` head on top for CLM fine-tuning.", BIOGPT_START_DOCSTRING
)
class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output_projection.weight"]

    def __init__(self, config):
        super().__init__(config)

        self.biogpt = BioGptModel(config)
        self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output_projection

    def set_output_embeddings(self, new_embeddings):
        self.output_projection = new_embeddings

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.output_projection(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
@add_start_docstrings(
    """
    BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BIOGPT_START_DOCSTRING,
)
class BioGptForTokenClassification(BioGptPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.biogpt = BioGptModel(config)
        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
            classifier_dropout = config.classifier_dropout
        else:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
@add_start_docstrings(
    """
    The BioGpt Model transformer with a sequence classification head on top (linear layer).

    [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it is required to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    BIOGPT_START_DOCSTRING,
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
    def __init__(self, config: BioGptConfig):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.biogpt = BioGptModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutputWithPast,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.biogpt(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
            else:
                sequence_lengths = -1
                logger.warning_once(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.biogpt.embed_tokens

    def set_input_embeddings(self, value):
        self.biogpt.embed_tokens = value