
"""PyTorch Starcoder2 model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...modeling_outputs import BaseModelOutputWithPast
from ...utils import (
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
)
from ..llama.modeling_llama import (
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaRotaryEmbedding,
    apply_rotary_pos_emb,
    repeat_kv,
)
from ..qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2ForCausalLM,
    Qwen2Model,
    Qwen2PreTrainedModel,
)
from .configuration_starcoder2 import Starcoder2Config


if is_flash_attn_2_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "bigcode/starcoder2-7b"
_CONFIG_FOR_DOC = "Starcoder2Config"


class Starcoder2RotaryEmbedding(LlamaRotaryEmbedding):
    pass


class Starcoder2MLP(nn.Module):
    def __init__(self, config: Starcoder2Config):
        super().__init__()
        embed_dim = config.hidden_size
        self.c_fc = nn.Linear(embed_dim, config.intermediate_size, bias=config.use_bias)
        self.c_proj = nn.Linear(config.intermediate_size, embed_dim, bias=config.use_bias)
        self.act = ACT2FN[config.hidden_act]
        self.residual_dropout = config.residual_dropout

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        hidden_states = self.c_fc(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.c_proj(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.residual_dropout, training=self.training)
        return hidden_states


class Starcoder2Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.rope_theta = config.rope_theta
        self.use_bias = config.use_bias
        self.is_causal = True
        self.attention_dropout = config.attention_dropout
        self.residual_dropout = config.residual_dropout

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=self.use_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=self.use_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=self.use_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=self.use_bias)
        self.rotary_emb = Starcoder2RotaryEmbedding(config=self.config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            attn_weights = attn_weights + causal_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        attn_output = nn.functional.dropout(attn_output, p=self.residual_dropout, training=self.training)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class Starcoder2FlashAttention2(Starcoder2Attention):
    """
    Starcoder2 flash attention module. This module inherits from `Starcoder2Attention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is
        # bottom-right alignment, which became the default for flash_attn>=2.1.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # repeat k/v heads if n_kv_heads < n_heads
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        dropout_rate = 0.0 if not self.training else self.attention_dropout

        # In PEFT, the layer norms are usually cast to float32 for training stability, which silently
        # upcasts the hidden states as well. Cast them back to the expected compute dtype.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Reshape to the expected shape for Flash Attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_output = _flash_attention_forward(
            query_states,
            key_states,
            value_states,
            attention_mask,
            q_len,
            position_ids=position_ids,
            dropout=dropout_rate,
            sliding_window=getattr(self.config, "sliding_window", None),
            is_causal=self.is_causal,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)
        attn_output = nn.functional.dropout(attn_output, p=self.residual_dropout, training=self.training)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value


class Starcoder2SdpaAttention(Starcoder2Attention):
    """
    Starcoder2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Starcoder2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        if output_attentions:
            logger.warning_once(
                "Starcoder2Model is using Starcoder2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not"
                " support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual"
                " implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the"
                ' argument `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)

        if position_embeddings is None:
            logger.warning_once(
                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
                "removed and `position_embeddings` will be mandatory."
            )
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        causal_mask = attention_mask
        if attention_mask is not None:  # no matter the length, we just slice it
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

        # SDPA with a custom attn_mask can misbehave on CUDA with non-contiguous inputs, so force contiguity here.
        if query_states.device.type == "cuda" and attention_mask is not None:
            query_states = query_states.contiguous()
            key_states = key_states.contiguous()
            value_states = value_states.contiguous()

        # The explicit is_causal flag (rather than an inline conditional) keeps torch.compile's dynamic shapes happy.
        is_causal = True if causal_mask is None and q_len > 1 else False

        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=causal_mask,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)
        attn_output = nn.functional.dropout(attn_output, p=self.residual_dropout, training=self.training)

        return attn_output, None, past_key_value


STARCODER2_ATTENTION_CLASSES = {
    "eager": Starcoder2Attention,
    "flash_attention_2": Starcoder2FlashAttention2,
    "sdpa": Starcoder2SdpaAttention,
}


class Starcoder2DecoderLayer(Qwen2DecoderLayer, nn.Module):
    def __init__(self, config: Starcoder2Config, layer_idx: int):
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size
        self.self_attn = STARCODER2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
        self.mlp = Starcoder2MLP(config)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)


class Starcoder2PreTrainedModel(Qwen2PreTrainedModel):
    pass


STARCODER2_INPUTS_DOCSTRING = None  # will be automatically redefined
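
# How a layer picks its attention backend (a sketch of the dispatch above, not additional API):
# Starcoder2DecoderLayer reads config._attn_implementation, which PreTrainedModel resolves from the
# `attn_implementation` argument of `from_pretrained` (e.g. "eager", "sdpa" or "flash_attention_2"),
# and indexes STARCODER2_ATTENTION_CLASSES with it; all three modules share the same weight layout.
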


class Starcoder2Model(Qwen2Model):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Starcoder2DecoderLayer`]

    Args:
        config: Starcoder2Config
    """

    def __init__(self, config: Starcoder2Config):
        super().__init__(config)
        self.embedding_dropout = config.embedding_dropout
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)

    @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        # kept for BC (non `Cache` `past_key_values` inputs)
        return_legacy_cache = False
        if use_cache and not isinstance(past_key_values, Cache):
            return_legacy_cache = True
            if past_key_values is None:
                past_key_values = DynamicCache()
            else:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                logger.warning_once(
                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
                )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds
        hidden_states = nn.functional.dropout(hidden_states, p=self.embedding_dropout, training=self.training)

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class Starcoder2ForCausalLM(Qwen2ForCausalLM):
    pass


class Starcoder2ForSequenceClassification(LlamaForSequenceClassification):
    pass


class Starcoder2ForTokenClassification(LlamaForTokenClassification):
    pass
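

# Minimal usage sketch (illustrative, not part of this module — run it from a separate script so the
# relative imports above are resolved through the installed `transformers` package):
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder2-7b")
#     model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder2-7b")
#     inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
#     output_ids = model.generate(**inputs, max_new_tokens=32)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))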