
    sgB              &          d dl Z d dlZd dlmZmZmZ d dlZd dlmc m	Z
 ddlmZmZmZ  ej                  e      Z e       r6d dlmZmZmZ d dlmZmZ d e e j4                  e      j6                        v Zdej:                  d	eej:                  ej:                  ef   fd
Zdej:                  dej:                  dej:                  dej:                  def
dZ d Z!	 d.dej:                  dej:                  dej:                  deejD                     fdZ# ed      Z$ejJ                  jM                  dd      dk(  Z'	 	 	 	 	 	 	 	 	 	 	 	 d/dej:                  dej:                  dej:                  dej:                  dede(de)d eej:                     d!ee)   d"ee   d#e(d$ee)   d%e(d&eejT                     d'eejT                     d(ee   d)ee   deejD                     f$d*Z+ G d+ d,ed-      Z,y)0    N)OptionalTuple	TypedDict   )is_flash_attn_2_availableis_flash_attn_greater_or_equallogging)index_first_axis	pad_inputunpad_input)flash_attn_funcflash_attn_varlen_funcwindow_sizeattention_maskreturnc                 d   | j                  dt        j                        }t        j                  | j	                         d      j	                         }|j                         j                         }t        j                  t        j                  |dt        j                        d      }|||fS )aq  
    Retrieves indexing data required to repad unpadded (ragged) tensors.

    Arguments:
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        indices (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input sequence.
        cu_seqlens (`torch.Tensor`):
            The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        max_seqlen_in_batch (`int`):
            Maximum sequence length in batch.
    )dimdtypeF)as_tupler   )r   r   )
sumtorchint32nonzeroflattenmaxitemFpadcumsum)r   seqlens_in_batchindicesmax_seqlen_in_batch
cu_seqlenss        ^/var/www/html/venv/lib/python3.12/site-packages/transformers/modeling_flash_attention_utils.py_get_unpad_datar&   $   s      &))b)DmmN224uEMMOG*..0557u||$4!5;;OQWXJ     query_layer	key_layervalue_layerquery_lengthc                    t        |      \  }}}|j                  \  }}	}
}t        |j                  ||	z  |
|      |      }t        |j                  ||	z  |
|      |      }||	k(  r't        | j                  ||	z  d|      |      } |}|}|}nn|dk(  rLd}t	        j
                  |dz   t        j                  | j                        }|dd }| j                  d      } n|dd| df   }t        | |      \  } }}}| |||||f||ffS )a  
    Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.

    This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
    tensors for query, key, value tensors.

    Arguments:
        query_layer (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
        query_length (`int`):
            Target length.

    Return:
        query_layer (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    r   r   )r   deviceN)
r&   shaper
   reshaper   aranger   r-   squeezer   )r(   r)   r*   r   r+   	indices_kcu_seqlens_kmax_seqlen_in_batch_k
batch_size
kv_seq_lennum_key_value_headshead_dimcu_seqlens_qmax_seqlen_in_batch_q	indices_qs                  r%   _upad_inputr<   ?   sU   N 6E^5T2I|2<EOO9J
/ !2!2:
3JL_ai!jluvI"J35H(SU^K z!&{':'::
;RTVX`'aclm# 5			 !||N%++k6H6H
 !"%	!))!, (L=>(9:FQR]_mFnCY.C 		|$	 56 r'   c                    | j                  d| j                  d      | j                  d            } |j                         j                  d|j                  d      |j                  d            }|j                         j                  d|j                  d      |j                  d            }|j                         }t	        j
                  |j                  d      |j                  t        j                        }t	        j                  ||dk(     t	        j                  |j                         |j                  t        j                        f      }|j                         dz   }| |||||f||ffS )aK  
    This function returns necessary arguments to call `flash_attn_varlen_func`.
    All three query, key, value states will be flattened.
    Cummulative lengths of each examples in the batch will be extracted from position_ids.

    NOTE: ideally cummulative lengths should be prepared at the data collator stage

    Arguments:
        query (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        position_ids (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        query (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    r   r   )r-   r   r   )viewsize
contiguousr   r   r0   r-   r   cattensorr   )querykeyvalueposition_idsr;   cu_seq_lens
max_lengths          r%   prepare_fa2_from_position_idsrJ      s0   @ JJr5::b>5::b>:E
..


CHHRL#((2,
?C##B

2

2GE'')L\..q1,:M:MUZU`U`aI))la'(LL**,\5H5HPUP[P[\	
K !!#a'J3y;*DzS]F^__r'   rD   rE   rF   target_dtypec                     || ||fS |j                   }|t        j                  k(  rLt        j	                  d| d       | j                  |      } |j                  |      }|j                  |      }| ||fS )aG  
    PEFT usually casts the layer norms in float32 for training stability reasons
    therefore the input hidden states gets silently casted in float32. Hence, we need
    cast them back in float16 / bfloat16 just to be sure everything works as expected.
    This might slowdown training & inference so it is recommended to not cast the LayerNorms!

    Args:
        query (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        target_dtype (`torch.dtype`, *optional*):
            The dtype to convert the attention tensors to. Conversion can be ignored by
            not providing the target dtype.
    zThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)r   r   float32loggerwarning_onceto)rD   rE   rF   rK   input_dtypes        r%   fa_peft_integration_checkrS      s    . c5  ++Kemm#~Q 	
 &ff\"&#ur'   z2.4.1FLASH_ATTENTION_DETERMINISTIC01Fquery_states
key_statesvalue_states	is_causaldropoutrG   softmax_scalesliding_windowuse_top_left_masksoftcapdeterministiccu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kc                 ,   |
s|}n	|xr |dk7  }t         xr |	duxr |j                  d   |	kD  }|rd|	|	fini }t        r|t        }||d<   |||d<   t	        | |||      \  } }}|U| j                  d   }t        | ||||      \  } }}}}}|\  }}|\  }}t        | ||f|||||||d|}t        ||||      }|S |C|/|dk7  r;t        j                  |d	      dk\  j                         s| j                  d      }|| t        | |||      \  } }}}}}|\  }}|\  }}n| j                  d| j                  d
      | j                  d            } |j                  d|j                  d
      |j                  d            }|j                  d|j                  d
      |j                  d            }t        | ||f|||||||d|}|j                  |d|j                  d
      |j                  d            }|S t        | |||f||d|}|S )a  
    Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
    first unpad the input, then computes the attention scores and pad the final attention scores.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens.
        dropout (`float`):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        use_top_left_mask (`bool`, defaults to `False`):
            flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
        softcap (`float`, *optional*):
            Softcap for the attention logits, used e.g. in gemma2.
        deterministic (`bool`, *optional*):
            Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
    r   Nr   r`   r_   r   )r9   r3   max_seqlen_qmax_seqlen_k	dropout_pr\   causalr   )r   r>   )r\   ri   )_flash_supports_window_sizer.   	flash_241deterministic_grS   r<   r   r   r   diffallr@   rJ   r/   r?   r   )rW   rX   rY   r   r+   rZ   r[   rG   r\   r]   r^   r_   r`   ra   rb   rc   rd   rK   ri   use_sliding_windowsflash_kwargsr5   r;   rH   max_seq_lensr9   r3   r:   r4   attn_output_unpadattn_outputs                                  r%   _flash_attention_forwardrt      s   Z  0|q0 	$kd(BkzGWGWXYGZ]kGk  I\MNN#CDacL +M(5_%")Y .Gj,.*L*l
 !!''*
Wb*lNLX
Tj,	; &1"l7C442
 &%..'
 
   19j,WX O 
	! \Q%6

<]_@`de@e?j?j?l!&&q)
 M$9-lJVbc YL*lI{L ,7(M=)5&L, (//L4E4Eb4I<K\K\]_K`aL#++B
0CZ__UWEXYJ'//L4E4Eb4I<K\K\]_K`aL,
 '&%%'
 
 "&&z2{7G7G7K[M]M]^`Mab 	 &*lG
KXag
kw
 r'   c                   z    e Zd ZU dZeej                     ed<   eej                     ed<   ee   ed<   ee   ed<   y)FlashAttentionKwargsa  
    Keyword arguments for Flash Attention with Compile.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`, *optional*)
            Gets cumlative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`, *optional*)
            Gets cumlative sequence length for key state.
        max_length_q (`int`, *optional*):
            Maximum sequence length for query state.
        max_length_k (`int`, *optional*):
            Maximum sequence length for key state.
    ra   rb   rc   rd   N)	__name__
__module____qualname____doc__r   r   
LongTensor__annotations__int r'   r%   rv   rv   r  s?     E,,--E,,--3-3-r'   rv   )total)N)g        NNNFNNNNNNN)-inspectostypingr   r   r   r   torch.nn.functionalnn
functionalr   utilsr   r   r	   
get_loggerrw   rO   flash_attn.bert_paddingr
   r   r   
flash_attnr   r   list	signature
parametersrj   Tensorr}   r&   r<   rJ   r   rS   rk   environgetrl   boolfloatr{   rt   rv   r~   r'   r%   <module>r      s     	 - -    U U 
		H	% PPB"/48I8I8I/8Z8e8e3f"fELL U5<<WZ;Z5[ 6FF||F F LL	F
 FR/`l +/	&<<&	& <<& 5;;'	&R +73	**..!@#F#M +/%)$(##0404"&"&*.%H,,HH ,,H LL	H
 H H H 5<<(H E?H SMH H e_H H E,,-H E,,-H  3-!H" 3-#H$ 5;;'%HV 9E  r'   