
    sgnF                        d Z ddlZddlZddlmZmZ ddlmZ d Zedej                  dej                  d	ej                  d
ej                  fd       Z
edej                  dej                  fd       Zedej                  dej                  d	ej                  dej                  dej                  dej                  fd       Zedej                  dej                  d	ej                  dej                  dej                  dej                  fd       Z G d dej                  j                        Zej"                  Zy)ao  
Fused Attention
===============
This is a Triton implementation of the Flash Attention algorithm
(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf)

Sequence Parallel implementation inspired by HazyResearch
(see https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py)
    N   )cdivjit)languagec                      t         j                  j                  j                  j	                         j
                  dk(  S )Nhip)tritonruntimedriveractiveget_current_targetbackend     M/var/www/html/venv/lib/python3.12/site-packages/triton/ops/flash_attention.pyis_hipr      s,    >>  ''::<DDMMr   BLOCK_MBLOCK_DMODELBLOCK_N	IS_CAUSALc           	      @   t        j                  d      }t        j                  d      }||z  } | |z  }!t        j                  |||f||fd|!f||fd      }"t        j                  |||f||f|!df||fd      }#||z  t        j                  d|      z   }$t        j                  d|      }%t        j                  |gt         j
                        t        d      z
  }&t        j                  |gt         j
                        }'t        j                  ||gt         j
                        }(|dz  })t        j                  d|      }*| | z   |$d d d f   |z  z   |*d d d f   |	z  z   }+t        j                  |+      },|,|)z  j                  |j                  j                        },d}-|r|dz   |z  n|}.t        |-|.|      D ]  }/t        j                  |"      }0t        j                  |#      }1t        j                  ||gt         j
                        }2|r4t        j                  |$d d d f   |/|%d d d f   z   k\  |2t        d	            }2|2t        j                  |,|0      z  }2t        j                  |&t        j                  |2d            }3t         j                   j#                  |&|3z
        }4t         j                   j#                  |2|3d d d f   z
        }5|(|4d d d f   z  }(|(t        j                  |5j                  |j                  j                        |1      z  }(|'|4z  t        j$                  |5d      z   }'|3}&t        j&                  |"d|f      }"t        j&                  |#|df      }# |(|'d d d f   z  }(|||z  z   |$z   }6t        j(                  |6|&t         j                   j+                  |'      z          t        j                  |||f||f|!||z  z   df||fd      }7t        j(                  |7|(j                  |j                  j                               y )
Nr      )r   r   baseshapestridesoffsetsblock_shapeorderr   r   dtypeinf/ldG?-inf)tl
program_idmake_block_ptrarangezerosfloat32floatloadtor"   
element_tyrangewheredotmaximummaxmathexp2sumadvancestorelog2)8QKVsm_scaleLOut	stride_qz	stride_qh	stride_qm	stride_qk	stride_kz	stride_kh	stride_kn	stride_kk	stride_vz	stride_vh	stride_vn	stride_vk	stride_oz	stride_oh	stride_om	stride_onZHN_CTX	Z_H_N_CTXr   r   r   r   start_moff_hz
qvk_offset	vk_offsetK_block_ptrV_block_ptroffs_moffs_nm_il_iaccqk_scaleoffs_kQ_ptrsqlohistart_nkvqkm_i_newalphapl_ptrsO_block_ptrs8                                                           r   _fwd_kernelro      s    mmAG]]1F)#Ji'I##Y'I&I!7+K ##,'I&Al+K w1g!66FYYq'"F
((G9BJJ
/%,
>C
((G9BJJ
/C
((G\*"**
=C
 *$H YYq,'F^fQWo	99F47Oi<WWF
A	
X!'',,-A	
B$-'A+	 5BR) <GGK GGK XXw(

;&D/gtQw.GH"eTZm\B
bffQl**S"&&Q-0S7]+GGLLgag../uQW~rvvadd177--.22EkBFF1aL(jjq'l;jjwl;+<. AtG
C%&(FHHVS277<<,,-##,'I&Ww..2l+K HH[#&&!3!345r   D_HEADc                    t        j                  d      |z  t        j                  d|      z   }t        j                  d|      }t        j                  | |d d d f   |z  z   |d d d f   z         j	                  t         j
                        }t        j                  ||d d d f   |z  z   |d d d f   z         j	                  t         j
                        }t        j                  ||z  d      }	t        j                  ||z   |	       y )Nr   r   )axis)r&   r'   r)   r-   r.   r+   r7   r9   )
r@   DODeltar   rp   off_moff_nododeltas
             r   _bwd_preprocessrz   u   s     MM!w&1g)>>EIIa E
eAtGnv--dAg>?BB2::NA	eAtGnv--dAg>	?	B	B2::	NBFF1r6"EHHUU]E"r   SEQUENCE_PARALLELCAUSALMMA_V3c.           	      N   |,r|&|(z  }.nd}.|$|z  |#|z  z   |z  }/|$|z  |#|z  z   }0|$|z  |#|z  z   |z  }1|$|z  |#|z  z   |z  }2|+r|0||&z  z  }0|0|z  }0t        j                  ||.|/z   df      }t        j                  ||&|(z  |1z   df      }t        j                  ||&|(z  |2z   df      }t        j                  ||.|/z   df      }t        j                  ||.|0z   df      }t        j                  ||&|(z  |1z   df      }t        j                  ||&|(z  |2z   df      }|&|(z  t        j                  d|(      z   }3t        j                  d|*      }4||%|"z  z   }5|
|%|"z  z   }6t        j                  |(|)gt         j                        }7t        j                  |(|)gt         j                        }8t        j
                  |      }9t        j
                  |      }:t        |.|'|(z  |(      D ]N  };|;|4z   }<t        j
                  |      }=|,r;t        j                  |<d d d f   |3d d d f   k\  t        d      t        d            }>n't        j                  |(|*gt         j                        }>|>t        j                  |=t        j                  |9            z  }>|>|z  }>t        j
                  |6|<z         }?t         j                  j                  |>|?d d d f   z
        }@t        j
                  |      }A|7t        j                  t        j                  |@j                  | j                  j                              |A      z  }7t        j
                  |5|<z         }Bt        j                  |At        j                  |:            }C|@|C|Bd d d f   z
  z  |z  j                  | j                  j                        }D|8t        j                  t        j                  |D      |=      z  }8|+sht        j
                  |      }E|Et        j                  D|9      z  }Et        j                   ||Ej                  | j                  j                               n|+r|-rt        j                  D|9      }EnOt        j                  t        j                  t        j                  |9      t        j                  D                  }Et        j                   |Ej                  | j                  j                               t        j                  ||(df      }t        j                  ||(df      }t        j                  ||(df      }Q t        j                   ||7j                  |j                  j                               t        j                   ||8j                  |j                  j                               y )Nr   r!   g        r%   )r&   r8   r)   r*   r+   r-   r0   r1   r,   r2   transr5   r6   r.   r"   r/   r9   )Fr;   r<   r=   r>   r`   r@   rs   DQDKDVr?   DQ_block_ptrrY   rZ   DO_block_ptrDQ_block_ptrDK_block_ptrDV_block_ptr
stride_dqarA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rQ   rR   rS   off_hoff_zrV   rf   	num_blockr   r   r   r{   r|   r}   rd   Q_offset	DQ_offsetK_offsetV_offsetr\   r[   D_ptrsrm   dvdkrg   rh   rU   offs_m_currrc   ri   r^   rl   rx   DidpdsdqsF                                                                         r   _bwd_kernel_one_col_blockr      s   & w	!EI$55)CH	!EI$55I	!EI$55)CH	!EI$55)CHZ'))	Y&I**[2=!*<=K**[7W+<x+G*KLK**[7W+<x+G*KLK::lR(]A,>?L::lR)^Q,?@L::lWw->-I1,MNL::lWw->-I1,MNL w1g!66FYYq'"F%F%F	7L)	<B	7L)	<B
A
AY0': )>&GGK  +ag.6$'?CU3ZQVW]Q^_B7G,BJJ?B
bffQ$$
hggf{*+GGLLc!T'l*+WW\"
bffRXXadd177#5#567<<WWVk)*VVB$21d7#$x/33AGG4F4FG
bffRXXb\1%% &B"&&Q-BHH\255););#<=VVB] XXbffRXXa["((2,?@HH\255););#<= zz,!=jjwl;zz,!=S)>V HH\255!3!345HH\255!3!345r   c#                 d   |dz  }#t        j                  d      }$|$|z  }%|$|z  }&t        j                  | ||f||fd||fd      }'t        j                  |||f||fd||fd      }(t        j                  |||f||fd||fd      })t        j                  |||f||fd||fd      }*| r"t        j                  |||f||fd||fd      }+n!t        j                  |||f||fd||fd      }+t        j                  |||f||fd||fd      },t        j                  |||f||fd||fd      }-t        j                  ||      }.| sst	        d|.      D ]c  }/t        g | ||||#||||||	|
|'|(|)|*|+|,|-|||||||||||||||||&|%|$|/|.|||| |!|"d e y t        j                  d      }/t        g | ||||#||||||	|
|'|(|)|*|+|,|-|||||||||||||||||&|%|$|/|.|||| |!|"d y )Nr$   r   )r   r   r    r   )r   r   r   r{   r|   r}   r   )r&   r'   r(   r   r0   r   )0r;   r<   r=   r>   r@   rs   r   r   r   r?   r   r   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rQ   rR   rS   rT   SQ_Z_H_N_CTXr   r   r   r{   r|   r}   r`   rV   r   r   r   rY   rZ   r   r   r   r   num_block_nrf   s0                                                   r   _bwd_kernelr      sO   $ *$H]]1FaKEQJE##,'I&l+K ##,'I&l+K ##,'I&l+K $$,'I&l+L ((.	* ,/
 ((l+	* ,/
 $$,'I&l+L $$,'I&l+L ''%)KQ, 	(G% (a ( (A (x ( (3 ( (&((*,(.0(&'( '(( '2	( 4?	( AL	(
 '3(
 5A(
 CO(
 Q]( '1( 3<( >G( IR( T]( '0( 2;( =F( HQ( '0( 2;( =F( HQ( '(( *+( -2( ',( .3( 5;( =D( FQ( /6L.58I-3-3(	(& --"! 	$! 	$Q 	$ 	$8 	$X 	$s 	$B 	$"$	$&(	$*,	$"#	$ #$	$ #.		$ 0;		$ =H		$
 #/	$
 1=	$
 ?K	$
 MY	$ #-	$ /8	$ :C	$ EN	$ PY	$ #,	$ .7	$ 9B	$ DM	$ #,	$ .7	$ 9B	$ DM	$ #$	$ &'	$ ).	$ #(	$ */	$ 17	$ 9@	$ BM	$ +2*14E)/)/	$r   c                   .    e Zd Zedd       Zed        Zy)
_attentionc                    t         j                  j                         }|d   dk  rt        d      d}d}	|j                  d   |j                  d   |j                  d   }}}
|
|k(  r||k(  sJ |dv sJ t        j
                  |      }t        |j                  d   |      |j                  d   |j                  d	   z  d	f}t        j                  |j                  d   |j                  d	   z  |j                  d   f|j                  t         j                  
      }|dk  rdnd}t        |   |||||||j                  d      |j                  d	      |j                  d      |j                  d      |j                  d      |j                  d	      |j                  d      |j                  d      |j                  d      |j                  d	      |j                  d      |j                  d      |j                  d      |j                  d	      |j                  d      |j                  d      |j                  d   |j                  d	   |j                  d   |j                  d   |j                  d	   z  |j                  d   z  f||	|||dd | j                  |||||       || _        || _        || _        || _        || _        |S )Nr      zEFlash attention currently only supported for compute capability >= 80   @   >          r   r   r   r   devicer"         )r   r   r   r   	num_warps
num_stages)torchcudaget_device_capabilityRuntimeErrorr   
empty_liker   emptyr   r+   ro   stridesave_for_backwardgridr>   r   causalsequence_parallel)ctxrc   rg   rh   r   r>   r   
capabilityr   r   LqLkLvrw   r   r?   r   s                    r   forwardz_attention.forwardr  sh    ZZ557
a=1fggWWR[!''"+qwwr{BRxB"H$$&&&&QQWWQZ)1771:
+BAFKKaggaj0!''!*=ahhV[VcVcdrAq	Dq!XHHQK!ahhqk188A;HHQK!ahhqk188A;HHQK!ahhqk188A;HHQK!ahhqk188A;GGAJ
AGGAJGGAJ#aggaj0	
 W2	
  	aAq!,
 1r   c           "         t         j                  j                         }|d   dk\  }d}t               rd}| j                  \  }}}}}	| j
                  }
|j                  d   }|j                         }|
rIt        ||      }|f|j                  z   }t        j                  ||j                  |j                        }n!t        j                  ||j                        }t        j                  |      }t        j                  |      }t        j                  |	      }t        t        |j                  d   |      | j                  d   z  f   ||||| j                   	       t#        | j                  d   |
rt        ||      ndf   |||| j$                  ||||||	||j'                         |j)                  d      |j)                  d      |j)                  d      |j)                  d
      |j)                  d      |j)                  d      |j)                  d      |j)                  d
      |j)                  d      |j)                  d      |j)                  d      |j)                  d
      |j                  d   |j                  d   |j                  d   |j                  d   |j                  d   z  |j                  d   z  t        ||      |j                  d   z  |j                  d   z  |j                  d   z  f||| j                   |
| j*                  |ddd t-        |j                        dk(  r|j/                  d      }|||d d d fS )Nr   	   r   r   r   r   r!   r   )r   rp   r   r   )r   r   r   r{   r|   r}   r   r      )dim)r   r   r   r   saved_tensorsr   r   
contiguousr   r*   r   r"   
zeros_liker   rz   r   r   r   r>   numelr   r   lenr7   )r   rx   r   r}   BLOCKrc   rg   rh   rw   r?   r   
seq_len_kvreplicasnew_dq_shaper   r   r   ry   s                     r   backwardz_attention.backward  s   ZZ557
A!#8E))1aA11WWQZ
]]_J.H$<!''1L\!((!''JB!!!1773Ba a   #aggaj%0388A;>AB##	
 	SXXa[=N$z5"9TUVWq!S\\rBGGIqxx{AHHQK!ahhqkHHQK!ahhqk188A;HHQK!ahhqk188A;GGAJ
AGGAJGGAJ#aggaj0U#aggaj01771:=
J	
 5))/::%	
* rxx=AAB2r4t++r   N)F)__name__
__module____qualname__staticmethodr   r   r   r   r   r   r   p  s)    % %N 4, 4,r   r   )__doc__r   r	    r   r   r   r&   r   	constexprro   rz   r   r   autogradFunctionr   apply	attentionr   r   r   <module>r      s      N [6 [6 68\\[6 [6 <<[6 [6| # \\	#
 LL# #$ `6 (*||`6 DF<<`6 (*||`6 24`6 ')ll`6  ')ll!`6 `6F @$ @$ 68\\@$ @$ $&<<@$ @$ @$ @$F_,(( _,D 	r   