
    sg3                      <   d dl Z ddlmZ ddlmZ ddlmZ d Zedej                  dej                  d	ej                  fd
       Zedej                  dej                  d	ej                  fd       Z	 G d de j                  j                        Z G d d      Zy)    N   )jit)language)next_power_of_2c                 4    | dk  ry| dk  ry| dk  ry| dk  ryy	)
N            i      i          )ns    Q/var/www/html/venv/lib/python3.12/site-packages/triton/ops/blocksparse/softmax.py	num_warpsr      s-    CxCxCxDy    ROW_SIZE
BLOCK_SIZEIS_DENSEc                    t        j                  d      }t        j                  d      }t        j                  d      }|t        j                  d      z  |z   }t        j                  d|
      |z  }t        j                  d|
      |z  }|||z  dz  z   }t        j                  |dz         }t        j                  |dz         }||z  }|||z   |z  |z  z  }|||z  |z  z  }|rt        j                  d|
      }n]|dt        j                  d      z  t        j                  d      z  |z  z   }t        j                  ||z   |z   ||k  d      }||z  |z   }||k  }t        j                  ||z   |z   |t        d             }|j                  t         j                        }|}||z  }|L|||z  z  }|||z  z  }||z
  dz
  |z   }|dk\  ||k  z  }t        j                  |||z  z   |z   |d      }||z  }|j                  t         j                        }t        j                  ||kD  |	z  t        d       |      }t        j                  |      }t        j                  | |z   |z   ||       y )Nr   r	   r   maskotherinf        r   )tl
program_idnum_programsarangeloadfloattofloat32wheresoftmaxstore) OutA	stride_xzLUTRextent	stride_zr	stride_hrscale	is_causalr   r   r   hmzhmlane_nblock_nheadersizeoffsetoff_ansoff_lutstart_nr   aoutoff_lomask_lo
rel_logitss                                    r   _blocksparse_softmax_fwdrE      sZ    	aA
aA
aA	
R__Q	!	#BYYq(#j0Fii8$
2GB*$))F776A:DWWVaZ F	ME	fw*,z99E	a*n
**EYYq(#1rq11BOOA4FF*TT''#-'1$aPz!F*T>D
E	F"eEl]CA	RZZA
C5LC}	Q]	Q]1*q.B&Q;6F?3WWQV^f47#N
z
&&
C
((BFi'%,
<C
**S/CHHS5[6!3T2r   c                 4   t        j                  d      }t        j                  d      }t        j                  d      }|t        j                  d      z  |z   }t        j                  d|      |z  }t        j                  d|      |z  }|||z  dz  z   }t        j                  |dz         }t        j                  |dz         }||z   |z  |z  }|||z  |z  z  }||k  }|||z  z   |z   }|||z  z   |z   }|rt        j                  d|      }nZ|dt        j                  d      z  t        j                  d      z  |z  z   }t        j                  ||z   |z   |d      } | |z  |z   }t        j                  ||z   |d      }!|!j                  t         j                        }!t        j                  ||z   |d      }"|"j                  t         j                        }"t        j                  ||kD  |z  |!|!k(  z  d|!      }!|!|"t        j                  |!|"z  d      z
  z  }#|J|||
z  z  }|||z  z  }|	|z
  dz
  |z   }$|$dk\  |$|	k  z  |z  }%t        j                  |||	z  z   |$z   |#|%       |#|z  }#| ||z  z   |z   }&t        j                  |&|z   |#|       y )Nr   r	   r   r   r   r   )
r   r   r    r!   r"   r$   r%   r&   sumr(   )'DA
stride_zdxDOutstride_zdoutr)   stride_zoutr1   r,   DRr.   r/   r0   	stride_err2   r   r   r   r3   r4   r5   r6   r7   r8   r9   r:   r;   off_mnr   AsDOutsr=   r>   r?   r@   doutdarB   rC   DAss'                                          r   _blocksparse_softmax_bwdrU   J   s    	aA
aA
aA	
R__Q	!	#BYYq(#j0Fii8$
2GB*$))F776A:DWWVaZ Fw*,z9F
q:~++FT>D	q;		'B1|##f,EYYq(#1rq11BOOA4FF*TT''#-'1AFz!F*
V$c2A	RZZA7756>C8D772::D
"q&I%a0"a8A	
dRVVAHa((	)B	~
a)m
a)m1*q.B&Q;6F?3d:
a&j6)2G<	eB q:~

&CHHS6\2D)r   c                   <    e Zd Zed        Zed        Zed        Zy)_softmaxc           	         t        j                  g t         j                  | j                        }|j	                         }t        | j                  d         D ]2  }t        j                  || |d d d d f   j                  d      f      }4 ||z  }t        j                  |      }t        j                  |d d d      |dd  | j                  d      d d df   }t        j                  ||fd      j                  d      }	t        j                  |	|f      j                  t         j                        j!                  |      }
|
t#        |j%                               fS )	Ndtypedevicer   )dimr	   F)as_tupler   )torchtensorint64r[   clonerangeshapecatrG   
zeros_likecumsumnonzerostackviewtypeint32r$   intmax)layoutblockr[   _emptysizesr3   total_sizesoffsetscolumnsr9   luts              r   make_lutz_softmax.make_lut   s   bFMMJv||A' 	@AIIufQ1Wo&9&9"&=>?E	@em""5)ll5":15..%.0A6eW-15::2>ii)*//<??GC)***r   c
                    |Et        |t        j                        r+|j                  j                  dk(  sJ |j                         }|j                  d   }
|d   |d   |z  |
g}|dn|j                  }|dn|j                         }t        j                  |      }t        |   |||j                  d      |||d   |d   |d   |||t        |      |	t        |             | j                  ||       || _        || _        || _        || _        || _        || _        |j&                  | _        |	| _        || _        |S )Ncpur   r	   )r	   r	   r	   r	   r\   r   r   r   r   )
isinstancer_   Tensorr[   rk   itemrd   stride
empty_likerE   r   r   save_for_backwardspdimsrp   maxlutr1   	rel_shaperel_stridesrZ   	rel_dtypeis_denser2   )ctxr@   r1   rD   r2   r   rp   rv   r   r   Mgridr   r   rA   s                  r   forwardz_softmax.forward   s>   E5<<!@<<$$---JJLEGGAJq	6!9u,a0$.$6LJ<L<L	&0&8lj>O>O>Qq! &AHHQK	"{1~{1~$V,'		
 	c3'
	
	!%!
r   c                    | j                   \  }}d }| j                  d   r6t        j                  | j                  | j
                  |j                        }|j                  d   }| j                  d   | j                  d   | j                  z  |f}t        j                  |      }t        |   ||j                  d      ||j                  d      ||j                  d      | j                  ||| j                  d   | j                  d   | j                  d   | j                  d   | j                  | j                  t!        | j"                        | j$                  t'        | j"                               |d d |d d d d d d d d d d d d d d fS )Nr   rY   r   r	   r\   r   rz   )saved_tensorsneeds_input_gradr_   zerosr   r   r[   rd   r   rp   r   rU   r~   r1   r   r2   r   r   r   r   )r   rR   rA   rv   drr   r   rS   s           r   backwardz_softmax.backward   sT    $$S"S]]#--

SBIIaL

1szz!}syy8!<d# &		!$++a.AIIb!3??1#5sq7I3??[\K]MMyy$SZZ0\\

+	
 D$D$dD$dTXZ^`dfjlprvwwr   N)__name__
__module____qualname__staticmethodrw   r   r   r   r   r   rW   rW      s?    + +     D x xr   rW   c                   $    e Zd ZddZdddddZy)r'   Fc                     |j                   | _        || _        || _        t        j                  | j                  | j                  |      \  | _        | _        || _        y )N)	rd   r   ro   rp   rW   rw   rv   r   r   )selfro   rp   r[   r   s        r   __init__zsoftmax.__init__   sG    ll
 ( 1 1$++tzz6 R$+ r   g      ?N)r1   rD   r2   c                
   |1|j                   |j                   k7  rt        d|j                          t        j                  ||||| j                  | j
                  | j                  | j                  | j                  	      }|S )Nz$relative position embedding must be )	rZ   
ValueErrorrW   applyr   rp   rv   r   r   )r   r@   r1   rD   r2   s        r   __call__zsoftmax.__call__   sp    !j&6&6!''&ACAGG9MNNNN1eZDKKUYU]U]_c_j_j==*r   )F)r   r   r   r   r   r   r   r   r'   r'      s    ! $'45 r   r'   )r_    r   r   r   r   r   	constexprrE   rU   autogradFunctionrW   r'   r   r   r   <module>r      s       	 23 (*||23 *,	23
 (*||23 23j 6* (*||6* *,6* (*||6* 6*rOxu~~&& Oxd r   