
    sg0>                        d dl Z ddlmZmZmZ ddlmZ  edd i      edej                  dej                  d	ej                  d
ej                  dej                  f
d              ZddZ	d Z
edej                  dej                  d	ej                  dej                  d
ej                  f
d       ZddZd ZddZ G d de j                  j                         Z G d d      Zy)    N   )cdiv
heuristicsjit)languageEVEN_Kc                     | d   | d   z  dk(  S )NKTILE_Kr    )nargss    P/var/www/html/venv/lib/python3.12/site-packages/triton/ops/blocksparse/matmul.py<lambda>r      s    E#Jx8A=     TILE_MTILE_Nr   BLOCKc                 h   t        j                  d      |z   }||dz  z  }t        j                  d      }t        j                  |dz         }t        j                  |dz         }||z  t        j                  d|      |z  z   }t        j                  d|      }| ||z  z   ||z  z   |d d d f   |z  z   |d d d f   |z  z   }t        j                  |dz         }||z  t        j                  d|      |z  z   }t        j                  d|      } |||z  z   ||z  z   |d d d f   |
z  z   | d d d f   |	z  z   }!t        j                  ||ft         j
                        }"t        |d|       D ]  }#|r+t        j                  |      }$t        j                  |!      }%nDt        j                  ||d d d f   |#k  d      }$t        j                  |!| d d d f   |#k  d      }%|"t        j                  |$|%t         j
                        z  }"|||z  z  }|!||	z  z  }! |"j                  |j                  j                        }&t        j                  d|      |z  }'t        j                  d|      |z  }(|||z  z   ||z  z   |'d d d f   |z  z   |(d d d f   |z  z   })t        j                  |)|&d	
       y )Nr   r         dtypeg        )maskother	out_dtypeTr   )tl
program_idloadarangezerosfloat32rangedottor   
element_tystore)*ABC	stride_za	stride_ha	stride_ma	stride_ak	stride_zb	stride_hb	stride_bk	stride_nb	stride_zc	stride_hc	stride_mc	stride_ncr
   grid_offsetlutr   r   r   r   r   block_idoff_zoff_hstart_amoffs_amoffs_aka_ptrsstart_bnoffs_bnoffs_bkb_ptrsacckabcoffs_cmoffs_cnpcs*                                             r   _sdd_kernelrM      s    }}Q+-H8a<CMM!EGGC!GE wwsQwH"))Av"6">?Gii6"G
)

)
 !T'
Y
&' $'
Y
&	'F wwsQwH"))Av"6">?Gii6"G
)

)
 $'
Y
&' !T'
Y
&	'F ((FF#2::
6C1a&! 	%AAWT1W%5%9DAWQW%5%9DArvvabjj11&9$$&9$$	% 	qww!!"A ii6"U*Gii6"U*G	

)


Y

 !T'
Y
&
' $'
Y
&	
'B
 HHRr   c
                    | j                  d      dk7  r$| j                  d      dk7  r| j                         } |j                  d      dk7  r$|j                  d      dk7  r|j                         }|r
|| }} | | }}|rdnd}
|rdnd}| j                  |
   |j                  |   }}||k7  rt        d| d| d      |	Kt	        j
                  | j                  d	   |j                  d	   ||f| j                  | j                  
      }n1|	j                  | j                  d	   |j                  d	   ||fk(  sJ |	}|j                  d   d|j                  d	   g}t        |   | ||| j                  d	      | j                  d      | j                  |rdnd      | j                  |rdnd      |j                  d	      |j                  d      |j                  |rdnd      |j                  |rdnd      |j                  d	      |j                  d      |j                  d      |j                  d      |d	|||d|dd       |S )Nr   r   r   zInner dimension mismatch (A: z vs B: )r   r   device       )r   r   r   r   
num_stages	num_warps)	stride
contiguousshape
ValueErrortorchemptyr   rS   rM   )rG   rH   trans_atrans_btrans_cspdimsblockr9   widthsouta_dimb_dimKaKbrI   grids                   r   
sdd_matmulrj   S   s   xx{aAHHQK1,LLNxx{aAHHQK1,LLN!1&;GBrEBrEWWU^QWWU^B	Rx8GB4qIJJ
{KKSYYq\5%@XYX`X`ayyQWWQZ1ueDDDDGGAJ1771:&D	1a	QXXa[!((1Q"?w!\]A^	QXXa[!((1Q"?w!\]A^	QXXa[!((1+qxx{
AsU2Uq Hr   c                     | j                  d      j                  |      j                         }|j                         }|d fS )NFas_tuple)nonzeror&   intrY   )layoutrb   rS   r9   s       r   sdd_lutrq   u   s<    
..%.
(
+
+F
3
7
7
9C
..
C9r   GROUP_SIZE_Mc                 P   t        j                  d      }t        j                  d      }t        j                  d      }t        j                  d      }t        j                  |||||      \  }}t        j                  d      }||dz  z   }t        j                  |dz         }t        j                  |dz         }t        j                  |dz         }t        j                  |dz         } ||z   }!t        j                  |!dz         }"t        j
                  |"d      }"t        j                  d|      }#t        j                  d|      }$| ||z  z   |"|z  z   |#d d d f   |z  z   |$d d d f   |z  z   }%||z  t        j                  d|      z   }&t        j                  t        j
                  |&|z  |      |      }&t        j                  |!      }'t        j
                  |'d      }'|'t        j                  d|      z   }(|||z  z   | |z  z   |&d d d f   |
z  z   |(d d d f   |	z  z   })t        j                  ||ft         j                        }*|!dz  }!t        j                  |!dz         }+t        j
                  |+d      }+t        j                  |!      },t        j
                  |,d      },t        |d|       D ]  }-t        j                  |%      }.t        j                  |)      }/|*t        j                  |.|/t         j                        z  }*|%|+z  }%|)|,|	z  z  })|!dz  }!t        j                  |!dz         }+t        j
                  |+d      }+t        j                  |!      },t        j
                  |,d      }, |*j                  |j                  j                        }0||z  t        j                  d|      z   }1||z  t        j                  d|      z   }2|| |z  z   ||z  z   |1d d d f   |z  z   |2d d d f   |z  z   }3t        j                  |3|0|2d d d f   |k  	       y )
Nr   r   r   rU   r      r   r   r   )r   r   num_programs	swizzle2dr    multiple_ofr!   max_contiguousr"   r#   r$   r%   r&   r   r'   r(   )4r)   r*   r+   	stride_azr-   	stride_amr/   r0   r1   r2   	stride_bnr4   r5   	stride_cm	stride_cnDS0DS1r9   r   r   r   rr   r   pid_mpid_n	num_pid_m	num_pid_npidzheaderoffsetr
   columnr<   pincr:   r>   r?   parB   start_bkrC   pbrE   inc_ainc_brF   rG   rH   rI   rJ   rK   rL   s4                                                       r   _dsd_kernelr      s    MM!EMM!E"I"I<<uiLQLE5==D519_FWWVaZ F

AWWVaZ FGGFQJE<Dwwtax H~~h*Hii6"Gii6"G	
TI	
Y


!T'
Y
&
' $'
Y
&
'B
 fnryyF33Gw}f EvNGwwt}H~~h*H1f--G	
TI	
)


$'
Y
&
' !T'
Y
&
'B ((FF#2::
6CAIDGGD1HENN5!$EGGDMENN5!$E1a&! 
)GGBKGGBKrvvabjj11
e
ei	q!ua(ua(
) 	qww!!"Avo		!V 44GfnryyF33G	

)




 !T'
Y
&
' $'
Y
&	
'B
 HHRq)C/0r   c
                 4   | j                  d      dk7  r$| j                  d      dk7  r| j                         } |j                  d      dk7  r$|j                  d      dk7  r|j                         }|||rdnd   z  }
|j                  d      |j                  d      }|j                  |rdnd      | j                  }}|}|rn|
}|r|
n}|	't	        j
                  ||||f|| j                        }n|	j                  ||||fk(  sJ |	}d}fd}t        |   | ||| j                  d      | j                  d      | j                  |rdnd      | j                  |rdnd      |j                  d      |j                  d      |j                  |rdnd      |j                  |rdnd      |j                  d      |j                  d      |j                  |rdnd      |j                  |rdnd      |
|f||t        |d      |d	d	d	d
 |S )Nr   r   r   r   rR      c                 (    t        | d         gS )Nr   )r   )metaBS0BS3widths    r   r   zdsd_matmul.<locals>.<lambda>   s    c4>2E3? r   rT   rU   )r   r   r   r   rV   rW   rr   )
rX   rY   sizer   r\   r]   rS   rZ   r   min)rG   rH   r^   r_   r`   ra   rb   r9   r   rd   AS1BS1r   CS0CS1CS2CS3rI   r   ri   r   r   s           `           @@r   
dsd_matmulr      s   xx{aAHHQK1,LLNxx{aAHHQK1,LLN
&g1-
-C
&&)C
&&)C
&&g1
%CGGE
C
C#cC#cC
{KKc3,E!((KyyS#sC0000F?D	1a	QXXa[!((1Q"?w!\]A^	QXXa[!((1Q"?w!\]A^	QXXa[!((1Q"?w!\]A^S# VCrN%TU! Hr   c                 	   t        j                  | |rdnd      }t        j                  |      j                  d      \  }}|j	                         }||z  }|r| j                  d      }	n"| j                  dd      j                  d      }	|	j                  d      }
t        j                  |      }t        j                  |dd d	      |dd t        j                  ||
dz
  t        j                  |      z        }|	dddf   |z  }|j                         }|ddxxx |dd z  ccc ||z  }|j                  dd      j                  d|      }||ddddf<   |dddfxx   |dz
  |z  z  cc<   |||dkD        |||dkD     df<   |j                  d      }|r"t        j                  |
| j                  
      }nt        j                  g t         j                   | j                        }d}t#        | j                  d            D ]  }| |ddddf   j                         j%                         }|j                         }dt        j                  || j                  
      z   ||dkD  <   t        j&                  |||j(                  |j(                  dkD     z   dz
  f      }||z  } ||z  |z  }|ddxxx |dd |z  |z  z  ccc |j                  dd      j                  d|      }|r#||ddddf<   |dddfxx   |dz
  |z  z  cc<   n(||z  |ddddf<   |dddfxx   |dz
  |z  |z  z  cc<   |||dkD        |||dkD     df<   |j                  d      }|j                  d      }|dz  |z  d|z  z   }||z  }t        j*                  ||||fd	      j                  d      j-                         }t        j*                  ||fd	      j                  d      j-                         }t        j.                  d|j                  |j0                        }t        j&                  ||f      }t        j&                  ||f      }|j3                  t         j4                        j7                  |      }||fS )a  
    Generates the look-up table for incrementing pointers in the DSD/DDS matmul.
    Example (BLOCK=32, STEP=16)
    [[1, 0, 0, 1, 0],
     [0, 1, 1, 0, 1],
     [1, 0, 1, 0, 0]]

    Then the offsets for A are
     [0 , 16, 32, 48] <- row 0
      \----/  \----/
      col=0   col=3
     [64, 80, 96, 112, 128, 144] <- row 1
      \----/   \----/  \------/
       col=1    col=2    col=3
     [160, 176, 192, 208]
    which leads to increments table
    [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16]

    Because B is dense, the offsets are
    [0, 16, 96, 112] <- row 0
    [32, 48, 64, 80]  <- row 1
    [0, 16, 64, 80]   <- row 2
    r   r   Trl   Fr   NrP   )dim)rS   rR   rU      )rS   r   )r\   sum	ones_likern   flatten	transposer   
zeros_likecumsumr   cloneviewrepeatr!   rS   tensorint64r$   longcatTstackrY   r"   r   typeint32r&   )rp   rb   steptransrS   sizeshead_idcol_idsegmentsnnz
num_blocksoffsetsB_idxB_incsdivA_idxcurrent_offsetzlayoutwmsumA_incsr   r   incspadr9   s                             r   dsd_lutr      s7   0 IIf5aa0Eooe,44d4COGVMMOEt|Hnnen,q!$,,e,<!Ju%G,,uSbzq1GABKii*q.EOOG4L!LMG 1IE[[]F
12J%*J
4-C[[Q&&q#.FF1ab5M
1a4LS1W$$L',WX\-B'CF78a< !#$[[_F
 Z>Ru{{6==Iv{{1~& 	#AQ1Wo++-224G;;=D#$u||D'O#OGGaK IIunwyyQ7O&ORS&STUEd"N	# U]U"F
12J%*u$u,,J[[Q&&q#.Fq!"uq!tqD((uq!"uq!tqD(500',WX\-B'CF78a< !#$[[_FKKNEkC!e)+G#~H[['8VW=1EJJ2NYY[F;;'Q/44R8CCED ++bDJJ
?C99dC[!D
))VTN
#C
((5;;

"
"6
*C:r   c
                 2    t        || | | | |||||	
      S N)rd   )r   )
rG   rH   r^   r_   r`   ra   rb   r9   r   rd   s
             r   
dds_matmulr   Z  s(    aKW'k65RUW\beffr   c                   8    e Zd ZeeedZed        Zed        Z	y)_matmulsdddsdddsc                    t        j                  |   ||||||||	|
|
      }| j                  ||       || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |d u| _        |S r   )r   fnsave_for_backwardda_lutda_widthdb_lutdb_widthmodera   rb   r^   r_   r`   has_out)ctxrG   rH   r^   r_   r`   r   ra   rb   c_lutc_widthr   r   r   r   rd   rI   s                    r   forwardz_matmul.forwardg  s     JJtQ7GWfeUT[adea#


	or   c                    | j                   \  }}d\  }}| j                  }| j                  d   rx|d   |d   z   |d   z   }t        j                  |   ||| j
                  | j                   | j                  | j                  | j                  | j                  | j                  	      }| j                  d   rx|d   |d   z   |d   z   }t        j                  |   ||| j                   | j
                  | j                  | j                  | j                  | j                  | j                  	      }| j                  r|nd }	||d d d d d d d d d d d d |	fS )N)NNr   r   r   )saved_tensorsr   needs_input_gradr   r   r`   r_   r^   ra   rb   r   r   r   r   r   )
r   dcrG   rH   dadbr   mode_damode_dbdouts
             r   backwardz_matmul.backwardz  sE      1Bxx"1gQ'$q'1GG$RCKKS[[#++WZWaWacfclcl%(ZZ?B "1gQ'$q'1GG$QOS[[#++WZWaWacfclcl%(ZZ?B[[rd2tT4$d$dD$/ 	/r   N)
__name__
__module____qualname__rj   r   r   r   staticmethodr   r   r   r   r   r   r   c  s4    Jz	BB $ / /r   r   c                       e Zd ZddZddZy)matmulc                 L   |dvrt        d      || _        || _        || _        || _        || _        || _        |j                  | _        t        |d      }| j                  dk(  rRt        |||      \  | _        | _        t        |||d|      \  | _        | _        t        |||d|      \  | _        | _        | j                  dk(  rgt        |||| j                   |      \  | _        | _        t        |||      \  | _        | _        t        |||| j                  |      \  | _        | _        | j                  dk(  rht        |||| j                  |      \  | _        | _        t        |||| j                   |      \  | _        | _        t        |||      \  | _        | _        y y )	Nr   z"Supported modes are: sdd, dsd, ddsrT   r   TFr   r   )NotImplementedErrorrb   r   r^   r_   r`   rp   rZ   ra   r   rq   r   r   r   r   r   r   r   )	selfrp   rb   r   rS   r^   r_   r`   r   s	            r   __init__zmatmul.__init__  s   ,,%&JKK
	ll5"~99'.vuf'E$DJ)0dF)S&DK)0eV)T&DK99'.vudDTV\']$DJ)0)G&DK)0dllTZ)[&DK99'.vudDLLRX'Y$DJ)0$,,FVX^)_&DK)0)G&DK r   Nc                 <   t         j                  ||| j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  | j                  |      }|S N)r   applyr^   r_   r`   r   ra   rb   r   r   r   r   r   r   )r   rG   rH   rd   rI   s        r   __call__zmatmul.__call__  so    MM!QdllDLL$))UYU`U`bfblbl**dll++t}}++t}}	
 r   )FFFr   )r   r   r   r   r   r   r   r   r   r     s    H0r   r   r   )r\    r   r   r   r   r   	constexprrM   rj   rq   r   r   r   r   autogradFunctionr   r   r   r   r   <module>r      s+    % %  =  <
 <
 /1ll<
 EGLL< ||< .0\\< <~D G1
 G1
 /1llG1
 EGLLG1 !llG1 46<<G1 G1T#L^Ng*/enn%% */Z   r   