
    sgd                         d dl Z d dlZd dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZmZmZmZmZ d dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<  ejz                  e>      Z?ej                  j                  ZA e*de:d      ZB e(ej                  d      ZD e(ej                  deAj                  j                        ZG e(ej                  d      ZI e(ej                  dd      ZKd ZLdddddZM e(eMd      ZN e%eAj                  d       dd!d"       ZOd# ZP e%eAj                  d       dd!d$       ZQ e%eAj                  d       dddd%d&       ZR e%eAj                  d       ddd'd(       ZSd) ZT e(eTd      ZU e j                  d      d*eeW   d+eXfd,       ZYd- ZZd. Z[	 	 d5d/eeW   fd0Z\d1 Z]d2 Z^d3 Z_dd!d4Z`y)6    N)AnyDictListOptional)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32get_mixedmm_preconditionmixed_mm_operationsmm_operations)CppPackedGemmTemplate)V   )config)BackendFeature)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKGemmTemplate)WrapperCodeGen)FlexibleLayout	is_triton)register_lowering)autotune_select_algorithmExternKernelChoiceNoValidChoicesErrorTritonTemplate)get_gpu_shared_memoryuse_aten_gemm_kernelsuse_ck_templateuse_cpp_packed_gemm_templateuse_cutlass_templateuse_max_autotuneuse_triton_template   )	addmm_epilogueextra_mm_configsint8_mm_configsmixed_mm_configsmm_args
mm_configsmm_grid
mm_optionstriton_configmma  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        ram = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        rbn = rn % N
    rk = tl.arange(0, BLOCK_K)
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        if B_PROLOGUE_CAST_TYPE is not None:
            b = b.to(B_PROLOGUE_CAST_TYPE)
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsourcez
at::mm_outzat::addmm_out)op_overloadzat::_int_mmzat::_sparse_semi_structured_mmF)has_out_variantc                 b    | j                         t        j                  t        j                  fv S N)	get_dtypetorchint8uint8)mats    L/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/kernel/mm.py_is_int8_matr=      s     ==?uzz5;;777    outalphabetac                    | j                  d      dk(  s| j                  d      dk(  rt        j                  | d   |||||      S t        j                  | |||||      S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r%   r?   )stridesizer8   addmm)inpmat1mat2r@   rA   rB   s         r<   
bias_addmmrJ      sY     zz!}SXXa[A-{{3q643e$OO;;sD$Cu4HHr>   )type_promotion_kindlayoutc                l   t        | ||      \  }}}}} }d}|}t               s,t        |j                  |j                  |j
                        }t               rt        j                  | |f|      gng }t        | |g|      \  }	}
|
rEt        |      r:t        |||      D ]*  }t        j                  |f| |f|dt        |||||       , |	r)|
r't        ||||      rt!        j"                  ||| |g       |
r't%        ||||      rt'        j(                  ||| |g       t+        || |      rt-        j.                  ||| |g       | |g}|
rt        |      rt0        j2                  j4                  j7                  |      rt9        |       rg }t               r|j;                  d       t=        |      }t?        |||      D ]*  }t        j                  |f| |f|dt        |||||       , tA        | |||||||tC               d d|      }t0        j2                  j4                  jE                  |      s*|#t=        |      dkD  r|D cg c]	  }||v s| }}n|d | }t=        |      dk(  rUt               sKtF        jH                  r;tJ        jM                  d	       t        j                  | |f|      jO                         S 	 tQ        ||| |g|      S c c}w # tR        $ rO tF        jH                  s tJ        jM                  d
       t        j                  | |f|      jO                         cY S w xY w)NrL   r/   devicedtyperE   input_nodesrM   	extern_mm
   )top_kalways_includedr   3No choices for GEMM, using ATen backend as fallbackAAll choices for GEMM were invalid, using ATen backend as fallback)*r*   r#   r   rP   rQ   rE   r   aten_mmbind_is_static_problemr$   r+   mm_templatemaybe_append_choicer-   r"   r   add_cutlass_gemm_choicesr    r   add_ck_gemm_choicesr!   r   add_choicesr8   	_inductorr   run_autoheuristicr   appendlenr'   mm_autoheuristicr   collect_autoheuristicinductor_configautotune_fallback_to_atenlogwarningoutput_noder   r   )rH   rI   rM   mnkr0   aten_layoutchoicesstatic_shape
is_nonzeror   rS   rW    num_choices_before_extra_configs
ah_choiceschoices                    r<   tuned_mmrw      s9   ")$V"DAq!VT4DK$==6;;
 6K5LtTlK	01RT   24,GL*)&1 Aq) 	F++!4L VQ1f5		 
';FAq!'L66wtUofaA6**7FT4LI#FD$7))4L	
 ,K'OO""44T:dO """;/+.w<(&q!Q/ 	F++!4L VQ1f5		 &O+

 %%;;DA%#j/A*=
 18Pf6Z;O6PP!"C#CD 	G%'55IJ||T4L+6BBDDE(wtfMM Q  E88WX||T4L+6BBDD	Es   	KKK AL32L3c                     d}t        j                  |j                        }|;d}|j                  D ]&  }t        j                  |      }||dk(  s!d} d|fS  d|fS d}|D ]  }||z  }	 |dkD  }||fS )NTr   Fr%   )r   %statically_known_list_of_ints_or_nonerE   statically_known_int_or_none)	inputs_tensorsrM   rr   static_sizenonzerossznumeldims	            r<   r\   r\     s     L FFv{{SK 	A<<Q?B~"'g~	
 g~E aiG  r>   c                (   t        | ||t        j                        \  }}}}} }t        | |g|      \  }}|xr |xr t	        ||||      }t               rt        j                  | |f|      gng }	|st        |d      rg }	|rt        j                  |	|| |gdd       |rGt        |d      r:t        |||      D ]*  }
t        j                  |	f| |f|dt        |
||||       , t        |	      dk(  r.t         j#                  d       t        j                  | |f|      g}		 t%        d|	| |g|      S # t&        $ rR t(        j*                  s t         j#                  d	       t        j                  | |f|      g}	t%        d|	| |g|      cY S w xY w)
NrM   	out_dtypeT)enable_int32fuseablenon_fuseablerR   r   z^No choices for integer GEMM avaialbe using configured backends, using ATen backend as fallbackint_mmrY   )r*   r8   int32r\   r"   r   aten__int_mmr[   r$   r   r_   r(   r]   r^   r-   re   rj   rk   r   r   rh   ri   )rH   rI   rM   rm   rn   ro   rr   rs   use_cutlassrq   r   s              r<   tuned_int_mmr     s   ")d6U[[#Aq!VT4  24,GL*W:W2FvqRSUV2WK 6K5L		D$<	01RT 
 )&tD66VdD\Dt	
 )&tD%aA. 	F++!4L VQ1f5		 7|ql	
  $$dD\6:;R(7T4L&QQ R88WX$$dD\6:;(7T4L&QQRs   &D6 6AFF)rA   rB   rM   c                j   d}t        ||| |      \  }}}	}}}}
t        | ||g|      \  }}|r
t               swddlm}m} t        ||      r) ||j                  |j                  |j                        }t               rt        j                  | ||f|||      gng }t        d|| ||g|      S t               rt        j                  |
||f|||      gng }t               ry|
j                         d   dk(  rc|
j                         j                   dk(  rFt"        j$                  j&                  r,|j)                  dt*        j                  |
||f|||             |r`t-        |      rUt/        |||	      D ]E  }t1        j2                  |f|
||f|d	t5        ||||	|      d
t7        |j                  ||      d G |r\|rZt9        ||||	      rLt;        j<                  |
j>                  j@                  d         dk7  rtC        jD                  |||||
g||       |r+tG        ||||	      rtI        jJ                  |||||
g||       tM        |||      rtO        jP                  |||
||g||d       d}tS        |      dk(  rtT        jW                  d       d}|r|jY                  t        j                  |
||f||||             |
j                         d   dk(  rc|
j                         j                   dk(  rFt"        j$                  j&                  r,|j)                  dt*        j                  |
||f|||             	 t        d||
||g|      S # tZ        $ rV t"        j\                  s tT        jW                  d       t        j                  | ||f||||      }|j_                         cY S w xY w)N)rB   rA   rL   r   )FixedLayoutr   rO   )rA   rB   rF   cudarR   r%   )prefix_argsepilogue_fnT)rA   rB   has_biasFrX   rY   )0r*   r\   r#   torch._inductor.irr   r   
isinstancerP   rQ   rE   r   
aten_addmmr[   r   
get_stride
get_devicetyperh   tritonautotune_cublasLtinsertaten_bias_addmmr$   r+   r]   r^   r-   r&   r"   r   rz   rM   rD   r   r_   r    r   r`   r!   r   ra   re   rj   rk   rd   r   ri   rl   )rG   rH   rI   rA   rB   rM   ordered_kwargs_for_cpp_kernelrm   rn   ro   inp_expandedrr   rs   r   r   rq   r   add_aten_fallbackfallback_choices                      r<   tuned_addmmr   B  s    $5!07dCPV0W-Aq!VT413d2CVLL* 0 2 	Cfk*#}}FLLv{{F %& $%	     	 )'Ct;LfUU !" OOtT*	  	
   	##%a(A-##%**f4""44 	  tT*F%d ! 	
 )&1 Aq) 	F++)46 VQ1f5	
 *6<<E	 
';FAq!'L
 778K8K8R8RSU8VW "::t\* ofaA6**4&	
 $FD$7))4&	
 
7|qIJ OOtT*-  	
 ##%a(A-'')..&8&&88 NN$$!4.e$ % -(W|T48&
 	
  -88WX$//$) * 
 **,,-s   M AN21N2)r   rM   c                   ddl m}  || ||      \  } }}| j                         \  }}|j                         \  }}	|j                         \  }
}t        j                  j
                  j                  ||      }t        j                  j
                  j                  d|z  |
      }|6ddlm}  ||j                         |r|n|j                         ||g|dg      }n	|J d       t               rt        j                  | ||f||      gng }||z  dk7  r+t        ||||      rt        j                   ||| ||gdd	       t#        d
|| ||g|      S )Nr   )realize_inputsr   )r   r%   z,out_dtype is ignored if layout is specified.)r   Tr   sparse_semi_structured_mm) torch._inductor.select_algorithmr   get_sizer   graphsizevarsguard_equalsr   r   r   r7   r   aten__sparse_semi_structured_mmr[   r"   r   r_   r   )rH   	mat1_metarI   r   rM   r   m1k1m2_k2rn   rm   ro   r   rq   s                   r<   tuned_sparse_semi_structured_mmr     sj    @*4DAD)T]]_FB EBMMOEB	%%b"-A	%%a"fb1A~2OO"I(8FF	
  P"PP  !"	 ,00y$'9 1 	
   	1uz*61a;66VdD)4tRV	
 %#WtY.Ev r>   c                d    t        j                  | |j                  | j                        |      S )N)r@   )r8   r/   torQ   )rH   rI   r@   s      r<   fallback_mixed_mmr     s"    88D$''$**-377r>   indexreturnc                 f    t         j                  j                  | xs d      }|j                  dk  S )Nr      )r8   r   get_device_propertiesmajor)r   propss     r<   _is_sm7x_or_older_gpur   	  s)    JJ,,UZa8E;;!r>   c                 &    t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wr6   )r   int).0r   s     r<   	<genexpr>zdims_are_int.<locals>.<genexpr>  s     4z#s#4s   )all)dimss    r<   dims_are_intr     s    4t444r>   c                    t        ||| ||      \  } }}t        | ||g      sy |j                  t        j                  k7  ry t        j
                  j                         dk\  rt               dk7  ry | dk(  r|dz  dk7  s|dz  dk7  ry | dk  r|dk\  r|dk\  rt        dddd	d
      S | dkD  r| dk  r|dk\  r|dk\  rt        dddd	d
      S | dkD  r| dk  r|dk\  r|dk\  rt        dddd	d
      S y )N)   r   i  r%      r   i   @            )BLOCK_MBLOCK_NBLOCK_K
num_stages	num_warps    )	get_size_hintsr   rQ   r8   float16r   get_device_capabilityr   r.   )rm   rn   ro   rq   rH   rI   
mat2_dtyperM   s           r<   try_heuristicr     s#   T4Aq1GAq!Aq	"zzU]]" JJ,,.&8		 F	*Av1r6Q;!b&A+Bw19d
 	
 
RAGT	a4i
 	
 
RAGT	a4i
 	
 r>   rV   c           	          t        | ||||      \  }}}t        |||g      sy t        | |      \  }}fd}d } ||||| |||      }t        ||||||	      }|
|j	                  |
|      S |j                         S )Nc                 V   t               }|j                  d|        |j                  d|       |j                  d|       |j                  d|j                  j                  d       |j                  d|j                  j                  d       t	        |d|       t	        |d	|       |j                  d
|j                  j                         d       |j                  d|j                  j                         d       dk(  r t        ||j                  j                         |S )Nrm   ro   rn   
mat1_dtypeT)is_categoricalr   rH   rI   mat1_iscontigmat2_iscontigr/   )r   add_featurerM   rQ   r	   is_contiguousr
   )	rm   ro   rn   rH   rI   mat1_stridemat2_stridecontextr0   s	           r<   get_contextz%mm_autoheuristic.<locals>.get_contextT  s   +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7DKK,=,=>r>   c                       y r6    r   r>   r<   fallbackz"mm_autoheuristic.<locals>.fallbackh  s    r>   )r   rq   rS   r   r0   augment_contextprecondition)rW   )r   r   get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rH   rI   rm   rn   ro   rq   r0   rS   opsr   rV   rW   r   r   r   r   r   autoheuristics         `           r<   rf   rf   A  s     T4Aq1GAq!Aq	"5dDAK( !Q4{KHG0!M 55? 6 
 	
 **,,r>   c                    t        |t              rt        |t              s^t        j                  j                  j                  | j                         t        j                  j                  j                        \  }}t        |t              rt        |t              s^t        j                  j                  j                  |j                         t        j                  j                  j                        \  }}|||fS )Nr   )r   r   r   r   r   
size_hintsr   r8   rb   r   unbacked_symint_fallback)rH   rI   rm   rn   ro   s        r<   r   r     s    aZ3%7!!,,MMO__++DD - 
A
 aZ3%7!!,,MMO__++DD - 
A a7Nr>   c                 d   | j                   j                  }|j                   j                  }||g}g }|D ]p  }t        |t              sMt        j
                  j                  j                  |t        j                  j                  j                        }|j                  |       r |d   |d   fS )Nr   r   r%   )rM   rD   r   r   r   r   r   r   r8   rb   r   r   rd   )rH   rI   r   r   stridesstrides_hintsrD   s          r<   r   r     s    ++$$K++$$KK(GM %&#&WW%%00//HH 1 F 	V$% ]1---r>   c                    t        | |d       \  }}}}} }t        | |g|      \  }}t        j                  | |f|      }	|	g}
| j                  j
                  t        j                  k7  xr7 |j                  j                         xs |j                  j                          xs t        |j                  j                        xs t        j                  dk(  xs t        j                   j#                  |j                  t$        j&                         xs | j                  j
                  t        j                  k(  xr. t        j(                  j*                  j,                  j.                  xsP | j                  j
                  t        j0                  k(  xr' |j                  j
                  t        j2                  k(  }t        j                  dk(  rg }
|sd| j5                  dd      }|rct        j                  dk(  rPg }
t7        ||||
| |||      }|)t9        j:                  |
f| |f|dt=        ||||||       |
j?                  |	       tA        |       xs tA        |      }tC        ||||	      D ]+  }t9        j:                  |
f| |f|dt=        ||||||       - |rH|rFtE        ||||      r8tG        jH                  |
|| |gd
d
       tK        jH                  |
|| |gd
d
       |r|
s|	g}
d}| |g}t        jL                  jN                  jQ                  |      rItS        | |||||
||tU               tV        
      }|s't        j                  dk(  r||
jY                  d|       t[        ||
||      S )NrL   atenr   ztl.ztorch. 	heuristicrR   )has_int8_tensorTr   mixed_mmr   ).r*   r\   aten_fallback_mixed_mmr[   rM   rQ   r8   float32r   is_transposedr   rP   r   rh   mixed_mm_choicer   r   has_featurer   TRITON_TEMPLATESbackendsr   matmul
allow_tf32bfloat16r:   replacer   r]   r^   r-   rd   r=   r)   r"   r   r_   r   rb   r   rc   rf   r   r   r   r   )rH   rI   r   rm   rn   ro   rM   rr   rs   r   rq   skip_tritonb_prologue_cast_typer   r   r0   rS   rv   s                     r<   tuned_mixed_mmr	    s\   ")$T"BAq!VT414,GL*%**D$<@HjG
 KK. Q[[..0ODKK4M4M4OP
	V !!4!45	
	V
 **f4
	V ww""6==.2Q2QRR
	V KK.X5>>3F3F3M3M3X3X
	V KK/TDKK4E4E4T  &&(2!$ZL199(BGO;;{JG"1aGT4VTF!//!%t! !Aq&:NO	 NN8$&t,BT0B&q!QP 	F++!4L VQ1f6JK		 
';FAq!'L66VdD\Dt	
 	66VdD\Dt	
 7*D,K//5!!$
 //;>"NN1f%$T7KHHr>   c                   |2t        j                  |j                         t         j                        n|}t	        | ||||      \  }}}}} }}g }t        |||      D ]S  }	t        j                  |f| ||f|dt        t        |	||||      d      dt        j                  j                  d U t        d|| ||g|      S )Nr   rR   ztl.int32)ACC_TYPEr%   )suffix_argsr   r   )r8   promote_typesr7   r   r*   r(   r]   r^   dictr-   r   r   mulr   )
rH   rI   mat3r   rM   rm   rn   ro   rq   r   s
             r<   tuned_fused_int_mm_mulr    s      	DNN,ekk: 
 )0dD9)%Aq!VT4 %'G!!Q* 
''	
tT*	
 :faAv6L		

 			

 %XwtT8JFSSr>   )NN)a	functoolsloggingtypingr   r   r   r   r8   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r	   r
   r   r   r   )torch._inductor.codegen.cpp_gemm_templater   torch._inductor.virtualizedr   r   r   rh   codegen.commonr   codegen.cuda.gemm_templater   r   'codegen.rocm.ck_universal_gemm_templater   codegen.wrapperr   irr   r   loweringr   select_algorithmr   r   r   r   utilsr   r   r    r!   r"   r#   r$   	mm_commonr&   r'   r(   r)   r*   r+   r,   r-   r.   	getLogger__name__rj   r   r   r]   r/   rZ   rF   defaultr   _int_mmr   _sparse_semi_structured_mmr   r=   rJ   r   rw   r\   r   r   r   r   r   	lru_cacher   boolr   r   r   rf   r   r   r	  r  r   r>   r<   <module>r)     sN     , ,  T  L ) ( + U D , * (   
 
 
 g!yy~~		>BH UXX|
4	KKdjj.@.@
 "%--?"4	$$$# 8 (,11 I %Z6 4775#' _E 6_ED!* 4<<T:'+ (R ;(RV 4::48*+!D O- 9O-d 422M(,T) N)X8 ,,=tD  T# 4  
5+r  ;- C=;-|.QIp CG Tr>   