
    sg             	      8   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d d	l m!Z!m"Z"m#Z# d d
l$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z; ddl<m=Z=m>Z> ddl?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZG ddlHmIZJmKZKmLZLmMZMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[ ddl\m]Z]m^Z^m_Z_m`Z`maZambZb ddlcmdZdmeZemfZfmgZg erddl/mhZh  ej                  ej      Zkej                  j                  ejd      Znej                  j                  ejd      Zoej                  j                  ejd      Zp ed      d         Zq ed      d!        Zre*j                  e*j                  e*j                  fD  ci c]  } |  ej                  e(|     d"d#d#$        c} Zwe*j                  e*j                  e*j                  fD  ci c],  } |  ej                  e(|    j                          d%d#d#&      . c} Zyej                   G d' d(             Z{ej                   G d) d*             Z|dCd+Z} G d, d-eX      Z~ e~       j                  Zd. Zd/ Zd0 Zd1 Z G d2 d3eT      Z G d4 d5eW      Zej                  d6       dDd7Z G d8 d9e      ZdEd:Z G d; d<      Zej                   G d= d>             Z G d? d@ea      Z G dA dBeb      Zyc c} w c c} w )F    )annotationsN)	lru_cache)
AnyCallablecastDictIterableListOptionalTupleTYPE_CHECKINGUnion)preserve_rng_state)AutotuneHintDeviceProperties)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configir)	code_hashget_pathPyCodeCache)is_metric_table_enabledlog_kernel_metadata)benchmarker)ReductionHintTRITON_MAX_BLOCK)get_max_y_gridnext_power_of_2)cache_on_selfget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholder	sympy_dot
sympy_subs)_ops
OpsHandlerReductionType	StoreModeV)"get_kernel_category_by_source_code   )
BackendFeatureCSECSEVariableDeferredLineIndentedBufferOpOverridesPythonPrinterSizeArg	TensorArgWorkspaceArg)constant_reprIterationRangesEntryIterationRangesRootpexpr
SIMDKernelSIMDScheduling)	config_ofshould_unwrap_unspec_argsignature_ofsignature_to_meta)IRNode
perf_hintsschedulefusionc                 d    t               syddl} t        | j                  j                  d      ryy)zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    Q/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/codegen/triton.pygen_attr_descriptor_importrZ   \   s+     #v''):;E    c                     t               } | j                  d       t               x}r| j                  |       | j                  d       | j	                         S )NzD
        import triton
        import triton.language as tl
        a,  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties
        )r>   splicerZ   	writelinegetvalue)imports	attr_descs     rY   gen_common_triton_importsrb   m   s[    GNN	 /00y0)$NN	 r[   offsetT)integernonnegativeBLOCK)rd   positivec                  h    e Zd ZU ded<   ded<   ded<   ded<   ded	<   d
ed<   d Zd Zd Zd Zd Zy)IndexingOptionsstr	index_strOrderedSet[str]	mask_varsmask_strzOptional[str]
expand_strbool_has_rindex
sympy.Exprindexc                ,    t        | j                        S N)rp   rm   selfs    rY   has_maskzIndexingOptions.has_mask   s    DNN##r[   c                J    t        | j                  t        j                        S ru   )r   rs   r   TMPrv   s    rY   has_indirectzIndexingOptions.has_indirect   s    "4::txx88r[   c                    | j                   S ru   )rq   rv   s    rY   
has_rindexzIndexingOptions.has_rindex   s    r[   c                    d| j                   v S )Ntmprn   rv   s    rY   has_tmpmaskzIndexingOptions.has_tmpmask   s    %%r[   c                    d| j                   v S )Nrmaskr   rv   s    rY   	has_rmaskzIndexingOptions.has_rmask   s    $--''r[   N)	__name__
__module____qualname____annotations__rx   r{   r}   r   r    r[   rY   ri   ri      s=    NM$9 &(r[   ri   c                      e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   edd       Zedd       Zedd       Zedd       Ze		 	 	 	 	 	 	 	 	 	 dd       Z
ddZdddZedd       Zd Zd Zd dZd Zd Zd Zy)!BlockPtrOptionsBlockParametersparamsrr   constant_offset	List[int]orderrl   rm   	List[str]reshape_suffixc                .    | j                   j                  S ru   )r   shaperv   s    rY   r   zBlockPtrOptions.shape   s    {{   r[   c                .    | j                   j                  S ru   )r   block_shaperv   s    rY   r   zBlockPtrOptions.block_shape   s    {{&&&r[   c                .    | j                   j                  S ru   )r   stridesrv   s    rY   r   zBlockPtrOptions.strides       {{"""r[   c                .    | j                   j                  S ru   )r   offsetsrv   s    rY   r   zBlockPtrOptions.offsets   r   r[   c                   |D cg c]  }|j                   j                          d! }}t        |      t        | j                        k(  | j                  D cg c]  }|dk(  	 c}t	              D ]  \  }}|s	sd||<    t
        j                  j                  r%|d   j                   dk(  sJ |j                  d       t
        j                  j                  smt        | j                        t        t
        j                  j                        dz
  k(  r1t
        j                  j                  d   dk7  r|j                  d       fd}	t        d
i t        j                  |       j                         D 
ci c]  \  }
}|
 |	|       c}}
} dd} || j                         | _         || j                        | _        t#        | t
        j$                  j&                  j)                  |      t+        t-        t/        t        | j                                           ||	      S c c}w c c}w c c}}
w )z,Helper to create a  BlockPtrOptions instancerf   r   1xr9   c                    t        |       t              k(  sJ t        |       D cg c]  \  }}|rs| c}}S c c}}w )z3Removes any broadcasting dims from a given sequence)lenzip)ititemis_broadcastingbroadcasting_dimdrop_broadcastss      rY   filterz&BlockPtrOptions.create.<locals>.filter   sN    r7c"23333 .15E-F)D/&o   s   >c                z    | D cg c]+  }t         j                  j                  j                  |      - c}S c c}w ru   )r7   graphsizevarslookup_precomputed_size)exprsexprs     rY   lookup_sizez+BlockPtrOptions.create.<locals>.lookup_size   s+    OTUtAGG$$<<TBUUUs   08)r   r   r   rm   r   r   )r   zIterable[sympy.Expr]returnList[sympy.Expr])prefixupperr   r   	enumerater7   kernelno_x_dimpopinside_reductionnumelsappendr   dataclassesasdictitemsr   r   r   r   r   listreversedrange)r   r   range_treesrm   tr   sir   r   keyvalr   r   r   s                @@rY   createzBlockPtrOptions.create   s    ?JJQXX^^-.e4JJ n-V^^1DD,2NN;qAF;"+,<"= 	(A?$'q!	(
 88q>((C///q! ))FNN#s188??';a'??#q( !!#&	 ! 
0;0B0B60J0P0P0RSHCsF3KS
	V #6<<0$V^^4GG,,DD_Uxc&,,&7 89:)
 	
[ K <: Ts   $H2H7H<c                L    t         t        j                     }t        |||i      S )zH
        Replaces instances of roffset with the new expression.
        )block_offsetsr   RINDEXr2   )rw   r   replacementroffsets       rY   replace_roffsetzBlockPtrOptions.replace_roffset   s$      ,$+ 677r[   c           	        t         j                  j                  }g | j                  }|s2|D cg c]'  }| j	                  |t        j                  d            ) }}| j                  dk7  r| d || j                         dn|d || j                         d || j                         d || j                         d || j                         d ||       g}d	d
j                  |       dS c c}w )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should roffset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        r    + ()zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(, )r7   r   index_to_strr   r   sympyIntegerr   r   r   r   r   join)rw   namer   fr   rc   argss          rY   formatzBlockPtrOptions.format  s    HH!!!DLL/MTCI$$VU]]1-=>G 
 ##q( fD4//013Qtzz]O$q'(1T--./0Qtzz]O$qzl#	
 $DIIdO#4A66s   ,C6c           
        t         j                  j                  }t        j	                         D ci c]$  \  }}|t
        t        |   j                            & }}}t        t        | j                              D cg c]  }|j                  | j                  |   t        j                  d            s|j                  | j                  |   | j                   |         sy|j                  | j                  |   t#        | j                   |   |            sCt         j$                  j&                  r'| j                   |   t        t(        j*                     k(  s| c}S c c}}w c c}w )z6List of indices to pass to tl.load(boundary_check=...)r   )r7   r   r   block_sizesr   r(   r   r   r   r   r   statically_known_equalsr   r   r   statically_known_multiple_ofr   r2   r   r   r   XBLOCK)rw   r   symt
block_sizeblock_to_maxidxs         rY   boundary_checkzBlockPtrOptions.boundary_check!  s3    77## %0$5$5$7/
 j (D)9)?)?)ABB/
 /
 S_-
44LL%u}}Q'7 !==JJsOT%5%5c%: !==JJsOZ0@0@0E|%T HH%%((-T[[1II 
 	
/


s   )E>CE%c           
        t         t        j                     }| j                  D cg c]:  }| j	                  ||      | j	                  |t        j                  d            z
  < }}t        j                  j                  |      S c c}w )af  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace roffset with multiples of RBLOCK.
        Since we expect roffset to vary in range(0, rnumel, RBLOCK), the first
        iteration has roffset=0, while the second has roffset=RBLOCK.
        r   )
r   r   r   r   r   r   r   r7   r   r   )rw   rblockrc   advances       rY   advance_roffsetzBlockPtrOptions.advance_roffsetA  s     T[[) ,,

  $$VV4&&vu}}Q/?@A
 
 xx$$W--
s   ?Bc                     yNFr   rv   s    rY   r{   zBlockPtrOptions.has_indirectT      r[   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |t        j                           y wru   )r   r   r   ).0r   s     rY   	<genexpr>z-BlockPtrOptions.has_rindex.<locals>.<genexpr>X  s     Wd&tT[[9Ws   $&)anyr   rv   s    rY   r}   zBlockPtrOptions.has_rindexW  s    WdFVFVWWWr[   c                "    | j                         S ru   )r}   rv   s    rY   r   zBlockPtrOptions.has_rmaskZ  s      r[   c                     yr   r   rv   s    rY   r   zBlockPtrOptions.has_tmpmask]  r   r[   c                4    t        | j                               S ru   )rp   r   rv   s    rY   rx   zBlockPtrOptions.has_mask`  s    D'')**r[   N)r   r   )
r   r   r   rr   r   zList[IterationRangesEntry]rm   rl   r   r   )r   rr   r   rr   r   rr   )T)r   rj   r   rj   )r   r   r   rp   )r   r   r   r   propertyr   r   r   r   staticmethodr   r   r   r+   r   r   r{   r}   r   r   rx   r   r[   rY   r   r      s    ! ! ' ' # # # # ;
;
 $;
 0	;

 #;
 
;
 ;
z87: 
 
>.&X!+r[   r   c                   t        |t              rt        |t              sJ ||k(  r| S |D cg c]
  }|dk7  s	| c}|k7  rd|  ddj                  |       dS d}g }|D ]G  }|t        |      k  r|||   k(  r|j	                  d       |dz  }0|dk(  sJ |j	                  d	       I |t        |      k(  sJ |  d
dj                  |       dS c c}w )z7Workaround https://github.com/openai/triton/issues/2836r   ztl.reshape(z, [r   z])r   :r9   None[])
isinstancer   r   r   r   )value	old_shape	new_shaper   r   expandsizes          rY   triton_reshaper  d  s    i&:i+FFFI)aS)Y6UG3tyy';&<B??
CF "YDIcN$:MM#1HC3;;MM&!" #i.   WAdii'(** *s
   
CCc                      e Zd Zd Zd Zd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd ZeZd ZddZd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zy)TritonPrinterc                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )Nr9   libdevice.trunc(r   ).to(r   r   r   _printr7   r   index_dtyperw   r   s     rY   _print_TruncToIntzTritonPrinter._print_TruncToInt~  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r[   c                    t        |j                        dk(  sJ | j                  | j                  |j                  d                dS )Nr9   r   z.to(tl.float64))r   r   parenr  r
  s     rY   _print_ToFloatzTritonPrinter._print_ToFloat  s@    499~"""**T[[1678HHr[   c                    |j                   \  }}| j                  |      }| j                  |      }|j                  r1|j                  r%| j                  |       d| j                  |       S d| d| dS )N % z!triton_helpers.remainder_integer(r   r   )r   r  is_nonnegativer  rw   r   quotdivquot_sdiv_ss         rY   _print_PythonModzTritonPrinter._print_PythonMod  sv    II	cT"C 3#5#5jj()TZZ->,?@@26("UG1EEr[   c                   |j                   sJ |j                  \  }}| j                  |      }| j                  |      }|j                  r3|j                  r'd| j	                  |       d| j	                  |       dS d| d| dS )N( // r   z!triton_helpers.div_floor_integer(z,  )
is_integerr   r  r  r  r  s         rY   _print_FloorDivzTritonPrinter._print_FloorDiv  s    II	cT"C 3#5#5tzz&)*$tzz%/@.ACC26(#eWAFFr[   c                    |j                   \  }}| j                  | j                  |             d| j                  | j                  |             S )Nz / )r   r  r  )rw   r   lhsrhss       rY   _print_IntTrueDivzTritonPrinter._print_IntTrueDiv  sF    99S**T[[-./s4::dkk#>N3O2PQQr[   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS Nr9   libdevice.floor(r   r  r   r  r
  s     rY   _print_floorzTritonPrinter._print_floor  r  r[   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r#  r  r
  s     rY   _print_FloorToIntzTritonPrinter._print_FloorToInt  r  r[   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS Nr9   libdevice.ceil(r   r  r   r  r
  s     rY   _print_ceilingzTritonPrinter._print_ceiling  K    499~""" TYYq\!: ;5AUAU@VVWXXr[   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r)  r  r
  s     rY   _print_CeilToIntzTritonPrinter._print_CeilToInt  r,  r[   c                ,    d| j                  |       dS )Nlibdevice.sqrt(z.to(tl.float32)))r  r
  s     rY   _helper_sqrtzTritonPrinter._helper_sqrt  s     T!2 33CDDr[   c                    d| j                  |j                  d          d| j                  |j                  d          dS )Nlibdevice.pow(r   r   r9   r   )r  r   r
  s     rY   _print_FloatPowzTritonPrinter._print_FloatPow  s?    T[[167r$++diiPQl:S9TTUV	
r[   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   r9   r   	tl.where(r   r   )doprintr   )rw   r   cpqs        rY   _print_WherezTritonPrinter._print_Where  s_    LL1&LL1&LL1&1#Rs"QCq))r[   c                   t        |j                        }t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }t        |      }| j                   ||j                  d|        }| j                   ||j                  |d        }t	        d ||fD              \  }}|dv sJ d| d       d	| d
| d| d| d| d
| d| d| dS )zK
        Helper for max/min code genereration.
        cmp: > or <
        r9   r   r   Nc              3  (   K   | ]
  }d | d  yw)r  r   Nr   r   r   s     rY   r   z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s     .!q1X.s   )><zUnexpected comparator: ''r  z * ( z= z) + )))r   r   r  typetuple)rw   r   cmpnargsmidclsabs           rY   _print_min_max_helperz#TritonPrinter._print_min_max_helper  s    
 DIItyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .1v..1j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr[   c                &    | j                  |d      S )Nr@  rL  r
  s     rY   
_print_MinzTritonPrinter._print_Min      ))$44r[   c                &    | j                  |d      S )Nr?  rN  r
  s     rY   
_print_MaxzTritonPrinter._print_Max  rP  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   tl_math.abs(r   r   r   r   r  r
  s     rY   
_print_AbszTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.cos((r   ).to(tl.float32))rU  r
  s     rY   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.cosh((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.acos((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r]  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.sin((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  rZ  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.sinh((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r]  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.asin((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r]  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.tan((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  rZ  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.tanh((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r]  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.atan((r   rX  rU  r
  s     rY   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r]  r[   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr9   zlibdevice.llrint(r   r   rU  r
  s     rY   _print_RoundToIntzTritonPrinter._print_RoundToInt  s9    499~""""4;;tyy|#<"=Q??r[   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      d| d| j	                  | j                  |             d|  S )Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .zlibdevice.nearbyint(1e * z) * 1e)r   r   r  
ValueErrorr  r  )rw   r   numberndigitss       rY   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  (yDJJt{{6?R4S3TTZ\c[cZdeer[   N)r   rr   rF  rj   r   rj   )r   r   r   r  r  r  r  r!  r%  r'  r+  r.  r1  r4  _print_PowByNaturalr;  rL  rO  rR  rV  rY  r\  r_  ra  rc  re  rg  ri  rk  rm  rt  r   r[   rY   r  r  }  s    
IFGR

YYE

 **C(55;NOONOONOO@	fr[   r  c                    t        |       j                  d      d   }|dk(  rd}n@|dv rt        j                  j                  rd}n|dk(  rd}n|d	k(  rd
}n|dk(  rd}n|dk(  rd}d| S )Nro  r   rp   int1)float16bfloat16float32float8_e4m3fn
float8e4nvfloat8_e5m2float8e5float8_e4m3fnuz
float8e4b8float8_e5m2fnuzfloat8e5b16tl.)rj   splitr   rX   codegen_upcast_to_fp32dtypetriton_type_names     rY   triton_compute_typer    s    5z'',R06!!33MM00 %	_	,'	]	*%	.	.'	.	.(!"##r[   c                *   t        | d      r| j                  r\| t        j                  t        j                  fv rt
        j                  j                  ryt        j                  |       j                  S t        j                  |       j                  S y)Nis_floating_point    r   )rV   r  torchry  rx  r   rX   r  finfobitsiinfo)r  s    rY   _get_primitive_bitwidthr  /  sj    u)*"" %..%--88MM88;;u%***;;u%***r[   c                t    t        |       j                  d      d   }|dk(  rd}n|dk(  rd}n|dk(  rd}d	| S )
Nro  r   rp   int8r{  r|  r}  r~  r  )rj   r  r  s     rY   triton_store_typer  ?  sU    5z'',R06!!	_	,'	]	*%!"##r[   c                ~    t        |       r(| j                  r| t        j                  k(  rdnd}d| S t	        |       S )N@   r  ztl.int)r   	is_signedr  int64r  )r  nbitss     rY   triton_acc_typer  J  s:    5??u{{*wu%%r[   c                  &     e Zd Zd fdZd Z xZS )TritonCSEVariablec                D    t         |   ||       t               | _        y ru   )super__init__r   rm   )rw   r   bounds	__class__s      rY   r  zTritonCSEVariable.__init__R  s    v&*4,r[   c                0   |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sT|j                  d   dv sf| j                  j                  |j                  d    dh        y )Nr   xyrmask)r   r  rm   updater   Symbolr   )rw   r   r   kwargsargs        rY   update_on_argsz TritonCSEVariable.update_on_argsW  su     	>C#01%%cmm4C.388A;%3G %%#((1+d';&<=	>r[   )r  zValueRanges[Any]r   r   )r   r   r   r  r  __classcell__r  s   @rY   r  r  Q  s    7
>r[   r  c                     e Zd ZdZe	 	 dP	 	 	 dQd       ZedRd       Zed        Zed        Z	ed        Z
ed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zedej.                  dddd       Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed         Z"ed!        Z#ed"        Z$ed#        Z%ed$        Z&ed%        Z'ed&        Z(ed'        Z)ed(        Z*ed)        Z+ed*        Z,ed+        Z-ed,        Z.ed-        Z/ed.        Z0ed/        Z1ed0        Z2ed1        Z3ed2        Z4ed3        Z5ed4        Z6ed5        Z7ed6        Z8ed7        Z9ed8        Z:ed9        Z;ed:        Z<ed;        Z=ed<        Z>ed=        Z?ed>        Z@ed?        ZAed@        ZBedA        ZCedB        ZDedC        ZEedD        ZFedE        ZGedF        ZHedG        ZIedH        ZJedI        ZKedJ        ZLedK        ZMedL        ZNedM        ZOedN        ZPedO        ZQy)STritonOverrideszMap element-wise ops to TritonNTc                :   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r|  dS |rt        |      }nt        |      }|  d| dS )Nc                   | |k(  ryt         j                  t         j                  f}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!   r   )r  r{  r}  )	src_dtype	dst_dtype
fp8_dtypess      rY   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_threadl  s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3Sr[   r  z != 0)z.to(tl.int8).to(tl.uint8).to(r   )r  torch.dtyper  r  r   int)	maxr7   r   min_elem_per_threadr  rp   uint8r  r  )r   r  r  use_compute_typesr  	out_dtypes         rY   to_dtypezTritonOverrides.to_dtypee  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk! S122+E2I)%0ID1%%r[   c                   t        |      }|t        j                  t        j                  fv rt        j
                  j                  rvt        |      j                  d      d   }|  d| d}|t        j                  t        j                  fv r"t        |      j                  d      d   }d| }| d| d}| dS t        |      }t        |      }||k(  rd	nd
}	|  d| d|	 dS )Nro  r   z.to(tl.r   r  r  z, bitcast=True).to(tl.float32)TrueFalsez
, bitcast=)
r  r  rx  ry  r   rX   r  rj   r  r  )
r   r  r  triton_dtypetriton_src_dtypecast_xr  src_dtype_bitwidthtarget_dtype_bitwidthbitcasts
             rY   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast  s    *51 %--8844"9~33C8<s'"2!315F77#&u:#3#3C#8#< !$%5$67xtL>AFX_--!8!C$;E$B! 26K KfQXGS\N*WIQ??r[   c                    t         j                  j                  |      }t         ||             }t	        |      }|dk(  r|S d| d| d| dS )Nz
tl.float32tl.full(r   r   )r  _prims_commondtype_to_typerD   r  )r   r  r   type_
triton_valtriton_types         rY   _shaped_constantz TritonOverrides._shaped_constant  s[    ##11%8"5<0
)%0,& %:,bQ??r[   c                *    | j                  ||g       S )Nr   )r  )rI  r   r  s      rY   constantzTritonOverrides.constant  s    ##E5#;;r[   c                    d|  dS )NrT  r   r   r   s    rY   abszTritonOverrides.abs      aS""r[   c                    d|  dS )Nzlibdevice.abs(r   r   r  s    rY   libdevice_abszTritonOverrides.libdevice_abs      s!$$r[   c                    d|  dS )Nztl_math.exp(r   r   r  s    rY   expzTritonOverrides.exp  r  r[   c                    d|  dS )Nzlibdevice.exp(r   r   r  s    rY   libdevice_expzTritonOverrides.libdevice_exp  r  r[   c                    d|  dS )Nzlibdevice.exp2(r   r   r  s    rY   exp2zTritonOverrides.exp2       1%%r[   c                    d|  dS )Nzlibdevice.expm1(r   r   r  s    rY   expm1zTritonOverrides.expm1      !!A&&r[   c                    d|  dS Nr0  r   r   r  s    rY   sqrtzTritonOverrides.sqrt  r  r[   c                    d|  dS r  r   r  s    rY   libdevice_sqrtzTritonOverrides.libdevice_sqrt  r  r[   c                   t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |8t        j                  t        j
                  d	t        j                        |       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r   accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   rX   inject_relu_bug_TESTING_ONLYopsmaximumr  r  int32AssertionError)r   bugs     rY   reluzTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r[   c                    d|  d| dS )Nztriton_helpers.minimum(r   r   r   rJ  rK  s     rY   minimumzTritonOverrides.minimum      (2aS22r[   c                    d|  d| dS )Nztriton_helpers.maximum(r   r   r   r  s     rY   r  zTritonOverrides.maximum  r  r[   c                    d|  d| d| dS )Nr6  r   r   r   )rJ  rK  r8  s      rY   wherezTritonOverrides.where  s    1#Rs"QCq))r[   r9   )constraintsr  is_purepackc                    t        |      }dj                  |D cg c]  }t        |       c}      }|#dj                  dg|D 	cg c]  }	d c}	z         }d|  d| d| d| d| d	| d
S c c}w c c}	w )Nr   z=rrztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r   )r  r   rj   )
asmr  r  r  r  inputsr  r   
input_refs_s
             rY   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-Bac-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A.	A3
c                    d|  dS )Nztl_math.cos(r   r   r  s    rY   coszTritonOverrides.cos  r  r[   c                    d|  dS )Nzlibdevice.cos(r   r   r  s    rY   libdevice_coszTritonOverrides.libdevice_cos  r  r[   c                    d|  dS )Nztl_math.sin(r   r   r  s    rY   sinzTritonOverrides.sin  r  r[   c                    d|  dS )Nzlibdevice.sin(r   r   r  s    rY   libdevice_sinzTritonOverrides.libdevice_sin  r  r[   c                    t        d      )Nz/ops.index_expr not implemented outside a kernelNotImplementedError)rI  r   r  s      rY   
index_exprzTritonOverrides.index_expr!  s    !"STTr[   c                    t        d      )Nz+ops.masked not implemented outside a kernelr  )r  bodyothers      rY   maskedzTritonOverrides.masked%  s    !"OPPr[   c                    d|  dS )Nzlibdevice.lgamma(r   r   r  s    rY   lgammazTritonOverrides.lgamma)      "1#Q''r[   c                    d|  dS )Nzlibdevice.erf(r   r   r  s    rY   erfzTritonOverrides.erf-  r  r[   c                    d|  dS )Nzlibdevice.cosh(r   r   r  s    rY   coshzTritonOverrides.cosh1  r  r[   c                    d|  dS )Nzlibdevice.sinh(r   r   r  s    rY   sinhzTritonOverrides.sinh5  r  r[   c                    d|  dS )Nzlibdevice.acos(r   r   r  s    rY   acoszTritonOverrides.acos9  r  r[   c                    d|  dS )Nzlibdevice.acosh(r   r   r  s    rY   acoshzTritonOverrides.acosh=  r  r[   c                    d|  dS )Nzlibdevice.asin(r   r   r  s    rY   asinzTritonOverrides.asinA  r  r[   c                    d|  dS )Nzlibdevice.asinh(r   r   r  s    rY   asinhzTritonOverrides.asinhE  r  r[   c                    d|  d| dS )Nzlibdevice.atan2(r   r   r   r   ys     rY   atan2zTritonOverrides.atan2I      !!Bqc++r[   c                    d|  dS )Nzlibdevice.atan(r   r   r  s    rY   atanzTritonOverrides.atanM  r  r[   c                    d|  dS )Nzlibdevice.atanh(r   r   r  s    rY   atanhzTritonOverrides.atanhQ  r  r[   c                    d|  d| dS )Nzlibdevice.copysign(r   r   r   r(  s     rY   copysignzTritonOverrides.copysignU  s    $QCr!A..r[   c                    d|  dS )Nzlibdevice.erfc(r   r   r  s    rY   erfczTritonOverrides.erfcY  r  r[   c                    d|  dS )Nzlibdevice.erfinv(r   r   r  s    rY   erfinvzTritonOverrides.erfinv]  r  r[   c                    d|  d| dS )Nzlibdevice.hypot(r   r   r   r(  s     rY   hypotzTritonOverrides.hypota  r+  r[   c                    d|  dS )Nzlibdevice.log10(r   r   r  s    rY   log10zTritonOverrides.log10e  r  r[   c                    d|  dS )Nzlibdevice.log2(r   r   r  s    rY   log2zTritonOverrides.log2i  r  r[   c                    d|  d| dS )Nzlibdevice.nextafter(r   r   r   r(  s     rY   	nextafterzTritonOverrides.nextafterm  s    %aS1#Q//r[   c                    |  d| S N & r   r  s     rY   logical_andzTritonOverrides.logical_andq      Cs|r[   c                    |  dS )Nz == 0r   rJ  s    rY   logical_notzTritonOverrides.logical_notu  s    E{r[   c                    |  d| S Nz | r   r  s     rY   
logical_orzTritonOverrides.logical_ory  rB  r[   c                    d|  d| dS )Nr   ^ r   r   r  s     rY   logical_xorzTritonOverrides.logical_xor}  s    1#S1~r[   c                    |  d| S r?  r   r  s     rY   bitwise_andzTritonOverrides.bitwise_and  rB  r[   c                    d|  S )N~r   rD  s    rY   bitwise_notzTritonOverrides.bitwise_not  s    1#wr[   c                    |  d| S rG  r   r  s     rY   
bitwise_orzTritonOverrides.bitwise_or  rB  r[   c                    |  d| S )NrJ  r   r  s     rY   bitwise_xorzTritonOverrides.bitwise_xor  rB  r[   c                    |  d| S )Nz << r   r  s     rY   bitwise_left_shiftz"TritonOverrides.bitwise_left_shift      D}r[   c                    |  d| S )Nz >> r   r  s     rY   bitwise_right_shiftz#TritonOverrides.bitwise_right_shift  rW  r[   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r   r   r   seedrc   s     rY   randzTritonOverrides.rand  s%    VHO,$r&++r[   c                     d| d}d|  d| dS )Nr  r[  z	tl.randn(r   r   r   r\  s     rY   randnzTritonOverrides.randn  s%    VHO,4&6(!,,r[   c           	     ,    d| d}d|  d| d| d| d	S )Nr  r[  ztriton_helpers.randint64(r   r   r   )r]  rc   lowhighs       rY   	randint64zTritonOverrides.randint64  s1    VHO,*4&6("SED6KKr[   c                    t        d      )Nz.ops.load_seed not implemented outside a kernelr  )r   rc   s     rY   	load_seedzTritonOverrides.load_seed  s    !"RSSr[   c                    d|  dS )Nzlibdevice.rsqrt(r   r   r  s    rY   rsqrtzTritonOverrides.rsqrt  r  r[   c                    d|  dS )Nzlibdevice.log1p(r   r   r  s    rY   log1pzTritonOverrides.log1p  r  r[   c                    d|  dS )Nzlibdevice.tan(r   r   r  s    rY   tanzTritonOverrides.tan  r  r[   c                    d|  dS )Nzlibdevice.tanh(r   r   r  s    rY   tanhzTritonOverrides.tanh  r  r[   c                    d|  dS )Nztl.sigmoid(r   r   r  s    rY   sigmoidzTritonOverrides.sigmoid  s    QCq!!r[   c                    d|  d|  d|  dS )Nzlibdevice.signbit(z) if (z).dtype is tl.float32 else z < 0r   r  s    rY   signbitzTritonOverrides.signbit  s      $A3fQC/J1#TRRr[   c                    d|  d| dS )Nzlibdevice.fmod(r   r   r   r  s     rY   fmodzTritonOverrides.fmod  s     2aS**r[   c                    d|  d| dS )Nr3  r   r   r   r  s     rY   powzTritonOverrides.pow  s    s"QCq))r[   c                    d|  dS )Nztl_math.log(r   r   r  s    rY   logzTritonOverrides.log  r  r[   c                    d|  dS )Nzlibdevice.log(r   r   r  s    rY   libdevice_logzTritonOverrides.libdevice_log  r  r[   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   r  s    rY   isinfzTritonOverrides.isinf      !!M22r[   c                    d|  dS )Nzlibdevice.isnan(r|  r   r  s    rY   isnanzTritonOverrides.isnan  r~  r[   c                    d|  dS )Nzlibdevice.nearbyint(r   r   r  s    rY   roundzTritonOverrides.round  s    %aS**r[   c                    d|  dS )Nr$  r   r   r  s    rY   floorzTritonOverrides.floor  r  r[   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r   r   )rJ  rK  r  rems       rY   floordivzTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr[   c                f   t        j                  dt        j                        }t        j                  t        j
                  ||       t        j                        }t        j                  t        j
                  | |      t        j                        }t        j                  ||      }| d|  dS )Nr   r  .dtype))r  r  r  r  r  ltr  sub)r   zleftrightr  s        rY   signzTritonOverrides.sign  su    LLEKK(||SVVAq\EJJ7cffQlUZZ8ggdE"d1#W%%r[   c                    d|  dS )Nr  r   r   r  s    rY   trunczTritonOverrides.trunc  r  r[   c                    |  d| S )Nr  r   r  s     rY   truncdivzTritonOverrides.truncdiv  s     D}r[   c                    d|  dS )Nr*  r   r   r  s    rY   ceilzTritonOverrides.ceil   r  r[   )NT)r  r  r  zOptional[torch.dtype])r  r  r  r  )Rr   r   r   __doc__r   r  r  r  classmethodr  r  r  r  r  r  r  r  r  r  r  r  r  r  rz  r  r  r	  r  r  r  r  r  r  r  r  r   r"  r$  r&  r*  r-  r/  r1  r3  r5  r7  r9  r;  r=  rA  rE  rH  rK  rM  rP  rR  rT  rV  rY  r^  r`  rd  rf  rh  rj  rl  rn  rp  rr  rt  rv  rx  rz  r}  r  r  r  r  r  r  r  r  r   r[   rY   r  r  b  s   ( ,0	6&6& )6& 6&p @ @0 @ @ < < # # % % # # % % & & ' ' & & & &  " 3 3 3 3 * * "&emmTPQK K # # % % # # % % U U Q Q ( ( % % & & & & & & ' ' & & ' ' , , & & ' ' / / & & ( ( , , ' ' & & 0 0                     , , - - L L T T ' ' ' ' % % & & " " S S + + * * # # % % 3 3 3 3 + + ' ' e e & & ' '  
 & &r[   r  rX   c                    | S ru   r   hs    rY   _typecheck_TritonOverridesr  	      Hr[   c                  `    e Zd ZdZed        Zed        Zed        Zed        Z	ed        Z
y)TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                r    t         j                  j                         }dg|z  }| j                  |||      S )Nr9   r  )r7   r   triton_tensor_ndimr  )rI  r   r  ndimr   s        rY   r  zTritonKernelOverrides.constant  s9    
 xx**,d
##E5#>>r[   c                $   t         j                  j                  |d      }t        |t              sJ t         j                  j
                  j                  t         j                  j                  |j                  t        |            }|t        j                  t        j                  fvrRt         j                  j
                  j                  t         j                  j                  | j                  ||            }|j                  |_        |S )NF	block_ptrr  )r7   r   indexingr   ri   csegeneratecomputerk   r,   r  r  r  r  rm   )rI  r   r  r  vars        rY   r  z TritonKernelOverrides.index_expr  s    88$$TU$;(O444hhll##HHh009Nt9T $ 
 ekk22((,,''(8(8#,,sE:RSC **
r[   c           
        | _t         j                  j                  Et        j                  j
                  j                  t        j                  j                  |  d      } |j                  j                  d      }|sJ d       d}|D ]=  }|j                  D ],  }|j                  dk7  st        |j                  d         s+d}. ? |rd n|}t        j                  j                  | |	      5 } |       }	d d d        |r	j                  j                  rt!        |      }t        j                  j
                  j                  t        j                  j                  d
|	 dt#        |       d|	 dt%        j&                  |            }t)        j*                  |	|      }
n	}
|
j,                  j/                         |
S # 1 sw Y   xY w)N.to(tl.int1)output)opz)graph for body does not contain an outputFloadr   T)r   r  z.shape, r   r  r  )r  versionhipr7   r   r  r  r  r   
find_nodesr   targetrK   
mask_loadsr  is_boolrp   rD   r   wrapr  r  rm   discard)r  r  r  nodes
need_wherenoder  r   new_maskresultrets              rY   r  zTritonKernelOverrides.masked+  s    1 1 =88<<((  &%D
 

%%%2AAAu
 	&Dyy &::'+CCHHQK+P!%J&	&
 #XX  U 3 	xVF	 }}$$UHHLL))  6((=+?*@6('R"''. * E
 ))Hfe4CCh'
%	 	s   2GGc                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load(z + load_seed_offsetr   )r7   r   r   inputseed_offset)r   rc   r  s      rY   rf  zTritonKernelOverrides.load_seedQ  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r[   c                   d|  d}|t         j                  j                  j                  v r't         j                  j                  j                  |   S t         j                  j                  j	                         }t         j                  j                  j	                         }t         j                  j
                  j                  | d| d|  d       ||ft         j                  j                  j                  |<   ||fS )Nzfrexp(r   r   z = triton_helpers.frexp()r7   r   r  cachenewvarr  r^   )r   	cache_keymantissaexponents       rY   frexpzTritonKernelOverrides.frexpX  s    QCqM	***88<<%%i0088<<&&(88<<&&(	""j8*$<QCqA	
 *28(<9%(##r[   N)r   r   r   r  r  r  r  r   r  rf  r  r   r[   rY   r  r    sl     ? ? 
 
 # #J 
 
 $ $r[   r  c                    | S ru   r   r  s    rY    _typecheck_TritonKernelOverridesr  h  r  r[   c                  H    e Zd ZU dZded<   ded<   ddZdddd	Zd
 Zd Zy)HelperFunctionsz#An ordered set of helper functions.zDict[str, str]_templates_seenr   finalized_helpersc                     i | _         g | _        y ru   )r  r  rv   s    rY   r  zHelperFunctions.__init__r  s    !!#r[   _triton_helper_fn	base_namec                   | j                   j                  |      }||S | t        | j                         }|| j                   |<   | j                  j	                  |j                  |             |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        )r   )r  getr   r  r   r   )rw   template_coder  existing_namer   s        rY   addzHelperFunctions.addv  sw     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr[   c                ,    t        | j                        S ru   )iterr  rv   s    rY   __iter__zHelperFunctions.__iter__  s    D**++r[   c                     | j                   |   S ru   )r  )rw   r   s     rY   __getitem__zHelperFunctions.__getitem__  s    %%c**r[   N)r   r   )r  rj   r   rj   )	r   r   r   r  r   r  r  r  r  r   r[   rY   r  r  l  s+    -##  $ 4G ,,+r[   r  c                      e Zd ZU dZ ej
                  e      Zded<    ej
                  e      Z	ded<    ej
                  e      Z
ded<    ej
                  e      Zded<   d
dZy	)r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr   r   r   r   r   c                    t        |       }t        d | |fD              \  }} |di |D ci c]  }|||   ||   z    c}S c c}w )z0
        Concatenates block parameters.
        c              3  F   K   | ]  }t        j                  |        y wru   )r   r   r>  s     rY   r   z*BlockParameters.__add__.<locals>.<genexpr>  s     Bq[''*Bs   !r   )rD  rE  )rw   r  rI  rJ  rK  r   s         rY   __add__zBlockParameters.__add__  sR     4jBT5MBB19a8sc1S6AcF?*8998s   AN)r  r   r   r   )r   r   r   r  r   fieldr   r   r   r   r   r   r  r   r[   rY   r   r     sn     0k//EEE$5K$5$5d$KK!K 1 1 1$ GGG 1 1 1$ GGG:r[   r   c                  L    e Zd ZU eZded<   eZded<   dZdde	j                  dddd	 	 	 	 	 d7 fd	Zd8d
Zd9dZd9dZd:dZd Zd Zd;dZd Zed<d       Zddddd	 d=dZ	 d>	 	 	 	 	 	 	 d?dZd>dZ	 	 	 	 	 	 	 	 d@dZd ZdAdZ	 dB	 	 	 	 	 	 	 	 	 dCdZ	 	 	 	 	 	 	 	 	 	 	 	 dDdZd Z	 	 	 	 	 	 	 	 	 	 dEdZdFdZ d<d Z!	 	 	 	 	 	 	 	 dGd!Z"	 	 	 	 	 	 	 	 	 	 dHd"Z#d# Z$dBd$Z%d% Z&d& Z'e(d'        Z)dBd(Z*d) Z+d* Z,d+ Z-d, Z.dBdId-Z/d. Z0d/ Z1dJd0Z2d1 Z3d2 Z4d3 Z5dKd4Z6d5 Z7d6 Z8 xZ9S )LTritonKernelr  helper_functionszCallable[[sympy.Expr], str]kexprTNr   )	mutations	pid_cachereduction_hintr  override_persistent_reductionoptimize_maskc          	         || _         t        	|   ||||||d t               | _        t               | _        || _        t        j                         | _
        t               | _        t               | _        d | _        | j                          y )N)r	  r  r  r  r  )r  r  r  r>   suffixr   outside_loop_varsr  	itertoolscountblock_ptr_idr  r  autotune_hintstriton_metacodegen_range_tree)
rw   r	  r  r  r  r  r  r  groupsr  s
            rY   r  zTritonKernel.__init__  s     $1#)*G	
 '5&62<,#6 %OO- / 1 9C8<!r[   c                z    t        j                         D ci c]  \  }}||
 }}}||j                     S c c}}w ru   )r   r   r   )rw   treer   r   prefix_to_symts        rY   	_get_symtzTritonKernel._get_symt  s;    ;E;K;K;MN<4&$,NNdkk** Os   7c                2    t         | j                  |         S ru   )r   r  rw   r  s     rY   _get_block_sizezTritonKernel._get_block_size  s    4>>$/00r[   c                2    t         | j                  |         S ru   )r   r  r  s     rY   _get_block_offsetzTritonKernel._get_block_offset  s    T^^D122r[   c                D    t         |j                  j                            S ru   )r(   r   r   r  s     rY   _max_block_sizezTritonKernel._max_block_size  s     1 1 344r[   c                :   | j                   D ]+  }|j                  r| j                  || j                         - | j                  rU| j                   d   j                  r;| j                  j                  d| j                  | j                   d                 y y y )Nr   zrbase = )r   is_loopiteration_ranges_codegen_headerr  r   r^   iteration_ranges_ranges_coder  s     rY   r  zTritonKernel.codegen_range_tree  s    $$ 	FD<<44T499E	F   T%5%5b%9%A%A II4<<T=M=Mb=QRST &B r[   c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   rv   s    rY   need_numel_argszTritonKernel.need_numel_args  s     r[   c                X   | j                   rt        j                  j                  syt        j
                  dij                  | j                  d      }t        j                  j                  r|dz  }| j                  d   }t        j                  j                  j                  ||      S )z^
        Heuristic to set self.persistent_reduction and add guards
        if needed.
        Fi   r     r   )r   r   rX   persistent_reductionsr'   INNERr  r  multi_kernelr   r7   r   r   statically_known_leq)rw   	threshold
last_numels      rY   should_use_persistent_reductionz,TritonKernel.should_use_persistent_reduction  s    
 %%&--*M*M

#d!!2
& 	 ==%%OI[[_
ww44ZKKr[   c                    | j                   t        j                  k(  xr_ | j                  xrQ t	        | j
                        dk(  xr7 t        j                  j                  j                  | j
                  d   d      S )Nr   r      )
r  r'   r  persistent_reductionr   r   r7   r   r   statically_known_geqrv   s    rY   want_no_x_dimzTritonKernel.want_no_x_dim  sk    =#6#66 L))LDKK A%L   55dkk"osK		
r[   c                     y)Nztl.device_assertr   rv   s    rY   assert_functionzTritonKernel.assert_function  s    !r[   F)
copy_shapedense_indexingoverride_maskr  c          
          j                        j                  }d}t               |D ]l  }t        |t        j
                        sJ |xs t        |t        j                        }|rAt        |t        j                        r? j                  j                  |j                     }	j                  |	j                         t        |t        j                  t        j                   t        j"                  t        j$                  t        j&                  t        j(                  f      rt        |t        j                  t        j*                  t        j,                  f      sJ |j                         j/                  |j                  d    d       o t0        j2                  j4                  xs |xs  j6                  duxr dk7  }
d}d}t               } j9                         D ]@  }|j;                  |j<                        rd}nd}|j/                  |j>                   d       B |r j@                  rt0        j2                  jB                  rz|sx j6                  sltE        |z
        dk(  r[ jG                        sJ|rH jH                  dk(  r9	 	 	 	 	 	 d fd	 	 	 	 	 	 d fd	 	 	 	 	 	 dfd	d fd
} |       }||S d} jK                        }t        t        jL                        r8|r| dn jO                         }d| d| d}tQ        |t               d||      S |
r%|s#|r| dn jO                         }d| d| d}|n|s|rd| d| d}||rt        |g       j6                  rj/                   j6                          jS                         r(djU                  tW        tY        tZ                          nd}tQ        ||||      S )zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fr   r  NTtl.int32c                    |j                         }t        j                  d|g      }| j                  ||z        }|yt	        |j
                  gj                  |      g||   gj                  |      g      S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                strideexcludeNr   r   r   r   )symbolr   Wildmatchr   numelr  r   )rs   
range_treer#  r  mrw   s        rY   match_strided_blockz2TritonKernel.indexing.<locals>.match_strided_block[  s     $**,Hvh?KK09&%++,!%!5!5j!A BvYK!33J?@	 r[   c                b   t        dt        j                        | j                  t              | j                  t
              z         }|j                         }t        j                  t        j                  |g      }t        |      D cg c]  } |d|        }}t        |      D cg c]  } |d|        }}dd} ||d|       }	t	        ||	d         gt        |dd |	dd       D 
cg c]  \  }
}t        |||
       c}}
z   }t        ||      }| j                  |      y|dd D ]  }
|
vst        j                  d      |
<   ! |dd D ]  }|vst        j                  d      |<   ! t         j"                  j$                  dfd	}|d   g|dd D 
cg c]
  }
 ||
       c}
z   }|D cg c]
  } ||       }} ||      }	|D cg c]  }t'        |       }}|d   vsJ d
       j)                  |j*                  |	d         sy|j*                  |	d   z  |d<   j-                  |      t/        fd|	D              rydd}j1                  |      }t3        ||	d         gt        |	dd |dd       D 
cg c]%  \  }}
t        j4                  t3        ||      |
      ' c}
}z   }|D cg c]  }t'        ||j7                  |      i      ! }}t9        ||||      S c c}w c c}w c c}}
w c c}
w c c}w c c}w c c}
}w c c}w )a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                r   r   dim_mod
stride_modc                    t        j                  d      g}| ddd   D ]  }||d   z  }|j                  d|        |S )z
                    Compute the cumulative size of each dimension's slice.
                    This proceeds from the last dim up to the second.
                    r9   Nr   r   )r   r   insert)dimsr   dimr&  s       rY   get_slice_numelszLTritonKernel.indexing.<locals>.match_mod_div_block.<locals>.get_slice_numels  sP    
 $mmA./F#EQrE{ 0 #fQia/0 "Mr[   Nr   r9   c                ,    j                  |          S ru   )r   )r   r%  r   s    rY   	get_matchzETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.get_match  s    #;;E$KHHr[   z,Expected not to match the leading dimension!c              3  l   K   | ]+  }j                  |       xr j                  |        - y wru   )r   statically_known_power_of_2)r   r&  	max_blockr   s     rY   r   zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>  sH        !==eYOO H$@@GGHs   14c                    | S ru   r   )r   s    rY   identityzDTritonKernel.indexing.<locals>.match_mod_div_block.<locals>.identity  s    Kr[   r"  )r/  	List[Any]r   r9  )r   rr   r   rr   )r  r   range_tree_nodesr  r   r   r#  	functoolspartialr   r$  r   r   r1   r%  r   r7   r   r   r2   r   r&  r  r   r  r   Minr   r   )rs   r'  num_dims	index_varwildr   r/  r   r1  slice_numelsr0  r&  block_index_exprs
match_exprr  r3  r   r8  linear_block_sizer   r   r%  r6  r   rw   s                        @@@rY   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_blockq  s   . --.[[*U[[-II '--/	 ((i[I5:8_*.1D73%)* * 9>h-14D:cU+,- -	"  0Yh@%-ia%I$J&)$qr(L4D&EN"U $Iuc:N %! 'w0AB
 J/=  8 6C%'%*]]1%5c
6 &abk 9FU*(-a(8f9 77++I Qyd12h#GsIcN#GG;BC9V,CC/58I%04JtU+%! % G5(BAB(<<$$l1o  $**\!_<Q !00<	  ". 
    %)$8$8$D!-|A?1 '*,qr*:DH&E"s IIg&7?E1 !23 ti1G1G
1S%TU3 3
 ' +#)	 U*-$N6 $HC%X3s0   L2L9L1L	L&L!*L&$L,c                6    fD ]  } || |      }||c S  y)ze
                Match a block indexing subexpression involving a single range tree.
                Nr   )r   r'  
match_funcr%  rE  r)  s       rY   match_block_pointer_subexprz:TritonKernel.indexing.<locals>.match_block_pointer_subexpr  s:     ('# %J 'tZ8E($% r[   c            	       
 t        j                  j                         D  ci c]  \  } }| |j                   c}}       }j	                  d      }|D ch c]  }|j                          c}
t        j                  j                  |      }t               }|D ]w  }|j                         t        j                  d      t        fd|D              z   }t        
j                  |j                              dkD  r y  ||      }| y ||z  }y t        
fd|D              }	j                         t         j#                  ||	|      S c c}} w c c}w )NT)reorderr   c              3  @   K   | ]  }|j                   v s|  y wru   )free_symbols)r   r   r#  s     rY   r   zETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>!  s#      5!%$BSBS8S5s   r9   c              3  X   K   | ]!  }j                  |j                        s| # y wru   )intersectionrL  )r   r   range_symbolss     rY   r   zETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>1  s-      (55d6G6GH s   '*)r   r   r   rm   )r2   r:  r   r   active_range_treesr#  r   Add	make_argsr   r   sumr   rN  rL  filter_masksr   r   )vr   index_relative_to_xyr_indexr   r  index_termsblock_paramssubexprr   rc   rO  r#  rs   rm   rH  rw   s             @@rY   match_block_pointerz2TritonKernel.indexing.<locals>.match_block_pointer  sd   .8$2G2G2M2M2OP$!QAqvvIP/+ #55d5C <G G4 G#ii112MN.0' +D "[[]F#mmA. 5)45 2 G =55g6J6JKLqP# 9$GF~# F*L%+*   +  !!),&--'$* +'	 .  K Q
 !Hs   EEz.shaper  r   z, tl.int32)r   tl.broadcast_to(r   .shape)r@  )rs   rr   r'  rE   r   Optional[BlockParameters])r   rr   r'  rE   r   r]  )r   zOptional[BlockPtrOptions]).prepare_indexingrL  r   r   r   r  r   r   r   rz   r  varname_mapr   r  rm   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   YBLOCKr  r   rX   r  
_load_maskrP  rN  var_listr   allow_block_ptruse_block_ptrr   is_indirect_indexingr	  r   r   dense_size_strri   rT  r   sortedmaprj   )rw   rs   r  r  r  r  
index_varsr}   r  cse_var
need_dense
have_densehave_loop_varsdense_mask_varsr  rZ  optionsro   rk   rn   rm   rH  rE  r)  s   ``                  @@@@rY   r  zTritonKernel.indexing  s    %%e,''

%/\	 	4Cc5<<000#G~c4;;'GJTXX.((..sxx8  !2!23%%II))JJJJ''
  &$++t{{DKK@ 88  !T235	4: MM(( ++d* qj	 	 
+5<++- 	6D&&t}}5!%"
4;;-t 45	6 $$++!OOI/0A5--e4  J.!/C*,O!O/CO*Ob .B* , ,^ *+G"
%%e,	eU]]+2<J<v.$BUBUBWJ":,b;GI":<Z  j2<J<v.$BUBUBWJ*9+R
|1EI'IJ*9+R
|7KI'I"M?3I??MM$//*)$>G5::fSi%89:Vy)Xz:W\]]r[   c                   d }|j                         }|sd}n|r|dk(  sJ d|d}nd|}| j                  r| j                  d   j                  r|j	                         rwdt        | j                         }| j                  j                  t        || d|j                  |d	                    t        || d
| d|j                          d      }n|j                  |      }|||fS )NrS   , other=0.0, boundary_check=z, padding_option='zero'r   r   = F)r   z = tl.advance(r   r   )r   r   r   r  r}   nextr  r  r^   r=   r   r   )rw   r   r  r  r  advance_block_ptrcheckr  s           rY   codegen_block_ptrzTritonKernel.codegen_block_ptra  s    !'')EM)))'y0GHE'y1E!!  $,,##%#D):):$;#<=IIIYKs8??3?+N*OP
 !-+^I;b9Q9Q9S8TTUV!
 !,I+U22r[   c                \   d| d| j                  |j                         d}|j                  D cg c]!  }t        j                  j                  |      # }}t        ||j                  |      }| dt        t        j                  j                  |             d}d| d| | dS c c}w )Nr[  r   r   r  	tl.store()	r   r   r   r7   r   r  r  r   	get_dtype)rw   r   r  r  r   r  r   r   s           rY   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_line  s     ugR(9(9(:Q:Q(R'SSTU 	 @H?S?STtqxx,,T2TTuh&=&={K'/0A0A$0GHIK9+RwugQ77	 Us   &B)c                   |s|sy t        |t        j                        sJ | j                  |d      }t        |t              sJ |j
                  }|j                         r|j                  nd }|rt        | j                  |            nd }| j                  ||rdnd ||      }	| j                  |      xs t        d |j                  D              }
| j                  |      }| j                  j!                  ||	d       y )NFr  0c              3  <   K   | ]  }t        |t                y wru   r   r  )r   r(  s     rY   r   z,TritonKernel.check_bounds.<locals>.<genexpr>  s      :
12Jq+,:
   )
assignment)r   r   Exprr  ri   rk   rx   rn   texprrename_indexingindirect_assertrk  r   rm   get_load_bufferr  r  )rw   r   r   lowerr   r  rk   rn   size_strlineindirectbuffers               rY   check_boundszTritonKernel.check_bounds  s     $

+++===7(O444&&	(0(9(9(;8$$8=5--d344 ##esx
 ,,T2 
c :
6>6H6H:
 7
 %%h/&$59r[   c                    |j                         s|j                         r| j                  S | j                  r5| j                  d   j
                  r|j                         s| j                  S | j                  S )Nr   )	r{   r   r  r   r   r  r}   r  loads)rw   r  s     rY   r  zTritonKernel.get_load_buffer  sb      "h&:&:&<<<!!  $,,'') 99::r[   c           	     z   | j                   j                  |      }| j                  |      }|}| j                  |d      }|j	                         }|j                         }t        d | j                  |      j                         D              }	| j                  |      rd}
n|	sd}
n| j                  r| j                  d   j                  r|| j                   j                  v r-t        | j                   j                  |   j                        }nt        |g      }t!        || j"                  z        dkD  }| xr |xs |}|rd}
nd}
nd}
|s|r8|j%                         r(| j&                  rd	t)        | j&                         }nd
}nd}d }d }t+        |      r|}nRt-        |t.              r\| j1                  ||||      \  }}}d| | |
 d}|j2                  D cg c]  }t5        |       }}t7        |||j8                        }nTt-        |t:        j<                        rd| d| d}|j>                  }n$d| d|j@                   d|jB                   |
 | d	}tD        jF                  jI                  |      }|tJ        jL                  tJ        jN                  fv rtP        jR                  jT                  r|dz  }|tJ        jV                  k(  rtJ        jX                  jZ                  |dz  }| j]                  |      }| j^                  ja                  ||      }t-        |tb              sJ |jd                  |_2        |r%d| d| d}| j^                  ja                  ||      }|r|jg                  |       | j                  r|ji                         s|s| jj                  jm                  |       |S c c}w )NTr  c              3  &   K   | ]	  }|d k(    yw)r9   Nr   )r   r   s     rY   r   z$TritonKernel.load.<locals>.<genexpr>  s      
AF
   z, eviction_policy='evict_last'r   r   z, eviction_policy='evict_first'rS   z, other=rw  r  r   r   rC  r  r  r  r[  r   )7r   r  rk  r  r}   r   r   get_strides_of_loadvaluesis_broadcastedr   r   r  inplace_buffersr   other_namesr   
last_usagerx   _load_otherrD   rK   r   r   r}  r   rj   r  r   r   r   ro   rk   rn   r7   r   r  r  rx  ry  r   rX   r  rp   r  r  r  r  r  r  rm   r^   r   r  r  )rw   r   rs   r  indirect_indexingoriginal_indexr  r}   r   is_coalescedepnameslast_use
evict_lastr  r{  append_broadcastr  r  r0  r   r  load_buffer
result_vars                           rY   r  zTritonKernel.load  s   iiood# 55e<==$=7((*
**,  
 44^DKKM
 
 ~.1B1B""t'7'7';'C'Ctyy000)3II--d3??* #D6*54??23a7H%K:+J9JJ56B:8+<+<+>"=1A1A#B"CD%E #D)D(O46:6L6L#x73	,e ")UGB4q93;3G3GHCs3xHH%dK9P9PQNEMM:!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X__`aGG%%d+E%--88MM88))

"u}}'8'8'@ &**84XX&&{D9
*&7888'11
%j\4D3EQGD**;=J!!"34$$X-?-?-A*""&&z2G Is   !N8c           	        | j                   j                  |      }|}| j                  |d|d u       }|| j                   j                  v }| j	                  |      }	|r'|	r%| j
                  j                  t        |d             d }
t        |t              r-| j                  |||      \  }}
}| j                  |||||      }n]|$d| d|j                   d| d|j                   d	}n7|d	k(  r$d
| d|j                   d| d|j                   d	}nt        d|       | j
                  j                  t        ||             |
r| j
                  j                  |
       | j                  s| j                   j#                  |       y y )NT)r  r  ztl.debug_barrier()r  r   r  r   r   
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)r   r  r  r  r  storesr^   r=   r   r   r}  r  rk   rn   r  r   r  r  )rw   r   rs   r   moder  r  r  
is_inplacer  r{  r  r  r  s                 rY   storezTritonKernel.store  s    iit$==ttt|=T TYY666
,,^<.KK!!,t5I"JK h0262H2Hc83/I(% 44h	5%D \se4(:(:';3ugRHYHYGZZ[\D\!#C5X-?-?,@E7"XM^M^L__opD%D6&:;;l467KK!!"34$$""&&u- %r[   c                   | j                   j                  t        j                         | j                  j                  |      }| j                         }| j                  |      }|t        j                  k(  rd}	n!|t        j                  k(  rd}	nt        d      | j                  j                  | j                  d| d| d|	 d| d| d| d      }
|
S )z3
        See [Note: Inductor bucketize op]
        r  tl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r   r   )r  r  r   ELEMENTS_PER_WARP_32r   r  rl  r   r  r  r  r  r  r  r  )rw   r  offsets_nameoffsets_sizeindexing_dtyper  offsets_ptrr   offsets_size_strr  r  s              rY   	bucketizezTritonKernel.bucketizeB  s      	 A ABiiool3((*
,,\:U[[(%Lu{{*%L%G  ""LL5fXR}B|n\^_d^eeghxgyy{  }G  |H  HI  J

 r[   c                |    | j                         }|dk(  rd| dS dg|z  }d|d<   | ddj                  |       d	S )
Nr9   z!triton_helpers.promote_to_tensor(r   r   r   r   r   r   r   )r  r   )rw   r   ndimssizess       rY   reduction_resizezTritonKernel.reduction_resizeh  sW    '')A:6ugQ??b	$))E*+1--r[   c                    !"#$  j                   sJ t        d  j                  D              } j                  |       t	        |      } j
                  r|j                   j
                          j                  d   j                  } j                         ! j                  ! fd|      }" fd}"# fd}||f}	|	 j                  j                  v r j                  j                  |	   S  j                         dz
  "t        |      }
 j                  j                         }t        d |D              |_        dj!                  |        fd	$ j"                  rt$        j&                  j)                  |      } j                  t*        |      } $fd
}t-        |t.              r&t1        ||      D cg c]  \  }} |||       }}}n	 |||      }dv rUt3         j                  j5                   j6                  d| d| d            }ddd   # | j6                  |||       ndk(  r j9                  ||      }n߉dk(  rl|\  }}}d| d| d| d" d	} fdt;        d      D        \  }}} j6                  j=                  | d| d| d|        t/         fd|||fD              }nn j                  j5                   j6                   ||            }n@d| }t$        j&                  j?                  |      } j                  t*        |      }t-        |t.              s5 j@                  j=                  | d j                          d| d|
 d       dv rd| d}tC        jD                  tB        jF                        jH                  } j@                  j=                  | d j                          d| d       ddd   # j6                  jK                  d| d | d!# d"| d| d| d| d#| d $| d$|       d%| d $| d$|       d%        | jL                  |||       ntO              r!| d&}| d'}| d(} j@                  j=                  | d) j                          d|
 d        j@                  j=                  | d) j                          d|
 d        j@                  j=                  | d) j                          d|
 d       dk(  r>|\  }}} j6                  jK                  d*| d | d | d+| d| d| d,| d| d| d-       n8dk(  sJ  j6                  jK                  d*| d | d | d.| d| d| d| d/        j6                  jK                  d| d $| d$|       d%| d $| d$|       d%| d $| d$|       d%       |} j                  j                         } j                  j                         } jL                  jK                  d| d0| d0| d1| d| d| d" d2| d jQ                  | d3       d%| d jQ                  | d3       d%| d jQ                  | d3       d%       |||f}nt%        jR                  |      } |||      } j6                  j=                  | d $||              |tB        jT                  k(  r;| d4}tW        |      } jL                  j=                  | d ||       d5| d       n& jL                  j=                  | d ||              | j                  j                  |	<   t-        |t.              r4tY        d6 |D              sJ  xjZ                  t        |      z  c_-        |S t-        |t\              sJ  jZ                  j_                  |       |S c c}}w )7Nc              3  :   K   | ]  }|j                    d   ywr  Nr   r   r  s     rY   r   z)TritonKernel.reduction.<locals>.<genexpr>y       MDdkk]$/M   r   c                ^    j                   j                  j                  d|  d d      S )Nr[  r   r   r  r  r  )rU  rl  rw   s    rY   <lambda>z(TritonKernel.reduction.<locals>.<lambda>  s0    dhh'' 02n5EQG r[   c           
         dv }|rdnd}dv rj                  | d d|  d d      S j                  | d d	|  d d      S )
N>   r   r  minprodtriton_helperstl>   r  r  ro  z2(r   r   r  )r  )r   
use_helpermoduler0  reduction_typerw   s      rY   final_reductionz/TritonKernel.reduction.<locals>.final_reduction  s~    '+HHJ)3%F/,,ha/r%3%qA  ((F81^4DAeWBseST)UVVr[   c                x    | j                  d| d d| d| d d| dj                  | d       d       y )	Nz                _, z_tmp = triton_helpers.z_with_index(r   z)
                ry  _tmp
                )r]   r  )r  r  r   rs   r0  root_oprw   s       rY   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce  sg    MM<5gYl5'QSTYSZZ\]`\a bC 5 5D6I JK Lr[   r9   c              3  2   K   | ]  }|d    dk7  s|  yw)r   r   Nr   )r   r  s     rY   r   z)TritonKernel.reduction.<locals>.<genexpr>  s     )P##a&C-#)Ps   r@  c                :    s| S t         j                  | |      S ru   )r  r  )tvalfvalconds     rY   
where_condz*TritonKernel.reduction.<locals>.where_cond  s     (..tT4@@r[   c                ^    j                   j                  j                   | |            S ru   r  )r   defaultrw   r  s     rY   _mask_valuez+TritonKernel.reduction.<locals>._mask_value  s%    xx((z%7QRRr[   >   argmaxargminr[  zindex, r\  r  r  )r  r  welford_reducewelford_combineztriton_helpers.welford(r   r   c              3  R   K   | ]  }j                   j                            y wru   )r  r  )r   r  rw   s     rY   r   z)TritonKernel.reduction.<locals>.<genexpr>  s     #H!DHHOO$5#Hs   $'r   ry  c              3     K   | ]9  }j                   j                  j                  j                  |             ; y wru   )r  r  r  r  )r   var_namerw   s     rY   r   z)TritonKernel.reduction.<locals>.<genexpr>  s9      #  HH%%dllD4I4I(4ST#s   ?Ar   = tl.full(_indexz, tl.int64)z                z_next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                _nextr  _mean_m2_weightz = tl.zeros(z                    z@_next = triton_helpers.welford_combine(
                        z,
                        z+
                    )
                    z?_next = triton_helpers.welford_reduce(
                        z9, roffset == 0
                    )
                    z_tmp, z3_tmp = triton_helpers.welford(
                    z#
                )
                r  z.to(tl.int8)r  c              3  <   K   | ]  }t        |t                y wru   r  r>  s     rY   r   z)TritonKernel.reduction.<locals>.<genexpr>I  s     LAz!%67Lr  )0r   r   r   rT  rm  rg  r   r   rl  _map_tuple_or_scalarr  reduction_cacher  r  r  rm   r   r  r    	Reductiondefault_valuerD   r   rE  r   rj   r  r  welford_reduce_fallbackr   r^   default_accumulatorr  r  r  r  r  r]   r  r/   r  get_reduction_combine_fnrp   r  allr  r  r  )%rw   r  r  r  r   masksreduction_range_prefixr  r  r  acc_typer  r  r  rU  dmasked_valueaccumulator_indexmeanm2weightwelfordaccumulatorlong_maxaccumulator_m2accumulator_weightresult_mean	result_m2result_weight
combine_fnupdatedresult_typer  rl  r0  r  r  s%   `  `                            @@@@@rY   	reductionzTritonKernel.reductionq  s    $$$$MD<L<LMM% u??LL)!%!1!1"!5!<!< ,,.)) 	
	W	 6	00088++I66%%'!+"9-((//+
))P)PP
zz% 	A
 $$ll00KG//wGGS %'>A%>QRdaAq 1RR*5':!55$'HH%%*+A*B',W^_%! &+e<^LLL*l<M  #33 "99%G
#44#/ b&3D6B4r&C5PQR#HuQx#H b&&&$r"Rxs7)'LM" #%)2v$6# 

 "XX..LL/,"?
 j\*Kll66~yQG//wGGgu-		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:! ;;u{{377		##()T5H5H5J4K2hZWbc &+e<^L##W%6$77Nwi X M$5#6brBXAY ZS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  ZFWX%n5!+E2$.<s!3(2|7%;"		##"m<0C0C0E/Fb
RST 		##%&l43F3F3H2IH:UVW 		##)*,t7J7J7L6MRPXzYZ[ "%66',$D"fLL'' M(8@R?S T$R'7r:L9M NbBvh / *-====LL'' M(8@R?S Tr+b0@CUBV W ##S{m5,A;!O P Q J.1A/G$X#Y Z#$C
6H5I3OQc(d'e f ) HHOO-	 $ 1""VI;f]O D MN#326H5IC5 QS!6!6+d7K!L M N3t44	{$5GHI Js4#8#8M?$9O#P"Q R	 ))]B
88S
$[%8&&"m3z';'G&HI 

* &1M">K"5e"<KKK))%,c/+*F)GtK=XYZ KK))%,c/+*F)GH /9  +j%(LLLLL""j&<<"
  j*;<<<""&&z2m  Ss   _c                   | j                   sJ d| _         | j                  |d      }d| _         | j                  j                  |      }t	        |t
              rY| j                  j                  t        || j                  |||j                  |      |d|j                                            y t	        |t              sJ | j                  j                  t        |d| d|j                   d| d|j                   d		             y )
NFTr  rx  r  r   r  r   r   )r   r  r   r  r   r   r  r^   r=   r  r   r   ri   rk   rn   )rw   r   rs   r   r  r  s         rY   store_reductionzTritonKernel.store_reductionQ  s   $$$$ %==$=7 $iit$h0KK!!55  ,+H,C,C,E+HI	 h888KK!!uD););(<CwbIZIZH[[\]r[   c           	       	
 t               		j                  d       t        d      D cg c]  t        fdt        |      D              ! }}dj	                  t
        j                  j                  |            }	j                  d| d       t        dd      t        t        j                               d	
 G 	
fd
d      }	j                         5  t        j                   |             5   || }dj	                  d |D              }	j                  d|        d d d        d d d        | j                  j                  	j!                         
      S c c}w # 1 sw Y   AxY w# 1 sw Y   ExY w)Nz@triton.jitr   c              3  .   K   | ]  }d  d|   yw)r  r  Nr   )r   nr   s     rY   r   z,TritonKernel._lift_helper.<locals>.<genexpr>s  s     =A3asm=s   r   zdef {name}():rS   )r   r  r  c                       e Zd Zd fdZy)+TritonKernel._lift_helper.<locals>.CSEProxyc                    fd}|S )Nc                 X    d z  j                   t              | i |      S )Nr  )r  getattr)r   r  r  helperhelper_namer   	overridess     rY   innerzFTritonKernel._lift_helper.<locals>.CSEProxy.__getattr__.<locals>.inner  s=    Qtf:-K<<0	40$A&A r[   r   )rw   r   r  r  r  r  r  s    ` rY   __getattr__z7TritonKernel._lift_helper.<locals>.CSEProxy.__getattr__  s      r[   N)r   rj   r   zCallable[..., CSEVariable])r   r   r   r  )r  r  r  r  s   rY   CSEProxyr    s    	 	r[   r  c              3  2   K   | ]  }t        |        y wru   )rj   )r   r  s     rY   r   z,TritonKernel._lift_helper.<locals>.<genexpr>  s     BFB   return r  )r>   r^   r   rE  r   r  chainfrom_iterabler;   r  r7   MockHandlerindentset_ops_handlerr  r  r_   )rw   fnnum_argsr   r   	signaturer  outputsr  r  r  r  s      `    @@@@rY   _lift_helperzTritonKernel._lift_helpern  sC     !'GLQxP!=U8_==PPIIioo;;DAB	=267B'#AMMO4	 *
	 
	 ]]_ 	2a//
; 	2$iGiiB'BBGwwi01	2 	2
 $$(():k(RR= Q2	2 	2 	2 	2s)   $E+#E<>2E00E<0E9	5E<<Fc                     j                   sJ t        d  j                  D              } j                  |       t	        |      } j
                  rJ d        j                  d   j                  }g }g }t        j                   j                  j                   j                        } j                  |t        |            }	 j                         dz
  }
t        ||      D ]X  \  }}t!        |      }dj#                  |      } j                  j                   j                  | dt%        |       d      } j                  j                   j                  d| d	 j'                          d      }|j)                  |       t!        |      }dj#                  |      } j*                  r͉ j                  j-                         } j/                         }d
|d<   dd	j#                  |       d}|j0                  rdnd} j2                  j5                  | d| d	| d	| d       |j)                  |       [ d  fd} |d |       d|
 d	|	 dt        |      |      } j*                  s|D cg c]  } |d| d       }} |t7        |      t7        |            } |t7        |      |      }t        ||      D cg c]  \  }} |d| d	| d       }}}t        |||      D ]*  \  }}} j                  j5                  | d| d	| d       , n|}|D ]	  }||_         t7        |      S c c}w c c}}w )Nc              3  :   K   | ]  }|j                    d   ywr  r  r  s     rY   r   z$TritonKernel.scan.<locals>.<genexpr>  r  r  z(ops.scan not supported inside ops.maskedr   r9   r@  r  r   r[  r   r   r   r   zfloat('nan')z-1r  c                2    dj                  d | D              S )NrB  c              3  &   K   | ]	  }| d   yw,Nr   r   r   s     rY   r   z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>       <EugQK<r  r   r  s    rY   csvzTritonKernel.scan.<locals>.csv      88<V<<<r[   c                   t        |      D cg c]  }|  d| d|  }}t        
fd|D              r'|D cg c]  }
j                  j                  |    c}S t        |      D cg c]  }
j                  j	                          }}
j
                  j                   	|       d|         t        ||      D ]'  \  }}|r||_        |
j                  j                  |<   ) t        |      S c c}w c c}w c c}w )Nr   c              3  N   K   | ]  }|j                   j                  v   y wru   r  r  r   r  rw   s     rY   r   z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>       K99.K   "%ry  
r   r  r  r  r  r  r^   r   rm   rE  r  r  r  r   
cache_keysr  r  result_varsr  r0  rw   s            rY   cse_multiplez'TritonKernel.scan.<locals>.cse_multiple      ;@8DaTF"QCr%1DJDK
KKCMNiy1NN6;Ah?488??,?K?LL""{#$Cv. *-[*)E 7%
I+0J(,6y)7 %% EN?   C4 C9*!C>ztl.associative_scan((r  ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )r   r   r   rT  rm  rg  r   r;  r<  r  r  r  r%  r   r  r   r  r   r  rl  r   r  r  dense_size_listr  r  r^   rE  rm   ) rw   dtypesr  r  r  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnr0  r   r  r  r  value_dtyper  reduced_sizer  r<  partial_scan_varspartial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr;  acc_nextpartial_reducer  r0  s    `                              @rY   scanzTritonKernel.scan  s    $$$$MD<L<LMM% u??N$NN"!%!1!1"!5!<!<''(9(94<<H --j#f+F%%'!+/ 	1LE5&u-H::e$D((++'1%89;K HH%%";-r$2E2E2G1HJE %%e,&u-H::e$D,,"hhoo/#335#&R !"499\#:";1=,1,C,C.		##"m;|nBwir(STU ##K09	1<	=	& )#C(:$;#<CuBGXFYYZ[K
 (( ):	# % 12B1CCtu# # #5#6>Q8RSI'l(;=NON 03>CT/U+I| 4YKr,qQRK  :=<)<: 5+~ &&"m#<XJbHXXYZ ,K% 	)J#(J 	) [!!1#s   ?MMc                     j                   sJ t        d  j                  D              } j                  |       t	        |      } j
                  rJ d        j                  sJ d        j                  d   j                  }t        j                   j                  j                   j                        } j                         dz
  }|D 	cg c]  }	 |d|	 d j                          d      ! }
}	d	  fd
} j                  d   j                  dk(  sJ  j                   j                  d         rdnd}t!        |      dk(  r/d|
d    d|
d    d| d| d| d| d} ||t!        |      |      }nt#        d      t%        ||      D ]  \  }}||_        |j(                  |_         t+        |      S c c}	w )Nc              3  :   K   | ]  }|j                    d   ywr  r  r  s     rY   r   z$TritonKernel.sort.<locals>.<genexpr>	  r  r  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsr   r9   r[  r   r   c                2    dj                  d | D              S )NrB  c              3  &   K   | ]	  }| d   ywr*  r   r,  s     rY   r   z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>	  r-  r  r.  r/  s    rY   r0  zTritonKernel.sort.<locals>.csv	  r1  r[   c                   t        |      D cg c]  }|  d| d|  }}t        
fd|D              r'|D cg c]  }
j                  j                  |    c}S t        |      D cg c]  }
j                  j	                          }}
j
                  j                   	|       d|         t        ||      D ]'  \  }}|r||_        |
j                  j                  |<   ) t        |      S c c}w c c}w c c}w )Nr   c              3  N   K   | ]  }|j                   j                  v   y wru   r4  r5  s     rY   r   z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>	  r6  r7  ry  r8  r9  s            rY   r<  z'TritonKernel.sort.<locals>.cse_multiple	  r=  r>  r   r   rnumelr   ztriton_helpers.sort_with_index(r   z	, stable=z, descending=zUnhandled sort)r   r   r   rT  rm  rg  r  r   r;  r<  r  r  r  r  rl  _has_constant_maskr   r  r   rm   r  rE  )rw   r@  r  stable
descendingr  r  rC  r0  r   rA  r<  rW  r  r;  r  	input_varr0  s   `                @rY   sortzTritonKernel.sort  s    $$$$MD<L<LMM% u??N$NN"%%	A@	A%!%!1!1"!5!<!<''(9(94<<H%%'!+  
 *5'D4G4G4I3J!LM
 

	=	& #**c1112243C3CB3GHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  'tS[%@K !122%(f%= 	1!J	#(J  ) 0 0J	1 [!!M
s   $Gc                   | j                   s1| j                  s%| j                  s| j                  s| j                  sy| j
                  rX| j                  d   j                  r>| j                  j                  d       | j                  j                         5  | j                  | j                  d   | j                         | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         | j                  j                  | j                         ddd       | j                  j                  | j                         | j                  d   j!                          n| j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         | j                  j                  | j                         | j                  j                  | j                         | j                   j#                          | j                  j#                          | j                  j#                          | j                  j#                          | j                  j#                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nr   z(for roffset in range(0, rnumel, RBLOCK):)indexing_coder  r  r  r  r   r   r  r  r^   r  r  r]   r  
invalidater  cache_clearclearrv   s    rY   codegen_bodyzTritonKernel.codegen_body8	  s    zz{{||{{  T%5%5b%9%A%AII JK!!# .44T5E5Eb5I499U		  !3!34		  ,		  .		  -. HH 6 67R ,,.IIT//0IITZZ(IIT\\*IIT[[)		%  "

+. .s   B>KK)c                   t               }| j                  j                         \  }}}}|j                  g d       |j	                         5  t        j                         }g }	t        ||      D ]  \  }
}dt        |       }t        j                  j                  |
      }|r|j                  | dt        j                  j                  j                  |j                                dt        j                  j                  j                  |j!                                d|j#                          d|j%                          d
       n|
t        j                  j&                  v rt        j                  j&                  |
   }|j                  | dt        j                  j                  j                  |j)                                dt        j                  j                  j                  |j+                                d|j,                   d|j.                   d
       nt1        |t2              rZt        j                  j                  j5                  |j6                        }d|j8                  v rd	}|j                  | d
|        nt1        |t:              rvt        j                  j<                  j?                         }t        j                  j                  j5                  |j@                        }|j                  | d| d| d       ntC        d|
       |	jE                  |        |j                  ddjG                  |	       d       d d d        |j                  g d       |g }g }d }| jI                         D ]p  }tK        t        j                  j                  j5                  |jL                              }|jE                  |       |jN                  dk7  s`|jE                  |       r | jQ                         r#djG                  tS        tT        |            dz   }nd}| ddjG                  |       d}nd| }t        j                  j<                  j?                         }|jV                  }|j	                         5  |j                  dt        j                  jX                  j[                  |       d       |j	                         5  |j                  t        j                  jX                  j]                  |             d| }|j                  | d| d       |j                  tU        t^        j`                         d| d| d       d d d        d d d        |j                  g d       |j	                         5  |j                  dt        j                  jX                  j[                  |       d       |j	                         5  |j                  t        j                  jX                  j]                  |             |j                  dtU        t^        j`                         d| d       d d d        d d d        |j                  g d       |j	                         5  |j                  d       |j                  d       |j                  d       |j                  d        |j                  d!|        |j                  d"       |j                  d#       d d d        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w)$N)rS   rS   zdef get_args():arg_z = rand_strided(r   z
, device='z	', dtype=r   r  r   ry  z = torch.zeros(z', dtype=torch.uint8)z*Don't find the buffer or const tensor for r  r+  )
re  zdef call(args):r   rS   z
grid=grid(zgrid=zwith r   streamz = get_raw_stream(z.run(*args, z	, stream=)re  re  z def benchmark_all_configs(args):z.benchmark_all_configs(*args, )re  re  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerzargs = get_args()zKms = benchmarker.benchmark_gpu(lambda: call(args), rep=40, fast_flush=True)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))1r>   r   python_argdefs
writelinesr  r  r  r   rz  r7   r   try_get_bufferr^   r   
size_hintsget_size
get_stride
get_devicer  	constantsr   r  devicer  r   rA   	size_hintr   r   rC   	schedulerget_current_device_or_thrownbytesKeyErrorr   r   rP  rG   r&  r   r  rn  rj   rs   
device_opsdevice_guard
set_devicer0   KERNEL_NAME)rw   num_gbgridr  argdefs	call_argsr#  r  name_cnt	var_namesarg_namearg_sigr  bufconst_tensorsymval_hintro  rs  
extra_argsextra_args_strr  r   grid_argcurrent_devicers   stream_names                             rY   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmarkd	  s   !+/99+C+C+E(Iq56]]_ $	@ (HI%(I%>  +!'!$x.!12gg,,X6$$#*$4QWW5E5E5P5PQTQ]Q]Q_5`4aacdedkdkdtdtdd  AD  AO  AO  AQ  eR  dS  S]  ^a  ^l  ^l  ^n  ]o  ox  y|  yF  yF  yH  xI  IJ  K !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQd5e4ffhijipipiyiy  jE  jE  FR  FY  FY  F[  j\  i]  ]g  ht  h{  h{  g|  |E  FR  FX  FX  EY  YZ  [  1"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC6WW..JJLFWW--77GF$$#*OF8:fXMbc #DXJO    *A +B wtyy';&<A>?I$	@L 	9:<DJ!N//1 &QWW--77

CD!!$';;#%KK%	&
 ##%!%3sJ+?!@4!G!#()DIIdO3DAFHtf~H**FFH$$]]_ 
	uQWW%7%7%D%DU%K$LANO   GG&&11%8 !'ug.  K=0B5'!KL  ;2234L
)T_S``ab
	 	JK]]_ 	uQWW%7%7%D%DU%K$LANO   GG&&11%8   c+"9"9:;;YZbYccde		 	DE]]_ 	N R 01] y12=>N	  E$	@ $	@v 
	 
	  	 		  sf   L\A\%%B\+\%A\>+A(\2\>A;]
\\"	\%%\/2\;	7\>>]
]c                    t        j                  dj                  t        j                  j
                  j                  d                  S )Nz
            from torch._dynamo.testing import rand_strided
            {}
            import torch
            from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid
        get_raw_stream)textwrapdedentr   r7   r   ru  import_get_raw_stream_asrv   s    rY   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernel	  s=    
 F"";;<LM	
 		
r[   c                T    | j                   r| j                  sJ y| j                  ryy)Nr  r  	pointwise)r  r   rv   s    rY   _get_heuristiczTritonKernel._get_heuristic	  s,    $$(((()""r[   c                    t         j                  j                  j                         t        j                         t
        j                  t
        j                  t
        j                  j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  t
        j                  j                  t
        j                  j                   t
        j                  j"                  d} t         j$                  j&                  d| d<   t        j(                         rd| d<   t
        j*                  rLt
        j*                  | d<   t
        j,                  | d<   t
        j.                  | d<   t
        j0                  | d<   t
        j2                  r9t
        j2                  | d	<   t
        j4                  | d
<   t
        j6                  | d<   | S )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hip	is_fbcodeprofile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)r  utils_tritontriton_hash_with_backendr  r   r  r  rX   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )inductor_metas    rY   inductor_meta_commonz!TritonKernel.inductor_meta_common	  s    "KK//HHJ494^4^4`(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44
 ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45 FF A ++ 00 +
 77 2
 >> 9 r[   c                "   t               }g }| j                  D ]s  }t        j                  j                  j                  |      }t        |t        t        j                  f      sd}nt        t        |            }|j                  |       u | j                  s|j                          | j                         }|H|j                  t!                      t"        j$                  r|j                  | j'                                | j(                  j+                         \  }}	}
}	t-        |
      D ]  \  }}t        |t.              st1        t        j2                  |j4                        }|t        j                  j                  j6                  v sbt/        |j8                  t        j                  j                  j6                  |         |
|<    t;               }| j<                  D ]  }|| j(                  j>                  v r(|jA                  | j(                  j>                  |          || j(                  jB                  v r\|t        j                  jD                  vr@|| jD                  vr2|jA                  | j(                  jB                  |   jF                         || j(                  jH                  v s|jA                  | j(                  jH                  |           tK        ||
      D ]4  \  }}t        |tL              s|jN                  s$|jA                  |       6 tQ        |      }tS        |
| jT                        }|tW        jX                  t        j                  jZ                  j]                               i d}t_        | j`                        tc        td        jf                        || jh                  | jj                  | jl                  d| jo                         }d }t"        j$                  st"        jp                  r| js                         dz  }||d<   | ju                         D ]w  }t/        |jv                   d|jx                        }|
j                  |       t{        || jT                        |t}        |      <   |j                  |jv                   d       y t        |
      g|d<   |d   d	   j                  D ]
  }d
|d   |<    || _A        | j                  D ]W  }|jv                  dk(  r| j                  r|j                  ,|j                  |jv                  j                          d       Y | j                          | j                  D ]$  }|j                  d       |j                  |       & | j                  r| j                  }d| d|d| d|d|d}nBd}t}        |      dk(  rt}        |
      dk(  rd}nd}d| d|d| d|d|d| j                   d}|j                  |       |j                  d|xs tc        td        j                         ddj                  |       d       |j                         5  | j                  |       | j(                  j                         D ]  \  }}|j                  | d|         |j                  | j                         d d d        t"        j$                  r |j                  | j                  |             |j                         S # 1 sw Y   IxY w) Ni    )
size_dtype)r#  ro  rn  )r  kernel_namemutated_arg_namesr   num_loadnum_reductiong    eAkernel_num_gbr&  configsr   r9   rn  r   zBLOCK : tl.constexprrS   z$
                @triton_heuristics.z!(
                    size_hints=z%,
                    reduction_hint=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            r   r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  r  ry  )Sr>   r   r7   r   r   symbolic_hintr   r  r   r   r*   r   r   r   r  r]   rb   r   benchmark_kernelr  r   rg  r   rA   r   r  r   inv_precomputed_replacementsr   r   r  input_buffersr  r  removed_buffers
inner_nameoutput_buffersr   rC   	zero_fillrm  rM   r	  r   r   rq  rr  setr  rj   r0   DESCRIPTIVE_NAMEr   r  r  r  r  estimate_kernel_num_bytesrP  r   r&  rL   r   rJ   
equal_to_1r  r   r  
tensor_dimr   rb  r  r^   r  r  rx  r   r  codegen_static_numelsaliasesr  r  r_   )rw   r   coderj  r&  
numel_hintrp  
heuristicsr{  r  r#  r   r  r#  mutated_argsmutationargnametriton_meta_signaturer  r  ry  r  sizeargarg_numr  r  heuristics_line	tile_hintoldnews                                 rY   codegen_kernelzTritonKernel.codegen_kernel	
  s^   
[[ 	)E))77>Jj3*>? !	+C
O<	i(#	)& $$NN((*
<KK134&&D==?@#'99#;#;#= Iq	* 	FAs#w' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL	 )3 
	EH499222  !8!8!BCDII555AGG$;$;;D$8$88  !:!:8!D!O!OP499333  !9!9(!CD
	E*  3 	*LGS#|,  )	* l+ 1$"2"2!
 /&--!!==? 
 "$"5"56{;;<!-!//
 '')
 ""f&>&>335;F-3M/*++- 	2DU3TZZ@GW%2>D$4$43!#g,/ NNdkk]%01	2 #,I"6!7I #9-a0;; 	2G01K$W-	2 '$$ 	ID{{c!d&?&?&NNdkk//122FGH	I 	++ 	 FNN2KK	    !00N#$$.< 0  *~ .$$2#3 4!!, 0##0"3 4	O I:!#y>Q& <I =I#$$.< 0  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))G:L9MRP	
 [[] 	#&&t, II--/ 1S#c#/01KK		"		# ""KK55f=>}}	# 	#s   A%\\c                   t         j                  j                  j                  |      }t	        |t
        j                  t        f      rt        |      }t        |      }|S d}t         j                  j                  j                  ||      s?|dk  s
J d|        |dz  }t         j                  j                  j                  ||      s?|S )N   i @  z!Failed to find static RBLOCK for r   )
r7   r   r   simplifyr   r   r   r  r*   r  )rw   rW  r   s      rY   _get_persistent_RBLOCKz#TritonKernel._get_persistent_RBLOCK
  s    !!**62fu}}c23f+C!#&C 
	 Cgg&&;;FCHi'U+LVH)UU'q gg&&;;FCH 
r[   c                J   | j                   D ]  }|j                  dk7  s| j                  r|t        j                  j
                  j                  |j                        }t        |t        j                  t        f      r)|j                  |j                   dt        |              |j                  dk(  r;| j                  r/| j                  |j                        }|j                  d|        |j                  dk(  s| j                  s|j                  d        y)a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        rnumel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        r   znumel = zRBLOCK: tl.constexpr = r   zXBLOCK: tl.constexpr = 1N)r   r   r   r7   r   r   r  r&  r   r   r   r  r^   r  r  r   )rw   r  r  simplified_tree_numelr   s        rY   r  z"TritonKernel.codegen_static_numels
  s    " $$ 	;D{{c!T%:%:()(8(8(A(A$**(M%3emmS5IJNNdkk](3?T;U:V#WX{{c!d&?&?11$**=!8>?{{c!dmm9:	;r[   c                     y)Nrz  r   rv   s    rY   _get_grid_fnzTritonKernel._get_grid_fn
  s    r[   c                   | j                   D ]  }t        |j                  t        j                  t        j
                  f      r|j                  }n*t        j                  j                  j                  ||      }|j                  dk7  s| j                  r+|j                  |       |j                  t        |             |j                  |j                  |        y )Nr   )r   r   r&  r   r   r  r7   r   wrapper_codegenerate_numel_exprr   r   r   rD  grid_dim)rw   r   r|  	arg_typesrz  r  r   s          rY   add_numel_to_call_args_and_gridz,TritonKernel.add_numel_to_call_args_and_grid
  s    $$ 
	"D$**u}}ell&CDzzww++??dK{{c!T%:%:  &  d,}}(D!
	"r[   c                   t         j                  j                  }|j                          | j                  j                         \  }}}}g }| j                  ||||       t         j                  j                  j                         }| j                  j                  =| j                  j                  }	|j                  |	j                  ||	j                         |j                  ||      }|j                  ||||j                  dd|| j!                         | j"                  	       | j                  j                  "|j%                  |j'                  dg             y y )NT)cudarX   r  grid_fnr  	workspace)r7   r   r  write_triton_header_oncer   rg  r  rq  rr  workspace_arggenerate_workspace_allocationrs  r  generate_default_gridgenerate_kernel_callrs   r  r  r^   make_free_by_names)
rw   r   r  wrapperr  r|  r  rz  r  wss
             rY   call_kernelzTritonKernel.call_kernel
  s.   ''&&((*%)YY%=%=%?"9a,,T9iN**FFH99"".((B11		>2<< ,,T48$$  %%'(( 	% 
	
 99"".g88+GH /r[   c                   t         j                  j                  }| j                  j	                         \  }}}}t        ||      D ]  \  }}t        |t              st         j                  j                  rBt        j                  r|j                  d| d| d       Z|j                  d| d| d       sd| d}|j                  |       d| d}|j                  |        y )	Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert_inf_and_nan("z);zassert not z.isnan().any().item()z.isinf().any().item())r7   r   r  r   rg  r   r   rB   cpp_wrapperr   abi_compatibler^   )rw   r  r  r|  arg_signaturesr  arg_signaturer  s           rY   codegen_nan_checkzTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'9na"%i"@ 	,C-377&&,,))XY\X]]`ad`eehi  )),@SR*PQ(-BCD%%d+(-BCD%%d+	,r[   c                    t        |i |S ru   )r  )rw   r   r  s      rY   create_cse_varzTritonKernel.create_cse_var+  s     $1&11r[   c                   |j                    d| j                  | j                  |j                               }|j                  j
                  r| j                  j                  |       y | j                  j                  |       y )Nry  )	r   r  r  r   rootr  r^  r^   r  )rw   entryr  s      rY   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entry.  sd    **SD,@,@,L!M NO::((. II%r[   c                    |j                   J | j                  |j                         }| j                  }|dk7  rd| dnd}d|j                  j	                          d| | S )Nr  r  r   rS   ztl.arange(0, zBLOCK))r  indexing_size_strr	  r   r   )rw   r  r   r	  converts        rY   r  z)TritonKernel.iteration_ranges_ranges_code6  sq    +++%%e&6&67&&+6*+DDQ'"u||1134F4&	JJr[   c                ^    | j                   }| j                         }dg|z  }d| d| d| dS )Nr9   r  r   r   )r	  r  )rw   r  r   r	  r  r   s         rY   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_code=  sA    &&&&(sTz$r%;-q99r[   c                   |j                   J d|j                    d}|j                   dk(  rk|j                  s_t        j                  j                  j                  |j                  t                     s#d| d|j                   dz    d|j                    d}|j                  j                  ||      }| j                  dk7  r| d	| j                   dS |S )
Nztl.program_id(r   r9   r  z + tl.program_id(z) * tl.num_programs(rC  r  r  )r  has_zdimr7   r   r   r  r&  r)   r  r  r	  )rw   r  r   pids       rY   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pidC  s    ~~)))u~~.a0 NNaNNGG$$99%++~GWX
 cU+ENNQ,>+??STYTbTbSccefCoo!!#s+z)U$t//022
r[   c                   | j                   syt        j                  j                  j	                  |j
                  d      ry|j                  dk(  r(| j                  r| j                  |j
                        }n`|j                  dk(  r| j                  rd}nB|j                  j                         t        vryt        |j                  j                            }t        j                  j                  j                  |j
                  |      S )NFr9   Tr   r   )r  r7   r   r   r   r&  r   r  r  r   r   r(   r   )rw   r  r6  s      rY   rX  zTritonKernel._has_constant_maskV  s    !!7733DJJB ;;#$";";33DJJ?I[[CDMMI{{  "*::():):)<=I ww<<TZZSSr[   c                    | j                   D ]2  }| j                  |      s|j                  |j                   d       4 y )Nr  )r   rX  r  r   )rw   rm   r  s      rY   rT  zTritonKernel.filter_masksl  s>    $$ 	8D&&t,!!T[[M"67	8r[   c                   |j                   }|j                  r%|j                  |j                   d| d| d       n|j                  D|j                  |j                   d| j                  |              |j                  | d       n|j                  | d| j                  |       }n| j                  || d      }|j                  | d| j                  |       d|j                          d|j                   d| g       | j                  |      r(| j                         }|j                  | d	| d
       y |j                  | d|j                   d| d       y )Nry  z	offset + basez
offset = 0rc   z	offset = rp  rf   zmask = tl.full(z, True, tl.int1)zmask = z < r&  )r   r  r^   r   r  r  r  r  rh  r  r   rX  rl  )rw   r  r  r   r  r  s         rY   r  z,TritonKernel.iteration_ranges_codegen_headerq  s`   LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP881#VMOOc4#@#@#G"HAGGI;V[\zzl#dV, ""5)'')ENNaSw6FGHNNaS

|3qc?@r[   )r	  rj   r  zOptional[OrderedSet[str]]r   r   )r  rE   r   r   )r  rE   r   zsympy.Symbol)r  rE   r   r  r   )r   rj   )rs   rr   )rS   )r   rj   r  rj   r  r   r   z'Tuple[str, Optional[DeferredLine], str])r   rr   r   rr   r  rp   r   rp   )r   rj   rs   rr   ru   )
r   rj   rs   rr   r   r<   r  r6   r   r   )r  r<   r  rj   r  rr   r  r  r  rp   r   r<   )
r  r  r  r  r  r5   r   +Union[CSEVariable, Tuple[CSEVariable, ...]]r   r  )r   rj   rs   rr   r   r<   )r@  Tuple[torch.dtype, ...]r  zUCallable[[Tuple[CSEVariable, ...], Tuple[CSEVariable, ...]], Tuple[CSEVariable, ...]]r  Tuple[CSEVariable, ...]r   r
  )
r@  r	  r  r
  rY  rp   rZ  rp   r   r
  )r   rj   r  zOptional[IRNode])r  rE   )r  rF   ):r   r   r   r  r  r   r  r  ri  r'   DEFAULTr  r  r  r   r  r  r  r  r  r   r  r  r}  r  r  r  r  r  r  r  r  r	  r%  rP  r\  rb  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  rX  rT  r  r  r  s   @rY   r  r    s   %I%%).E&.O 04$,,&*" " -	" 
"@+135
L(
 " " O^O^d
 EG33!3-<3	03@
8:: : 	:
 ::_D SW&.&. *&.3>&.FO&.	&.P$$ $ !	$
 $$ $ 
$L.^^ ^ &	^
 ;^ 
5^@:#SJg"'g"
g" (g" 
!g"R:"':" (:" 	:"
 :" 
!:"x*XgR

 % %Nun
;<"I<,$2&K:&T,8
Ar[   r  c                     e Zd ZdZdZeZej                  e	j                  e	j                  e	j                  e	j                  e	j                  e	j                  g      Zej$                  j&                  6ej)                  ej                  e	j*                  e	j,                  g             ed	d       Zd Zd Zd Zd Zy)
TritonSchedulingr  r  Nc                    | j                   S ru   )backend_features)rI  ro  s     rY   get_backend_featuresz%TritonScheduling.get_backend_features  s    ###r[   c                   t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rvddlm}m	 t        fd|D              sY|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              y y y c c}w )Nr   )BaseSchedulerNodeForeachKernelSchedulerNodec              3  6   K   | ]  }t        |        y wru   )r   )r   r  r  s     rY   r   z3TritonScheduling.codegen_comment.<locals>.<genexpr>  s      >?
189s   z Fused node name list: r   )r7   r   r  r.   r^   r   debug_fusiontorch._inductor.schedulerr  r  r   r   get_namecommentr   )	rw   node_scheduler  originsdetailed_originsr  r  
node_namesr  s	           @rY   codegen_commentz TritonScheduling.codegen_comment  s    ''&&$7w$O!!g&
  CP  +!!%67 JJL
 
 !!''>tyy?T>UV s   .#Cc                r   t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}t        |      d d }dj                  d|||j                         g      }||j                  |<   t        j
                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                         |      }|j                  dd      }t#        t%        |j'                               d      \  }	}
}t)               }|j+                  d	|d
       |j-                  |d       t         j                  j.                  j1                         }|j+                  d|j2                   d       d| }t5        ||      \  }}|d|z   dz   |z   z  }|j7                  ||j9                         |       t;        d      rt=        |||       |S )NrS   r   r  rX   triton_z#pragma CMT#pyzasync_compile.triton(z, '''T)stripz''', device_str='z')z# kernel path: re  kernel_metadata)r7   r   r  src_to_kernelr   rX   descriptive_namesr-   r8   r   next_kernel_suffixunique_kernel_namesreplacerj   r0   r  rx  r"   r!   r"  r>   r^   r]   rq  rr  rD  r.   define_kernelr_   r$   r%   )rw   src_coder  r   r  r  
fused_namekernel_category	subs_namebasenamer  kernel_pathcompile_wrapperr  metadata_commentr  r  s                    rY   r)  zTritonScheduling.define_kernel  s   ''&&w,,,!//9K\ U ==22 &mV]]5T5TU 
 AJ2ANO((?J8R8R8TUK /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H'/	(..:J0KT'R$Ha,.O%%(=i]%&PQ""84"8WW..JJLN%%(9.:M:M9Nb&QR!0>(;M7(S%G%w 58H HH!!_5579I ''89#KhGr[   c                n  	
 t               5  | j                  |d      }t        j                  |      fd		fd}	fd}t        j                  d|D ch c]  }|j                          c}j                          |       j                  fcd d d        S j                         j                  
j                  	  
 j                   d          j                  }t        |      d
k(  sJ |d   j                   dkD  rt        d	      n6t#        j$                  
fd      t#        j$                  fd      z
  t        j                  d|D ch c]  }|j                          c}        |        j                  fcd d d        S c c}w # t        $ rl}t        j                  d||D ch c]  }|j                          nc c}w c}       t        d	       |        j                  fcY d }~cd d d        S d }~ww xY wc c}w # 1 sw Y   y xY w)NT)r  c                 ~     j                   J t        j                  j                   j                         d   dz   S Nr   z.kernel_perf__file__ospathsplitextmods   rY   cache_file_pathz?TritonScheduling.benchmark_fused_nodes.<locals>.cache_file_path  s6    ||///ww''5a8>IIr[   c                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wru   )r7  r8  existsopenfloatreadr8  fdr<  s     rY   
load_cachez:TritonScheduling.benchmark_fused_nodes.<locals>.load_cache  sM    &(77>>$'d 0r$RWWY/0 00s   AA c                             } t        | d      5 }|j                  t                     d d d        y # 1 sw Y   y xY w)Nwr?  writerj   )r8  rC  r<  mss     rY   store_cachez;TritonScheduling.benchmark_fused_nodes.<locals>.store_cache  s;    &($_ &HHSW%& & &s	   9A%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfr9   c                 4      j                     d         S Nr   
clone_argsr   callwrapped_jit_functions   rY   r  z8TritonScheduling.benchmark_fused_nodes.<locals>.<lambda>0       D!@!5!@!@$!G!JK r[   c                 "     j                     S ru   rO  r   rS  s   rY   r  z8TritonScheduling.benchmark_fused_nodes.<locals>.<lambda>8  s    ;0;;TB r[   z+The fused kernel for %s took %.3f ms to run)r   generate_kernel_code_from_nodesr#   r  rx  debugr  r6  get_argsrR  r  rP  	Exceptionr@  	launchersr   n_spillsr&   benchmark_gpu)rw   r  r*  rD  rJ  r  er[  r   r<  rR  r;  rI  rS  s           @@@@@@rY   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodes  s   ! J	$;; < H ""8,CJ&
 II7',-!-
 B~3<<';J	$ J	$> <<>D88D#&;; 
(4)44d;A>? -66Iy>Q&&&|$$q(5\ !..K +33B  II=',-!-
 Ms||#UJ	$ J	$0 .  (		@+01aQZZ\11
 5\3<<'']J	$ J	$L(@ .MJ	$ J	$sm   AH+F)5(H+'(H+F.(BH+,H&H+)H+.	H#7HG$#,HH#H+H##H++H4c           	        fdfd}fd}dg }}d}t         j                  j                  }t        |      t         j                  _        t         j                  j                  }t        |      t         j                  _        t
        j                  dkD  }	t
        j                  dkD  }
| j                  |d|	|
d      }|D ]  \  }}}|D cg c]  }|j                          }}|D cg c]  }|D ]  }|j                           }}}|j                  t        t        j                        d      }t        j                   |      t"        j%                  d|j&                          |       \  &|z  }|z  }|j)                  j&                         ܉j+                         j,                  j.                    j0                   d          j2                  }t5        |      d	k(  sJ |d   j6                  dkD  rt9        d
      xn3t;        j<                  fd      t;        j<                  fd      t"        j%                  d|D ch c]  }|j                          c}        |        |z  }|z  }|j)                  j&                          |t         j                  _        |t         j                  _        |||fS c c}w c c}}w c c}w )Nc                 ~     j                   J t        j                  j                   j                         d   dz   S r4  r5  r:  s   rY   r<  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_pathD  s6    <<+++77##CLL1!4~EEr[   c                             } t         j                  j                  |       rCt        |       5 }t	        d |j                         j                         D              cd d d        S y# 1 sw Y   yxY w)Nc              3  2   K   | ]  }t        |        y wru   )r@  )r   r^  s     rY   r   zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>L  s      Eaq Er  )NN)r7  r8  r>  r?  rE  rA  r  rB  s     rY   rD  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cacheH  s^    "$Dww~~d#$Z F2  E2779??3D EEF FFs   .A,,A5c                             } t        | d      5 }|j                  t              dz   t              z          d d d        y # 1 sw Y   y xY w)NrF  rB  rG  )r8  rC  r<  rI  ms_clones     rY   rJ  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cacheO  sH    "$DdC 8BR3X678 8 8s   *AAr   T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coder  rK  r9   rL  c                 4      j                     d         S rN  rO  rQ  s   rY   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  rT  r[   c                 (     j                     d   S rN  rO  rV  s   rY   r  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>  s    ;0;;TB1E r[   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputs)r7   r   r  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr  r(  rj   r0   rx  r#   r  rx  rX  r6  r   rY  rR  r  rP  r[  r   r\  r@  r&   r]  )rw   	node_listrD  rJ  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origrh  ri  kernel_code_listr*  r  
node_groupr  fused_node_listsr  r  r  r[  r   r<  rR  r;  rI  re  rS  s                        @@@@@@@rY   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernelC  s   	F	 	8
  ) ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8 2	+#Ha=GHT 0HH/?OeOAQZZ\O\OEO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  0%00$7:;,66Iy>Q&&&|$$q( %e,X !..K '44E IIV'12!2	 MNHh&NS\\*e2	+f #7%<"22i  IOR 3s   K*7K/,K5)ro  ztorch.device)r   r   r   
int32_type
int64_typer  kernel_typedictfromkeysr:   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANTRITON_TEMPLATESr  r  r  r  r  TUPLE_REDUCTIONSORTr  r  r  r)  r_  r{  r   r[   rY   r  r    s    JJK}}""$$**44++	
	 }} MM #22"''	
 $ $41fK$ZV3r[   r  )r   rj   r   r   r   r   )r  r  r   OpsHandler[str])r  r  r   r  )
__future__r   r   r;  r  loggingr7  r  r   typingr   r   r   r   r	   r
   r   r   r   r   r   r  torch._loggingtorch._dynamo.utilsr   torch._inductor.runtime.hintsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   rS   r   r    	codecacher!   r"   r#   metricsr$   r%   runtime.benchmarkingr&   runtime.hintsr'   r(   runtime.runtime_utilsr)   r*   r  r+   r,   r-   r.   r/   r0   r1   r2   virtualizedr3   r  r4   r5   r6   r7   wrapper_benchmarkr8   commonr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   simdrD   rE   rF   rG   rH   rI   triton_utilsrJ   rK   rL   rM   rN   	getLoggerr   rx  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrZ   rb   r   rf  r   r  r   r   r   	dataclassri   r   r  r  r7  r  r  r  r  r  r  r  _initialize_pointwise_overridesr  r  r  r  r   r  r  )r   s   0rY   <module>r     s   "     	        2 H 0 / K K 2 X X 4  8 8 B . ; C	 	 	 O N B     g!00<H~~//*E^^--hA
 4   4 . dkk4;;7 	,%,,*T*+62Dd
SS dkk4;;7 	,%,,*T*0023594RV
WW ( ( (0 y+ y+ y+x+2WfM Wft 	$* $&> >"`&k `&F  / / 9W$O W$v$+ $+N : : :&bA: bAJ7M3~ M3SX
s    #L1L