
    sg                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZmZ d dlm Z m!Z!m"Z" ddl#m$Z$ d	d
l%m&Z&m'Z'm(Z( d	dl)m*Z* d	dl+m,Z,m-Z-m.Z.m/Z/ d	dl'm0Z0m1Z1 d	dl2m3Z3 d	dl4m5Z5 d	dl6m7Z7m8Z8 d	dl(m9Z9m:Z:m;Z; d	dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC d	dlDmEZEmFZFmGZG ddlHmIZImJZJmKZKmLZL ddlMmNZN  ej                  eP      ZQej                  j                  ePd      ZTej                  j                  ePd      ZUej                  j                  ePd      ZV eL       j                  ZXej                   G d d             ZZ G d deZ      Z[ G d deZ      Z\d  Z] G d! d"eK      Z^ G d# d$e:      Z_ej                   G d% d&             Z` G d' d(      Za G d) d*      Zb G d+ d,ec      Zdy)-    )annotationsN)AnyCallableCounterDefaultDictDictIterableListOptionalSequenceTupleUnion)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )configir	scheduler)	code_hash)Dep	MemoryDepStarDepWeakDep)IRNodeTritonTemplateBuffer)!indexing_dtype_strength_reduction)ReductionHint)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)get_dtype_sizeIndentedBufferPlaceholdersympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel
perf_hintsschedulefusionc                       e Zd ZdZ ej
                  d       ej
                  d      d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZd Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    r4   )divisorlengthc                   t         
|           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y N)super__init__namevar_list
var_rangesnumelprefixr?   r@   kernelroot)selfrE   rF   rG   rH   rI   rJ   r?   r@   rK   	__class__s             O/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/codegen/simd.pyrD   zIterationRanges.__init__R   sO     		 $
	    c                ,    t        | j                        S rB   )r-   rE   rL   s    rN   symbolzIterationRanges.symbolj   s    !$)),,rO   )rE   strrF   zList[sympy.Symbol]rG   zDict[sympy.Symbol, sympy.Expr]rH   
sympy.ExprrI   rS   rJ   
SIMDKernelrK   IterationRangesRootreturnNone)	__name__
__module____qualname____doc__sympyIntegerrD   rR   __classcell__rM   s   @rN   r>   r>   B   s~    . a u}}Q % 3	
    " 
0-rO   r>   c                  |     e Zd Z	 d		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d
 fdZddZd Zd Zd ZddZddZ	ddZ
 xZS )rV   c          	         |i }t         |   |g i ||||        || _        i | _        || _        |r	|dk(  r|	J || _        || _        |	| _        |
| _        y )N)rE   rF   rG   rH   rI   rJ   rK   r)	rC   rD   indexnodes	pid_cacheis_loop
tensor_dimgrid_dimhas_zdim)rL   rE   rH   rI   rd   rJ   rf   rg   rh   ri   rj   rM   s              rN   rD   zIterationRangesRoot.__init__o   s     I 	 	
 
=?
 *3 v}1ABB$  rO   c                <    d| j                   d| j                   dS )NzIterationRangesRoot(, z, ...))rE   rH   rQ   s    rN   __repr__zIterationRangesRoot.__repr__   s    %dii]"TZZLGGrO   c                b    | j                   j                         D ]  }|j                           y rB   )re   valuescache_clear)rL   nodes     rN   rp   zIterationRangesRoot.cache_clear   s*    JJ%%' 	D	rO   c                2    t        | j                   d      S )Nrd   )r-   rI   rQ   s    rN   	index_symzIterationRangesRoot.index_sym   s    !T[[M"788rO   c                   t         j                  j                  j                  ||z  | j                        rt        | j                         |      }nt        | j                         ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r3   graphsizevarsstatically_known_equalsrH   r   rs   r   re   IterationRangesEntryrI   nextrJ   iter_vars_countrange_tree_nodesrR   rF   appendrG   )rL   r?   r@   exprrq   s        rN   lookupzIterationRangesRoot.lookup   s     7733Gf4DdjjQDNN,g6D"4>>#3WfEDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$rO   c                    t        j                  d      }g }t        |      D ](  }|j                  | j	                  ||             ||z  }* t        t        |            S Nr4   )r]   r^   reversedr|   r~   list)rL   lengthsr?   itervarsr@   s        rN   construct_entriesz%IterationRangesRoot.construct_entries   s\    --"w' 	'FOODKK89&G	' HX&''rO   c                f    | j                  |      D cg c]  }|j                          c}S c c}w rB   )r   rR   )rL   r   es      rN   	constructzIterationRangesRoot.construct   s'    $($:$:7$CDq
DDDs   .c           
     `  	 |j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  d        t        j                  d      g g 		fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                        s, || j                  t        | j                                     t!        t#                    t!        t#        	            fS c c}w c c}w )z,Figure out vars from this tree used in indexc                    t         j                  j                  j                  | j                  t
        j                        S )N)fallback)r3   ru   rv   	size_hintr?   r   unbacked_symint_fallbackxs    rN   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>   s/    !''**44		F$C$C 5  rO   keyr4   c                    j                  | j                                j                  | j                         | j                  z  y rB   )r|   rR   r@   )rq   r?   
index_varssizess    rN   addz/IterationRangesRoot.vars_and_sizes.<locals>.add   s5    dkkm,LL%+GrO   )free_symbolsr3   rJ   r{   getrI   sortr]   r^   ru   rv   rw   r?   r~   r   rH   r   r   )
rL   rd   sre   nr   rq   r?   r   r   s
          @@@rN   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   sL   ;@;M;MNa**..q1NN!CqQ188t{{+BCC

 	 	

 --"
	,  	D77##;;DLL'RDKK$,,)HIJ,,I	 ww77

GLGXdjj'%BCDHZ()4+@@@7 OCs   0F&F+F+*F+rB   )rE   rS   rH   rT   rI   rS   rd   intrJ   rU   rg   boolrh   Optional[int]ri   r   rj   r   rW   rX   rW   rS   )r   zList[sympy.Expr]rd   rT   )rY   rZ   r[   rD   rm   rp   rs   r~   r   r   r   r_   r`   s   @rN   rV   rV   n   s     )!)! )!
 )! )! )! )! ")!  )! )! 
)!VH9 .(EArO   rV   c                  d     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
dZd Zd Zd Zd Zd Z	d Z
 xZS )rx   c                $   t         |   ||j                  |z  |j                  |j                  |j
                  |||j                  |j                  	       || _         t        j                  d       | j                        | _        || _        y )N)	rE   rH   rF   rG   rI   r?   r@   rJ   rK   )rC   rD   rH   rF   rG   rI   rJ   rK   parent	functools	lru_cache_codegencodegenr}   )rL   rE   r?   r@   r}   r   rM   s         rN   rD   zIterationRangesEntry.__init__   s~     	,,'__((==== 	 
	
 0y**40?	rO   c                    d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzIterationRangesEntry(rl   ))rE   r?   r@   r}   rG   rQ   s    rN   rm   zIterationRangesEntry.__repr__   sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrrO   c                L    fd| _         d | j                   _        | _        y )Nc                      S rB    rE   s   rN   r   z/IterationRangesEntry.set_name.<locals>.<lambda>  s    t rO   c                      y rB   r   r   rO   rN   r   z/IterationRangesEntry.set_name.<locals>.<lambda>      rO   )r   rp   rE   )rL   rE   s    `rN   set_namezIterationRangesEntry.set_name  s    ##/ 	rO   c                8    | j                   j                          y rB   )r   rp   rQ   s    rN   rp   z IterationRangesEntry.cache_clear  s      "rO   c                X    t         j                  j                  |        | j                  S rB   )r3   rJ   codegen_iteration_ranges_entryrE   rQ   s    rN   r   zIterationRangesEntry._codegen	  s    	//5yyrO   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr4   r   c              3  P   K   | ]  }t        |t        j                           y wrB   )r   r   SIZE.0r   s     rN   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>  s       ,56N1dii0,   $&)
isinstancer}   r]   Symbolr   r   typeargsr^   r   lenallr|   )rL   precomputed_argsargsymbolss       rN   r   z%IterationRangesEntry.precomputed_args  s    -/dii.##$))h%@AR4		?RA99>>!"% 	1CcEMM5<<#@A**w<!# ,:A, ) %++C0	1  rO   c                ,    t        | j                        S rB   )hashrE   rQ   s    rN   __hash__zIterationRangesEntry.__hash__  s    DIIrO   c                4    | j                   |j                   k(  S rB   r   )rL   others     rN   __eq__zIterationRangesEntry.__eq__  s    yyEJJ&&rO   )rE   rS   r?   rT   r@   rT   r}   rT   r   r>   rW   rX   r   )rY   rZ   r[   rD   rm   r   rp   r   r   r   r   r_   r`   s   @rN   rx   rx      sf      	
    
.s
# 'rO   rx   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    rN   constant_reprr   #  s9    e	%-		E	;rO   c                      e Zd ZU dZeZded<   dZddej                  dd	 	 	 	 	 d, fdZ
d Zd	 Zd-d
Zd.dZd/dZd Zd Zd Zd0dZd Zd Zd1dZd1dZd Zd Zd Ze	 	 	 	 d2d       Ze	 	 	 	 d2d       Zd3dZd4dZ d4dZ!d5dZ"	 	 d4dZ#d6dZ$d7d Z%d8d!Z&d9d:d"Z'e(jR                  d#        Z*d4d$Z+ed%        Z,d& Z-d' Z.d( Z/d) Z0d* Z1d;d+Z2 xZ3S )<rU   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]kexprFN)	mutationsrf   reduction_hintoverride_persistent_reductionc                   |i }t         	           t                _        t                _        |D cg c]+  }t
        j                  j                  j                  |      - c} _	        ||n	t                _        g  _        i  _        t        j                          _         j                  d   dk7   _        | _        | _        t                _        t+        j,                  t.               _        ||n j3                          _         j7                          _        d  _        t=        j>                  d       d fd       }| _          jC                  |       y c c}w )Nr4   c                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  j                  |       S rB   )r3   ru   rv   simplify_with_rangesrG   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)rd   treerL   s     rN   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexingY  sb    GG$$99%ARSE(( B44UDAB 66u==rO   r   )"rC   rD   r+   bodyindexing_coder3   ru   rv   simplifynumelsr   r   r   r{   	itertoolscountrz   inside_reductionr   index_dtype
last_usagecollectionsdefaultdictr   buf_accessesshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   r   r   initialize_range_tree)
rL   r   r   rf   r   r   groupsr   r   rM   s
   `        rN   rD   zSIMDKernel.__init__6  sD    I"$	+-=CDqww''003D".IJL 	 79JL(0 $B1 4, ++5<9D9P9PQU9V -8 *557 	!
 **,+/ 
		T	"	> 
#	> "3""9-= Es   0E'c                     yNFr   rQ   s    rN   r   zSIMDKernel.want_no_x_dimd      rO   c                   | j                    xs | j                  d   dk(  }d}|t        | j                         d  d}| j                  rd}n|rd}nd}dj	                  fd|D              }t              D ]  \  }}|dk(  }||v r|j                  |      nd }	|rd n|j                  |      }
|
|n|
}| j                  j                  t        | d	| j                  |   ||| ||xr | j                   |	|
d
v 
              y )Nr   r4   zyxrxyzrc   xyzr c              3  ,   K   | ]  }|v s|  y wrB   r   )r   pactive_prefixess     rN   r   z3SIMDKernel.initialize_range_tree.<locals>.<genexpr>u  s     MA_8LaMs   	rd   z)rf   rg   rh   ri   rj   )r   r   r   r   join	enumeratefindr   r|   rV   r   )rL   rf   no_r_dimprefixes	grid_dimstensor_dimsirI   is_reductionrh   ri   rd   r   s               @rN   r   z SIMDKernel.initialize_range_treeg  s(   ,,,DB10D"C$4#4#67	==KK KggMMM"?3 	IAv!S=L5;{5J))&1PTJ+t1GH!)AxE###he$KKN'(J1J1J-J)% O3	rO   c                     y)zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr   )rL   indicess     rN   finalize_indexingzSIMDKernel.finalize_indexing  r   rO   c                v    | j                   }d| _         	 | j                  |||      || _         S # || _         w xY wr   )r   store)rL   rE   rd   r   priors        rN   store_reductionzSIMDKernel.store_reduction  s;    %% %	*::dE51$)D!ED!s   / 	8c                     yr   r   rQ   s    rN   r   z*SIMDKernel.should_use_persistent_reduction  r   rO   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wrB   )rG   itemsr   r   s     rN   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>  s"      *,0%%'*r   )dictr   chainfrom_iterabler   rQ   s    rN   rG   zSIMDKernel.var_ranges  s4    OO)) *484D4D* 
 	
rO   c                :    t        d | j                  D              S )Nc              3  J   K   | ]  }t        |j                  d u        y wrB   )r   rh   r  s     rN   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>  s     Q3td23Qs   !#)sumr   rQ   s    rN   triton_tensor_ndimzSIMDKernel.triton_tensor_ndim  s    Q@P@PQQQrO   c                ^    dg| j                         z  }d||<   ddj                  |       dS )NrX   :[rl   ])r  r   )rL   r  r   s      rN   indexing_size_strzSIMDKernel.indexing_size_str  s9    42244a499U#$A&&rO   c                    dg| j                         z  }| j                  D ]U  }|j                  |j                  dk7  s| j                  s,|j                  j                          d||j                  <   W |S )N1rc   BLOCK)r  r   rh   rI   r   upper)rL   r   r   s      rN   dense_size_listzSIMDKernel.dense_size_list  sy    //11$$ 	GD&{{c!T%:%:,0KK,=,=,?+@)Fdoo&	G rO   c                L    | j                         }ddj                  |       dS )Nr  rl   r  )r$  r   rL   r   s     rN   dense_size_strzSIMDKernel.dense_size_str  s)    $$&499U#$A&&rO   c           	        t        |t              s|S |j                  d   }| j                  j	                  |      x}|S t        |||j                  i      }t        j                  j                  j                  |      }t        ||j                  j                         |j                  j                  t        j                  d      |j                  j                         j#                         i      S )Nr   r4   )r   r   r   r{   r   r/   r}   r3   ru   rv   r   rK   rs   r~   r]   r^   rH   rR   )rL   rd   r   	tree_node	new_indexs        rN   r   z)SIMDKernel.combine_modular_indexing_pairs  s    %1LJJqM..22155I>Luq)..&9:	GG$$CCIN	((*INN,A,AMM!$inn&:&:-&(
 	
rO   c                    t         j                  j                  j                  |      x}r!|\  }}t	        | j                  ||      |      S | j                  ||      S rB   )r3   ru   rv   expand_floor_divr   _combine_contiguous_dims)rL   rd   r   
expand_resr*  denominators         rN   r   z"SIMDKernel.combine_contiguous_dims  sY    ))::5AA:A%/"I{D99)TJKXX00==rO   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r4   )r   r]   r^   r   r   r   r3   ru   rv   _simplify_loopsr6   r   r/   r  zip)
rL   rd   r   r   r   	new_sizesreindexprunenew_index_varsr*  s
             rN   r-  z#SIMDKernel._combine_contiguous_dims  s     eemmU\\:;L //6
Eu:?L$%GG$4$4$D$D7US%
!	7E L	2ud3z7>;R+S&TU	rO   c                    | j                   r| j                  ry t        t        j                  j                  d |D                    | _        y )Nc              3  F   K   | ]  }|t         us|j                    y wrB   )EnableReductionr   )r   r   s     rN   r   z,SIMDKernel.set_last_usage.<locals>.<genexpr>  s       *!"q/G*s   !!)r   r   r   r   r  r  r   )rL   re   s     rN   set_last_usagezSIMDKernel.set_last_usage  sA    $$(A(A$OO)) *&+* 
rO   c                x      j                   d   j                  t        j                   fd       } |       S )Nr   c               3     K    j                   d   dk(  r j                  rJ d  y r j                          d _        	 d  r j                          d _        y # d _        w xY ww)Nr   r4   FT)r   r   codegen_body)rL   should_flushs   rN   ctxz)SIMDKernel.disable_reduction.<locals>.ctx  sl     {{2!#0000 !!#$)D!-%%'(,%%s   ?A-A! A-!	A**A-)r   rg   
contextlibcontextmanager)rL   r?  r>  s   ` @rN   disable_reductionzSIMDKernel.disable_reduction  s:    ''+33		"	"	- 
#	-$ urO   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w rB   )r   r   r2  r   )rL   r   r@   rangess       rN   
set_rangeszSIMDKernel.set_ranges  s]    7|s4#3#34444 #&gt/?/?"@
 V$
 	
 
s   Ac                   t         j                  j                  | D cg c]  }g  c}| D cg c]  }j                  |       c}t	        j
                         fd}d }g }d}|D ];  }g }	|D ]  }
j                  |
d      r|	j                  d        )|t              k  r>j                  |   d      r)|dz  }|t              k  rj                  |   d      r)|dz   t              k  roj                  |
|         rZj                  |
|         st        |   }t        |
|         }|	j                   || |||       ||dz   |                   |	j                  t        j                   |||
                   " |j                  |	       > t        d D              sJ d d|        |fS c c}w c c}w )	Nc                    j                  |      }j                  |    |      st        t        |    |      | <   |    j	                  |       t              S rB   )r   statically_known_multiple_of	CantSplitr   r|   ry   )r  r}   
new_ranges	remainingsv	var_counts     rN   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_range  sZ    ;;t$D229Q<F#IaL$7IaLqM  &	?"rO   c                      fd}|S )Nc                     |    z  |    z   S rB   r   )	flat_varsidx1idx2sizes    rN   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s    io-	$??rO   r   )rT  rR  rS  rU  s   ``` rN   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s    @ MrO   r   r4   c                ,    t        j                  d      S )Nr   )r]   r^   )_s    rN   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>*  s    EMM!4D rO   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 yw)r4   Nr3   ru   rv   r   r   s     rN   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>J  s/      
34AGG&&q)Q.
s   68zfailed to set ranges  )r3   ru   rv   r   r   r   rw   r|   r   statically_known_gtrH  rI  r   operator
itemgetterr   )r   r   rX  grN  rV  return_getters_groupscurrent_grouplength_groupreturn_gettersrT  size1size2rJ  rK  rL  rM  s                @@@@rN   _split_iteration_rangesz"SIMDKernel._split_iteration_ranges  s    WW:@-AQb-A
-34R[[^4	OO%		#	 !## "	9LN$ --dA6"))*DE#c)n49S9Sm,a: "Q&M	 $c)n49S9Sm,a: !1$s9~5":P:P)M2; ::i6 (%m4E$T9]+CDE"))%!%mU;%ma&7? #)) ++ImT,JK;@ "((8E"	9H  
8A
 
 	9"9+Qwi8	9 
 000{ .B4s
   	GGc                H    	 | j                  ||       y# t        $ r Y yw xY w)NTF)rf  rI  )clsr   r   s      rN   is_compatiblezSIMDKernel.is_compatibleP  s,    	''8 		s    	!!c           
     (   | j                   D cg c]  }|j                   }}| j                  st        j                  d      |d<   t        |      t        | j                         k(  r+t        d t        ||      D              r | j                  | S | j                  ||      \  }}t        t        j                  j                   | j                  |             }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        r4   r   c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywr   Nr3   ru   rv   r   r.   )r   r   r_  s      rN   r   z2SIMDKernel.split_and_set_ranges.<locals>.<genexpr>k  s@      9
1 GG%%mA&6&:;q@9
s   AA)r   rH   r   r]   r^   r   r   r2  rE  rf  r   r   r  r  )	rL   r   rtr   rJ  r`  r   fnsfns	            rN   split_and_set_rangeszSIMDKernel.split_and_set_rangesZ  s     &*%5%56r"((66$$q)F2Jw<3t//00S 9
GV,9
 6
 #4??G,,,0,H,HG-
)
) 	55odooz6RST8MN,"H,NN 7 -Ns   D#	D,D	;D	Dc                6    t        |t        j                        S rB   )r   r   TMPrL   rd   s     rN   is_indirect_indexingzSIMDKernel.is_indirect_indexingw  s    "5$((33rO   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                        D              S )NFr4   c              3  F   K   | ]  \  }} |       |      k7    y wrB   r   )r   	idx_range
iter_ranger   s      rN   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s,      
%	: Y8J#77
s   !)ru  r   r   r   r{   r   r   rV   rd   r@   r3   ru   rv   r   anyr2  )rL   rd   index_numelsrR   entryr   s        @rN   is_broadcastedzSIMDKernel.is_broadcasted{  s    $$U+sS--(( 	=FT222))&1Eell,?@@@++,<,	= 77##,, 
),\4;;)G
 
 	
rO   c                    t        |t              r)ddj                  t        | j                  |             dS | j                  | j                  |            S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        r  rl   r  )r   r   r   mapindex_to_strr   rename_indexingrt  s     rN   r  zSIMDKernel.index_to_str  sN     eT"tyyT%6%6!>?@BBzz$..u566rO   c                n   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i | j                  |      }t        |t               s|n|j"                  d   }| j%                  |      S )Nr   c              3  p   K   | ].  }t        |t        j                  t        j                  f       0 y wrB   )r   r   r   PRECOMPUTED_SIZEr   s     rN   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>  s.      , #1tyy$2G2G&HI,s   46)r   r/   r3   ru   rv   precomputed_replacementsr   atomsr]   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)rL   rd   ar   replacements
simp_indexs         rN   prepare_indexingzSIMDKernel.prepare_indexing  sG    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ 	< ..w<!# ,$, ) %&qww'7'7'O'OPQ'R#SL&ul;E	< ++E2
 )X>JJOOTUDV 	 $$Z00rO   c                r   | j                   D cg c]   }|j                  dk7  s| j                  s|" }}|rut        |      dkD  rgt	        d |D              }dj                  d |d | D              d| d  k(  s"J |d | D cg c]  }|j                   c}       t        |d |       |d | |S c c}w c c}w )Nrc   r4   c              3  8   K   | ]  }|j                   d v   yw)r   NrI   r   ts     rN   r   z0SIMDKernel.active_range_trees.<locals>.<genexpr>  s     9aE)9s   r   c              3  4   K   | ]  }|j                     y wrB   r  r  s     rN   r   z0SIMDKernel.active_range_trees.<locals>.<genexpr>  s     ;188;s   zyx)r   rI   r   r   r  r   r   )rL   reorderr  treesr   s        rN   active_range_treeszSIMDKernel.active_range_trees  s    ''
188s?d>S>SA
 
 s5zA~9599E77;U6E];;ueVW~M "'-PP M %U6E]3E&5M

Ps    B/B/B4c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr   r   )r3   ru   rv   r   rG   sortedr   rS   r{   r   r  r   r/   r}   r   )rL   r}   symr  pss        rN   r  zSIMDKernel.codegen_indexing  s    ww44T4??;LM$++5 	5Cd+++  "//4EEG TB'(ww'7'7'O'OPR'SL$T|$q(6@--c2777D))#.3 %%c*224	5 rO   c                    t        d      )NzNYI: codegen_nan_checkNotImplementedErrorrQ   s    rN   codegen_nan_checkzSIMDKernel.codegen_nan_check  s    !":;;rO   c                    t        d      )NzNYI: call_kernelr  )rL   rE   rq   s      rN   call_kernelzSIMDKernel.call_kernel  s    !"455rO   c              #     K   | j                   }| j                  }|rt        j                  ||      }t	        j
                  |      }|| _         || _        	 | || _         || _        y# || _         || _        w xY ww)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr1   logical_andr2   _unwrap)rL   maskr   r  	prior_vals        rN   
mask_loadszSIMDKernel.mask_loads  sy      $$	??4/D!!$' 	)J#DO(D $DO(Ds   AA=A* A=*A::A=c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r4   r   )r{   r  r}   r/   r   r-   rE   )	rL   rd   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            rN   get_strides_of_loadzSIMDKernel.get_strides_of_load  s     8<7L7L7R7R7T Utq!AFF U U'/DE** 	J":??3A#$6A?*"QFC GAJ	
  !Vs   Bc                \    t        |t              rt        t        | |            S  | |      S rB   )r   tupler  )rp  r   s     rN   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalar  s'    eU#R((%yrO   c           	        g }t        t        | j                  j                  j	                                     }| j                  j                         \  }}}}t        j                  j                  j                  t        | j                              }t        |      D ]?  \  }}|| j                  vr|j                  d       't        j                  j                  |      }t        j                  j                  j                  |      }	|	|kD  rzt!               }
d}| j                  |   D ]M  }t#        |t$        t&        f      r|
j)                  d|        |dz  }3|
j)                  |j*                         O t        |
      |z  }n|	}t        j                  j-                  |      }t/        |      }|j                  ||z  dt1        ||k        z   z         B t3        |      S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   no_index_dep_r4   )r   r0   r   inplace_buffersro   python_argdefsr3   ru   rv   r   r.   r   r   r   r|   	get_numelr   r   r   r    r   rd   	get_dtyper*   r   r  )rL   nbytesninplace_argsrX  	call_args	out_numelr  r   	arg_numelbuf_sizer	  no_index_dep_countdeprH   dtype
dtype_sizes                   rN   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytes  s    F499#<#<#C#C#EFG!YY5579a GG$$..}T[[/IJ		* 	MFAs $+++a ))#.Iww''11)<H)# ,6<%&",,S1 /C!#'9:m4F3G$HI*a/*CII./ Gy0 GG%%c*E'.JMM%*,CM8I4J0JKL9	M: 6{rO   c           	        t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}}d}|D ]3  }t        j                  j                  |      }|s&t        |j                  j                        dk(  sIt        |j                  j                  D 	cg c]
  }	|	dk(  s	|	 c}	      dk(  rt        j                  |j                  j                        }
||
}||
k7  st        d| dd|
 d	| z         }t        j!                  |       |D cg c]i  }t        j                  j                  |      rFt        j                  t        j                  j#                  |      j                  j                        ndk }}|D cg c]V  }t        j                  j                  |      r3t        j                  j#                  |      j                  j                  ndX }}|D cg c]@  }|t        j                  j$                  v rd
n|t        j                  j&                  v rdndB }}t        d| d| d| d| d| dz         }t        j!                  |        y t)        d| d      }t        j!                  |       yc c}	w c c}w c c}w c c}w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r4   r   N   r   zExpected stride order z, but found stride orderr[  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr  r  r3   ru   try_get_bufferlayoutrT  r   get_stride_orderstrider&   logwarning
get_buffergraph_inputsname_to_bufferr%   )rL   kernel_nameargdefsr  	signaturerX  uniform_stride_orderarg_namebufr   stride_ordermsgrE   stride_order_list	size_listsource_lists                   rN   warn_mix_layoutzSIMDKernel.warn_mix_layoutT  s    		''(A-DII,,-2DII--.!3
 +/99+C+C+E(Iq#! *	H''((2Cs3::??+q03::??=aa1f=>!C!223::3D3DE'/+7()\9%01E0FF^_l^<}EFC KK$ %.	) ! 7711$7 ++AGG,>,>t,D,K,K,R,RS!")% ) %.	! ! 7711$7 **4077<<!"!I ! %.# !	  177#7#77 %  177#9#99 2!	"#K # &(		{,WhVij&ykk]"MNC KK$U*	V 3K=@TU
 	CU >)!#s!   1
K'
<K'
0A.K,$AK1AK6c                x   t        j                  ||d|      }d| _        t        j                  | j                  d   |      }t        j
                  ||      }d| _        t        j                  ||      }t        j                  ||      }t        j                  ||d|      }t        j                  |||f      S )Nr  Fr   T)
r1   	reductionr   
index_exprr   truedivsubmulr2   r  )	rL   r  r   sum_rnumelmeandxdx2m2s	            rN   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback  s    }}UE5%8 %B7{{4( $WWUD!ggb"o]]5%4!!4V"455rO   c                    t         rB   r  rQ   s    rN   codegen_kernelzSIMDKernel.codegen_kernel      !!rO   c                     y rB   r   rQ   s    rN   r=  zSIMDKernel.codegen_body      rO   c                     y rB   r   )rL   r|  s     rN   r   z)SIMDKernel.codegen_iteration_ranges_entry  r  rO   )r   rS   r   zOptional[OrderedSet[str]]rW   rX   )r	  zSequence[sympy.Expr])rE   rS   rd   rT   r   r5   rW   r   )rW   z	List[str])rd   rT   r   rV   )r   zIterable[sympy.Expr]r   zSequence[Sequence[sympy.Expr]])r   zList[List[sympy.Expr]]r   )rd   rT   rW   rS   F)r}   rT   rW   rX   rB   )rE   rS   rq   zOptional[IRNode]rW   rX   )r|  rx   )4rY   rZ   r[   r\   pexprsexpr__annotations__allow_block_ptrr$   DEFAULTrD   r   r   r
  r  r   rG   r  r  r$  r'  r   r   r-  r:  rB  rE  staticmethodrf  classmethodri  rq  ru  r}  r  r  r  r  r  r  r@  rA  r  r  r  r  r  r  r  r=  r   r_   r`   s   @rN   rU   rU   -  s{    E&&O 04$,,&*,. ,. -	,. 
,.\"H*
R'
'
$>$
0
 A1$A1/MA1 A1F )4R O:4
,7$1$1L
 <6 ) )"0  
<|?B
6"rO   rU   c                  t    e Zd ZeZdZdZd fdZd Zd Z	e	Z
e	Zd Z	 	 ddZed        Ze	 	 	 	 	 	 dd	       Zed
        Zd Zd Zd Zd Z	 d	 d dZd Z	 d	 	 	 	 	 	 	 	 	 	 	 d!dZd Ze ej8                  d      d               Ze ej>                  d      fd       Z d Z!d"dZ"ddZ#d Z$d Z% xZ&S )#SIMDSchedulingztorch.int32ztorch.int64c                0    t         |           || _        y rB   )rC   rD   r   )rL   r   rM   s     rN   rD   zSIMDScheduling.__init__  s    "rO   c                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wrB   rm  r   s     rN   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>  s*     PQQWW%%..}Q/?@Ps   <>)r  r&  s     rN   group_fnzSIMDScheduling.group_fn  s    P%PPPrO   c                   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }}|j                  \  }\  t        ||      }|j                         r)|j                         s|j                         rA |d       n8|j                         r(|j                         s|j                         r |d       |j                         r,|j                         r|k(  xr |k(  }|s |d||       |S |j                         sC|j                         s2|k(  r|k(  s |d||       y|j                         r&t        |j                  t              }|s |d       |S | j                  |j                         ||      }	| j                  |j                         ||      }
| j                  |j                         |j                         z   ||      }t        j                  j                  rVd}t!        |	      dkD  r%t!        |
      dkD  r|	|
cxk(  xr |k(  nc }n|	|k(  }nt!        |
      dkD  r|
|k(  }|s |d|	|
|       yy|j                         s|j                         r|d	k(  rd	k7  sJ |z  k(  rt#        fd
|j                         D              s	 |d       yt        j                  j$                  rE|j                         s5| j                  |j                         |      |d	fd	ffv }|s |d       |S y|k7  r |d       |k(  S |j                         r|j                         rJ | j'                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz!node1 is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r4   c              3  j   K   | ]*  }t         j                  f|j                                , y wrB   )rU   ri  
get_ranges)r   r   numel2rnumel2s     rN   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>	  s1       ,,fg->Os   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fusegroupr)   is_split_scanr  is_templaterq   r"   select_tiling	get_nodesr   triton tiling_prevents_pointwise_fusionr   r    tiling_prevents_reduction_fusioncan_fuse_horizontal)rL   node1node2rX  numel1rnumel1whyreduction_can_fuseis_triton_templatetiling1tiling2tiling3condis_reduction_tiling_validr
  r  s                 @@rN   r  zSIMDScheduling.can_fuse  s    eYAABj977G
 77@@NN${{FG${{FGu% )<)<)>!!#<=  "5+>+>+@!!#<=E$6$6$8!'6!1!Hg6H%G &%!!#E,>,>,@f$G);K   " &0

<P%Q");<)) (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/040B0B)61  !,1- 5:;4412V##!!#E,>,>,@@@''u55rO   c           
        g t               t               fd}fd}fd}t        j                  fd       }fd}t        |      D ]  \  }	}
|
v rj	                  |
        ||
      r# ||
      r |       5  	 d d d         ||
       G ||
      r" |       5  j                  |
       d d d        qt        d d d|
j                  d	           S # 1 sw Y   `xY w# 1 sw Y   xY w)
Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S r   r  r   rX  
node_numelnode_rnumelrH   r  s       rN   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body0  sH    +,77(A(
K%'AK6,A efn,A1ArO   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S r   r%  r&  s       rN   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction6  s4    +,77(A(
K&K;!+;K!KrO   c                   j                  |        j                  |        | j                         rt        | t        j
                        rtt        | j                  t        j                        rOt        | j                  j                  t        j                        s j                  | j                                y y y y y rB   )r   r|   r  r   r   SchedulerNoderq   r   ComputedBufferdataScanget_name)r   donenode_schedulenot_ready_yet_nodess    rN   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop:  s    HHQK  #  q)"9"9:qvvr'8'89"166;;8#''

5 9 : ; !rO   c               3     K    r d   t         u r j                          n j                  t               d   j                  t                j	                          y w)Nr   )r9  popr|   DisableReductionclear)r3  r4  s   rN   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loopG  sN     r!2o!E!!#$$%56  1%%'s   AA!c                    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ t	              S )Nr4   Fr   )	ancestorsr   r9  r8  r   )rq   r3  r4  r  s     rN   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reductionQ  sN    {&7 b!O5E#F*   +,,rO   zunexpected group: (rl   z) != r4   )r   r@  rA  r   r   r|   r  r  )rL   re   rH   r  r)  r+  r5  r:  r=  rd   rq   r2  r3  r4  s     ``       @@@rN   generate_node_schedulez%SIMDScheduling.generate_node_schedule)  s   #%8B 0:|		L	6 
	"	"	( 
#	(	- %U+ 	KE4t|HHTN &6t]K35  &d+'-/1 /!((./ / *)%6(%

1O 	&  
/ /s   C./C:.C7	:D	c                   |j                         }t        |d       j                  \  }\  }}| j                  |||      }t	        j
                  t              }|D ]R  }|j                  j                  |j                  j                  z  D ]   }||j                     j                  |       " T t        j                  d|       | j                  ||||      S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        c                4    t        | j                               S rB   r   r  r   s    rN   r   z-SIMDScheduling.codegen_node.<locals>.<lambda>y  s    c!..:J6K rO   r   zSchedule:
 %s)r  maxr  r>  r   r   r   read_writesreadswritesrE   r|   schedule_logdebugcodegen_node_schedule)	rL   rq   re   rX  rH   r  r3  r   accesss	            rN   codegen_nodezSIMDScheduling.codegen_nodep  s     04~~/? ,KLRR?E633E5&I"..t4 	9D**0043C3C3J3JJ 9V[[)0089	9 	+];))-ufUUrO   c                "   | j                         sJ t        d t        j                  | j                  j
                  | j                  j                        D              rt        j                  S | j                  j                  j                  S )Nc              3  <   K   | ]  }|j                           y wrB   )is_contiguousr   r  s     rN   r   z0SIMDScheduling.reduction_hint.<locals>.<genexpr>  s!      
 
s   )r  r   r   r  rC  rD  rE  r$   INNERrq   r/  r   )rq   s    rN   r   zSIMDScheduling.reduction_hint  sn      """ 
 t'7'7'='=t?O?O?V?VW
 
 !&&&99>>000rO   c                   t        j                  t         j                        j                  t        j
                  j                  j                  t        j
                  j                  j                  j                  fd |       sy|D cg c]H  }t        |j                         t        j                        s|j                         j                         J }}t        fd|D              syt        j
                  j                  j!                  |        |D ],  }t        j
                  j                  j!                  |       . yc c}w )Nc                    t         j                  j                  j                  | k        ry |       xr  |       k  S )NT)r3   ru   rv   is_expr_static_and_true)r   has_hintint_maxr   s    rN   within_32bitz;SIMDScheduling.can_use_32bit_indexing.<locals>.within_32bit  s<     ww77WEA;:9Q<7#::rO   Fc              3  .   K   | ]  } |        y wrB   r   )r   rT  rU  s     rN   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>  s     <$<%<   T)torchiinfoint32rB  r3   ru   rv   r   	shape_envrS  r   
get_layoutr   MultiOutputLayoutstorage_sizer   	guard_leq)	rH   buffersr  	buf_sizesrT  rS  rT  r   rU  s	        @@@@rN   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexing  s    ++ekk*..GG$$..	77##--66	; E" 
cnn.0D0DE NN))+
	 
 <)<< 	
""5'2 	6DGG&&tW5	6
s   AEc                   t               }|D ][  }t        |t        j                        s|j	                  |j                                |j	                  |j                                ] dd}|D cg c]!  }t        j                  j                  |      # }}||z  }	t        j                  |	|      r| j                  S | j                  S c c}w )Nc                d    t         j                  j                  |       }|t        d|        |S )Nz$Failed to find buffer matching name )r3   ru   r  RuntimeError)rE   r  s     rN   _get_bufferz6SIMDScheduling.select_index_dtype.<locals>._get_buffer  s3    ''$$T*C{"%I$#PQQJrO   )rE   rS   rW   zUnion[ir.Buffer, ir.TensorBox])r   r   r   r'   updateget_buffer_namesused_buffer_namesr3   ru   r  r  rb  
int32_type
int64_type)
rh  r3  rH   reduction_numelbuffer_namesrq   rf  rE   r`  total_numels
             rN   select_index_dtypez!SIMDScheduling.select_index_dtype  s     )3! 	:DdI$?$?@ 5 5 78 6 6 89	:	 9EE177%%d+EE
 o-00gF>>!~~ Fs   3&Cc                    t        t        fd|            }|D ]R  }t        d t        j                  |j
                  j                  |j
                  j                        D              rR y y)Nc                z    | t         t        fvxr+ | j                          xr | j                  d   d   z  k(  S )Nr4   r   )r9  r8  r  r  )r   rH   r  s    rN   r   zJSIMDScheduling.has_non_contiguous_pw_in_reduction_kernel.<locals>.<lambda>  sD    !O5E#FF 4((4GGAJqMUV^3 rO   c              3     K   | ]e  }t        |t               xsN |j                         xs< t        |j                  t        j
                  t        f      xs |j                          g y wrB   )r   r   rM  rd   r]   r^   r   stride1_for_last_dimrN  s     rN   r   zKSIMDScheduling.has_non_contiguous_pw_in_reduction_kernel.<locals>.<genexpr>  si      
 	 sI.. .$$&.cii%--)=>. ++-.s   A+A-TF)r   filterr   r   r  rC  rD  rE  )rL   r3  rH   r  pointwise_nodesrq   s     ``  rN   )has_non_contiguous_pw_in_reduction_kernelz8SIMDScheduling.has_non_contiguous_pw_in_reduction_kernel  sw    4 	
 $ 	D 
 %??$$**D,<,<,C,C  	 rO   c                b   t        t        d |            }t        |      dkD  r|D cg c]  }| j                  |       }}|j	                  |d         t        |      k(  r|d   }nt
        j                  }|t
        j                  k(  r4| j                  |||      r!t
        j                  }nt
        j                  }t               }|D ]E  }	|	t        t        fv r|	j                         D ]!  }
|j                  |
j                                # G | j                  |||      }|||fS c c}w )Nc                B    | t         t        fvxr | j                         S rB   )r9  r8  r  r   s    rN   r   z0SIMDScheduling.get_kernel_args.<locals>.<lambda>  s#    !O5E#FF %NN$ rO   r   )r   rt  r   r   r   r$   r  rO  rv  r   r8  r9  get_outputsrg  get_mutationsro  )rL   r3  rH   rl  
reductionsr   hintsreduction_hint_valr   rq   r  r   s               rN   get_kernel_argszSIMDScheduling.get_kernel_args  s5   %

 z?Q5?@T((+@E@{{58$E
2%*1X"%2%:%:" #m&9&99BB!5/ &3%:%:"!.!6!6%/\	! 	6D(/::'') 6  !2!2!456		6 --mUOT!9k995 As   D,c                |   ddl m} | j                  |||      }| j                  |||      \  }}}	t	        d |D              }
| j
                  }|
rt        ||      r|}|}t        |||	      }d t	        fd|D              }|rd|d<    ||i |}||_        d }|j                  rt        j                  j                  r|s | j
                  |i |dd	i}| j                  ||       t        j                  |      5  |j!                         }d d d        | j#                  ||      }||_        t'        |      |_        t)        |j*                        |_        | j                  ||       t        j                  |      5  |j!                         }d d d        | j#                  ||      }t,        j/                  d
|       ||_        t'        |      |_        |t1        ||g      n|}t        j                  |      5  |D ]!  }|t2        t4        fvs|j7                          # 	 d d d        | j9                  |       |j;                  |j$                         t        j<                  r|j?                          t        j@                  r|jA                  |       t        jB                  xjD                  |jD                  z  c_"        t        jB                  xjF                  |jF                  z  c_#        t        jB                  jH                  jJ                  rt        jL                  r|jN                  jQ                         }|D ]  }tS        |tT        jV                        s|jY                         }||vr3|jZ                  J |jZ                  j]                         }|^t^        d   dxx   dz  cc<   t        jB                  jH                  ja                  d|jb                  d| d        | jT                  je                          y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr   )TritonSplitScanKernelc              3  `   K   | ]&  }t        |t              xr |j                          ( y wrB   )r   r'   r  )r   rq   s     rN   r   z7SIMDScheduling.codegen_node_schedule.<locals>.<genexpr>  s1      
 t./HD4F4F4HH
s   ,.r   r   r   c                    | t         t        fv ry| j                  j                  j                  j                  dd      }t        |      S )NFcall_methodr   )optarget)r9  r8  _body
root_blockru   
find_nodesr   )rq   
sort_nodess     rN   _node_has_sortz<SIMDScheduling.codegen_node_schedule.<locals>._node_has_sort*  sK    )9::..44??  @ J 
##rO   c              3  .   K   | ]  } |        y wrB   r   )r   rq   r  s     rN   r   z7SIMDScheduling.codegen_node_schedule.<locals>.<genexpr>5  s     F~d+FrW  Tr   Fz+Generating kernel code with kernel_name: %sinductorintermediate_hooksr4   zrun_intermediate_hooks(rl   r   )3)torch._inductor.codegen.triton_split_scanr  r  r  rz  kernel_type
issubclassr  r   r   r   r  multi_kernel!codegen_node_schedule_with_kernelr3   set_kernel_handlerr  define_kernelr  r   setmust_keep_buffersr  rG  r9   r9  r8  mark_runcodegen_commentr  nan_assertsr  r  ru   removed_buffersinplaced_to_removewrapper_codesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr   r   r'   r1  rq   get_origin_noder   	writelinerE   free_buffers)rL   r3  r   rH   rl  r  tiled_groupsr~  r   r   r  r  kernel_argskernel_kwargshas_sortrJ   kernel2	src_code2kernel_name2src_coder  final_kernelrq   	live_outsrE   origin_noder  s                             @rN   rH  z$SIMDScheduling.codegen_node_schedule  s    	T))-P
   G		
  
%
 
 !,,Z(={K/K"-#
	$ FFF=AM9:

 +(,&&6==+E+Eh&d&& /4G
 22='J%%g. 5#224	5--iOL".G )) 4G (+7+D+D'EF$..}fE!!&) 	/,,.H	/ ((=&I		?M($X.9@9L{FG#45RX!!,/ 	$% $1ABBMMO$	$
 	]+  !9!9:**,!!((5	<#?#??	""l&E&EE" GG  <<22 779I% !$	(C(CD}}y(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO 	##%w5 5	/ 	/	$ 	$s*   PP$P1P1P!$P.1P;c           	        d }|5  t        j                         }|j                   ||             i }|D ]  }|t        u r |j	                  |j                                +|t        u r|j                          D|j                          |j                  |j                               }|j                  t        j                  |j                  j                  |      j!                                       |j#                  |j%                                t'        |      D ]  \  }}|t        u r |j	                  |j                                .|t        u r+|j                          |j                   |||d               at)        |j                         |j                  |j                               }|j+                  |        	 d d d        y # 1 sw Y   y xY w)Nc                0    t        j                  d |       S )Nc                    | t         uS rB   )r8  ry  s    rN   r   zcSIMDScheduling.codegen_node_schedule_with_kernel.<locals>.current_reduction_nodes.<locals>.<lambda>  s    :J1J rO   )r   	takewhile)re   s    rN   current_reduction_nodeszQSIMDScheduling.codegen_node_schedule_with_kernel.<locals>.current_reduction_nodes  s    &&'JERRrO   )r@  	ExitStackr:  r8  enter_contextrB  r9  closedecide_inplace_updaterq  r	  rg  r  fromkeysr  indexing_from_argsro   r
  keysr   r#   r   )	rL   r3  rJ   r  stackall_indexingrq   r   r  s	            rN   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  s   	S  !	-((*E!!"9-"HIL & ++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN $$\%6%6%89 %]3 
-4++''(@(@(BC_,KKM))*A-PQPRBS*TU 6djjA!'!<!<T__=N!OJLL,
-/!	- !	- !	-s   GGGc           	     &   |j                   \  }\  }}|dk(  sJ |j                  j                  |j                        \  }}|5  |s|g|D ]  }	|	j                            |       }
|j	                  d      5  |D ]0  }	|	j                  |j                  |	j                                      2 	 ddd       ddd       t        
t              s$|
j                  d       |
j                  dd       t        j                  |      5  |j	                  d      5  t        |
t              r|
}n|
j                  d       |
j                  }ddd       |g|}t        j                  r|j!                         dz  }t        j"                  j$                  j'                  |j(                        }|j*                  J d	        |j,                  g ||j*                   }|j/                          d
 d
|j1                  ||      j3                          }|rcddd       S | j5                  ||      }ddd       | j7                         |j9                  |j                         t        j"                  xj:                  |j:                  z  c_        t        j"                  xj<                  |j<                  z  c_        | j>                  jA                          y# 1 sw Y   2xY w# 1 sw Y   7xY w# 1 sw Y   xY w# 1 sw Y   xY w)z
        Codegen a triton template

        If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
        r4   z<STORE_OUTPUT>Nz<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAzmeta is Noner  )!r  rq   make_kernel_renderr  set_subgraph_bodyr   rq  r	  r   rS   finalize_hookr3   r  coder   benchmark_kernelr  ru   rv   
size_hints
call_sizesmetagrid_fnimports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r  r  r   r  )rL   template_nodeepilogue_nodesonly_gen_src_coderX  rH   r  rJ   renderrq   partial_coder  r3  num_gb	grid_argsgridr  s                    rN   codegen_templatezSIMDScheduling.codegen_template  s    +00?E6{{&++>>}?Q?QR 	Q$*<^< $DMMO$!8L))*:; Q* QDLL!<!<T__=N!OPQQ	Q ,,&&~6&&{5&A!!&) 	N))*:; 1lC0+H ../?@+00H1 +<^<M&&99;cAGG,,778I8IJ	{{.>>.%v~~>y>&++>::<=Rj66vtDMMOPR  !-	N 	N0 ,,X}fMK1	N4 	]+;(:(:;	6#9#99	""f&?&??"##%QQ Q	Q 	Q1 1	N 	NsO   5K-96K 0K-	L1K:CL#L K*	%K--K7:L	?LLc                    t         j                  j                  j                  t         j                  j                  j                                y rB   )r3   ru   r  r  
device_opssynchronizerQ   s    rN   codegen_synczSIMDScheduling.codegen_sync  s-    	&&qww'9'9'E'E'GHrO   c           
        ddl m} |D cg c]  }|j                          }}i i }
}	t        ||      D ]  \  }}t	        |d       j
                  \  }\  }}| j                  |||      }| j                  |||      }||||f|
|<   | j                  |||      \  }}} |j                  ||||| d|	|<    |j                  || ||	|
      }t        j                  dt        |      |D cg c]  }t        |       c}       g }|D ]R  }|D cg c]  }|j                          }} |||      }t        ||      D ]  \  }}|r|D ]  }t               |_         | j!                  |
|   d	   |j#                  |	|                |	|   }|
|   d	   }|sEt%        j&                  |      5  |D ]!  }|t(        t*        fvs|j-                          # 	 d d d        t$        j.                  xj0                  |j0                  z  c_        t$        j.                  xj2                  |j2                  z  c_         |j5                         }|j7                  |||f       U |S c c}w c c}w c c}w # 1 sw Y   xY w)
Nr4   )ComboKernelc                4    t        | j                               S rB   rA  r   s    rN   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>      #ann>N:O rO   r   )r   r   r   optimize_mask)re   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groups)enable_autotunemixed_sizesr   )triton_combo_kernelr  r  r2  rB  r  r>  r  r  create_triton_kernelhorizontal_partitionr  rG  r   r   r   r  create_sub_kernelr3   r  r9  r8  r  ru   r  r  r  r|   )rL   subkernel_nodescustom_part_algorithmr  r  r  r  rq   fused_node_listssubkernel_mapnode_schedule_mappnre   rX  rH   r  r3  r  r~  r   r   
partitionsr   kernel_code_list
node_grouprJ   r   	subkernelr  s                                rN   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_code  s    	59HIDNN,II+-r(_.>? 	IB!$U0O!P!V!VAv 77ufMM--mUFKL$1<$Nb!
 $$]E6B	" @ @ @1#'"-o!M"	$ !55!"2$+ 6 

 			? '(SV(	

 $ 	DJ=GHT 0HH  /'F
 !-=> K	E$" 4'1|466%b)!,,,]2->? *"-	 1" 5a 8(--i8 0$1 0D#O=M+NN $00 ''9+D+DD'**i.J.JJ*#K& ,,.H##Xvz$BC7	D8  { J< )  I$0 0s#   I"-I'I,-I1I11I:c                   |j                         }|j                  }|j                  }t        j                  dkD  xs t        j                  dk(  xr |}| j                  ||||      }|D ]l  \  }}}	| j                  ||g|      }
| j                  |g       t        j                  d|
       |j                  t        j                  j                  |
       n | j                  j                          y )Nr4   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr  r  r  r  rG  r  r3   ru   r  r   r  )rL   combo_kernel_noder  r  r  r  r  r  rJ   rX  r  s              rN   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernel1  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::2O[
 $4 	BHfa,,X8I7JFSK  "3!45II:KHqww33[A		B 	##%rO       c           
        | j                         \  }}t        |      dk  ry| j                         }t        |j                        t        |      k(  sJ |j                  |j
                  g}t        d t        j                  j                  |      D              sJ t        j                  j                  |      D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}|j
                  D ch c]  }|j                   }}g }|D ]  }t        j                  j                   j#                  |j$                  |j                        }	t        |	      t        |      k(  sJ 	 |	j%                  d      dz   }
|
t        |      k(  rt        d |	|
d  D              r	 t        j                  j                   j)                  t+        |d |
             t        j                  j                   j)                  t+        ||
d              f}t        j                  j                   j-                  t+        d t/        ||	      D                    }|j                  |v r|dz  }t0        j3                  |d         r|dz  }t0        j3                  |d         r|dz  }t        j                  j                   j-                  |t+        t        j                  ||            z
        dk\  s|j5                  t1        |||j                                |S c c}w c c}w # t&        $ r Y w xY w)Nr4   r   c              3  H   K   | ]  }t        |t        t        f        y wrB   )r   r   r   rN  s     rN   r   z3SIMDScheduling.candidate_tilings.<locals>.<genexpr>T  s$      
 sY01
s    "c              3  &   K   | ]	  }|d k(    ywrl  r   r   s     rN   r   z3SIMDScheduling.candidate_tilings.<locals>.<genexpr>h  s     7!qAv7s   c              3  2   K   | ]  \  }}|d k7  s|  ywrl  r   )r   rT  r  s      rN   r   z3SIMDScheduling.candidate_tilings.<locals>.<genexpr>u  s      )T6Vq[Ds   r   r   )r	  r   pointwise_read_writes
range_varsrD  rE  r   r   r  r  rE   r3   ru   r  r   r   rv   stride_hintsrd   
ValueErrorr   r.   r   r2  CandidateTilingis_good_sizer|   )rq   rD  reduction_rangesrwdep_sourcesr  depswrite_namestilingsr  splitr  scores                rN   candidate_tilingsz SIMDScheduling.candidate_tilingsE  s    $(??#4  v;!'')2==!S[000 xx+ 
 44[A
 
 	
 
 !44[A
xxqww666:c9;U 
 

 ,.995Csxx55)+ &	OCgg&&33CIIr}}MGw<3v;...
a(1,CK'7wuv77  8   ))-v*GH  ))-uv*GHL
 GG$$.. -0-A E
 xx;&
++LO<
++LO<
   **M)//&BR*STT 
 |UCHHMNM&	ON a

 6   s$    ?L1L6"L;'L;;	MMr4   c                   |dk7  st         j                  j                  dk  rvt        j                  t
        j                  k  rQt        j                  |      D ]9  }t        | j                  |            dkD  s!t        j                  d        ||fS  ||fS t               }t        j                         }t        j                  |      D ]c  }| j                  |      D ]M  }|j                  |v r|j!                  |j                         ||j"                  xx   |j$                  z  cc<   O e |j'                         D cg c]  \  }}|	 }	}}t         j                  j                  dk\  rt)        dt        |	            D ]  }
|	d   \  }}|	|
   \  }}t*        j,                  j.                  j1                  ||z
        dk(  rCt*        j,                  j.                  j1                  ||z
        dk  r|	|
   \  }}|	d   \  }}t*        j,                  j.                  j1                  ||z
        dkD  sJ t*        j,                  j.                  j3                  ||      s|t5        ||      |f}|g|	z   }	 n t        |	      dkD  rt        j                  d|	       t         j                  j6                  rt        j                  |      D cg c]/  }t9        |t:        j<                        r|j?                         d   1 }}t               }|D ]q  }tA        dt        |      t         j                  j                  z
        }|dz   }tC        |d|       }|gtE        ||d       z   }|j!                  tG        |             s tI        |t        d      }||	z   }	|	D ]!  }g ||tK        fd	|D              sc S  ||fS c c}}w c c}w )
z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r4   r   z"reduction over non-contiguous dimsr   zpossibly bad tiling: %sNT)r   reversec              3     K   | ]B  }t        |t        j                        r&t        j	                  |j                                D y wrB   )r   r   r-  rU   ri  r	  )r   rq   
new_groupss     rN   r   z/SIMDScheduling.select_tiling.<locals>.<genexpr>  s=      dI$;$;< ((T__5FGs   AA)&r   r  	max_tilesperf_hint_loglevelloggingWARNINGr9  rt  r   r  infor   r   r   rE   r   tilingr  most_commonranger3   ru   rv   r   rH  r   prefer_nd_tilingr   r   r-  r	  rB  r.   r   r  r  r   )rh  r3  rH   rl  rq   
seen_namescandidate_tilesr  r  ranked_tilingsr  a0a1b0b1node_rangesnew_tilings
node_rangenum_leading_dimsfirst_trailing_dimcollapsed_leading_dimranked_new_tilingsr  r  s                          @rN   r  zSIMDScheduling.select_tiling  s    a6==#:#:a#? ""goo5+22=A D30067!;%**+OP?++	 ?++&0l
(3(;(;(=#**=9 	?D//5 ?;;*,v{{+.&,,>.	?	? 7F6Q6Q6ST]VU&TT==""a' 1c.12 '*B'*B77##--b2g6!;77##--b2g6:+A.FB+A.FBww''11"r':Q>>>77##@@RH (2r"2B7F&,X%>N ~"8.I ==)) ,22=AdI$;$;< !!$K 
 :DK) /
#&q#j/FMM<S<S*S#T %5%9"(5jATBT6U(V%/04
CUCV8W3XXf./ "(d!K/.@N* 	"L9<99J ) 
 "!	" ''w U>s   O
4Oc                     y rB   r   rQ   s    rN   flushzSIMDScheduling.flush  r  rO   c                     yr   r   rQ   s    rN   ready_to_flushzSIMDScheduling.ready_to_flush  r   rO   c                   t         j                   G d d             }|D cg c]  } |||j                         }}|D ]  }t               |_         |d   j	                         st        |d       j                  \  }\  }}| j                  |||      }	| j                  |	||      }
| j                  |	||      \  }}} | j                  |
|||d}| j                  |	|       t        j                  d|      5  t        j                  |      5  |j!                         }d d d        d d d        n=|d   }|dd  }t        j                  d|      5  | j#                  ||d	
      }d d d        j%                  t'        t(        j*                        d      }|S c c}w # 1 sw Y   xY w# 1 sw Y   ExY w# 1 sw Y   QxY w)Nc                  *    e Zd ZU ded<   ded<   ddZy)GSIMDScheduling.generate_kernel_code_from_nodes.<locals>.LastUsageHolderr   r   r   c                :    | j                   | j                  _         y rB   )r   r   rQ   s    rN   __del__zOSIMDScheduling.generate_kernel_code_from_nodes.<locals>.LastUsageHolder.__del__  s    $(OO!rO   Nr  )rY   rZ   r[   r  r1  r   rO   rN   LastUsageHolderr/    s    FO4rO   r2  r   c                4    t        | j                               S rB   rA  r   s    rN   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>  r  rO   r   r  r  r4   T)r  triton_)dataclasses	dataclassr   r   r  rB  r  r>  r  r  r  r  r   patchr3   r  r  r  replacerS   r,   KERNEL_NAME)rL   re   r  r2  r   last_usage_holdersrX  rH   r  r3  r  r~  r   r   rJ   r  r  r  s                     rN   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodes  s   				4 	4 
	4 INN1oa>NN  	(A%<AL	( Qx##%!$U0O!P!V!VAv 77ufMM--mUFKL9=9M9Muf:6	; &T%%1#'	F 22=&I"$4 3##F+3 "0023 3 3
 "!HM"12YN02BC 00!>T 1 
 ##C(?(?$@)LK O.3 3 3 3 s5   F>F/F#%F/F;#F,	(F//F8;Gc                     y rB   r   )rL   r3  s     rN   r  zSIMDScheduling.codegen_comment  r  rO   c                    t         rB   r  )rL   r  r3  rJ   s       rN   r  zSIMDScheduling.define_kernel  r  rO   r  )rq   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rH   rT   r`  z(Iterable[Union[ir.Buffer, ir.TensorBox]]rW   r   r  )rW   Optional[str])r  zList[BaseSchedulerNode]r  r   r  r   r  r   r  r   rW   zList[Tuple[str, Any, Any]]r  )'rY   rZ   r[   rU   r  rj  rk  rD   r  r  can_fuse_verticalr  r>  rJ  r  r   rb  r   ro  rv  r  rH  r  r  r  r  r  r   r   r  r]   r^   r  r*  r,  r;  r  r  r_   r`   s   @rN   r  r    s   KJJ#Qm6^ !"ENVPV* 1 1 ""$L"	" "H  :.#:Js&j%-P @E8	8tI #(G 0G   $G  	G 
 G   G  
$G R&( YA  AF ANqAQ W( W(r.`"rO   r  c                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	r  zTuple[sympy.Expr, sympy.Expr]r  r   r  Nr>  rE   c                r    t         j                  j                  j                  |       } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   rZ  )r   s    rN   r  zCandidateTiling.is_good_size'  s5     GG&&q)Bw(AFaK(rO   )rY   rZ   r[   r  rE   r  r  r   rO   rN   r  r  !  s)    ))JD-) )rO   r  c                      e Zd ZdZy)r8  z
    Marker to invoke `kernel.disable_reduction()`.  This closes a
    reduction loop and allows for pointwise ops to occur on the output
    of a reduction.
    N)rY   rZ   r[   r\   r   rO   rN   r8  r8  .  s    rO   r8  c                       e Zd ZdZed        Zy)r9  z1
    Marker to end a DisableReduction block.
    c              #  Z   K   d}| D ]   }|t         t        fv r	|t        u }|r| " yw)zf
        Get the nodes from node_schedule skipping those in a
        DisableReduction block.
        FN)r9  r8  )r3  disabledrq   s      rN   rt  zEnableReduction.filter;  s@      ! 	D)9::#33
	s   )+N)rY   rZ   r[   r\   r  rt  r   rO   rN   r9  r9  6  s      rO   r9  c                      e Zd Zy)rI  N)rY   rZ   r[   r   rO   rN   rI  rI  L  s    rO   rI  )e
__future__r   r   r@  r5  r   r   r  r   r]  typingr   r   r   r   r   r	   r
   r   r   r   r   r]   rX  torch._loggingtorch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   r   r   r   r   	codecacher   dependenciesr   r   r   r    r!   r"   optimize_indexingr#   runtime.hintsr$   runtime.runtime_utilsr%   r&   r'   r(   r)   utilsr*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   commonr5   r6   r7   r8   r  r9   	getLoggerrY   r  _logginggetArtifactLoggerr  rF  
fusion_logdoprintr  r6  r>   rV   rx   r   rU   r  r  r8  r9  	ExceptionrI  r   rO   rN   <module>r\     s   "               / L L O O % $ $ ! ; ; - A ) ; D D   - , P P % g!00<H~~//*E^^--hA
 	 (- (- (-VuA/ uAp:'? :'z{	 {	|s"^ s"l 	) 	) 	)  ,		 	rO   