
    sg9                   V   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZm Z  d dl!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d d	l+m,Z, d
dl-m.Z.m/Z/m0Z0m1Z1m2Z2 d
dl3m4Z4 d
dl5m6Z6m7Z7m8Z8 d
dl9m:Z: d
dl0m;Z;m<Z<m=Z=m>Z> d
dl1m?Z?m@Z@mAZA d
dlBmCZC d
dlDmEZEmFZF d
dlGmHZH d
dlImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZT d
dlUmVZV  ej                  eX      ZYej                  j                  eXd      Z\ej                  j                  eXd      Z]ej                   G d d             Z_ G d d      Z` G d d      Zad9dZb G d d       Zc	 	 	 	 	 	 	 	 d:d!Zdej                  j                  j                  ej                  j                  j                  ej                  j                  j                  ej                  j                  j                  d"Zk G d# d$e`      Zl G d% d&e`      Zm G d' d(e`      Znd;d)Zo	 	 	 	 	 	 	 	 d<d+Zp G d, d-e`      Zq G d. d/eq      Zr G d0 d1e`      Zs	 d=	 	 	 	 	 	 	 d>d2Ztej                   G d3 d4             Zu ej                         Zw G d5 d*      Zx G d6 d7      Zyd?d8Zzy)@    )annotationsN)AnyCallableCounterDefaultDictDictGenericListOptionalSequenceSetTupleTypeVarUnion)countersdynamo_timed)get_metric_tableis_metric_table_enabled)free_unbacked_symbols)
OrderedSet)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)
write_text)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)ComputedBufferMultiOutputMultiOutputLayout)LoopBody)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_waitsympy_product)Vfusionloop_orderingc                      e Zd ZU ded<   ded<   ded<    ej
                  e      Zded	<   dd
ZddZ	ddZ
ddZddZddZddZddZy)SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeBaseSchedulerNodedefining_op)default_factoryList[NodeUser]usersc                @    t        | j                  j                        S N)hashrB   nameselfs    L/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/scheduler.py__hash__zSchedulerBuffer.__hash__P   s    DIINN##    c                v   t               }| j                         }|j                  | dt        | j                        j
                          |j                  | d| j                  j                          | j                         r-|j                  | dt        | j                                       | j                         r-|j                  | dt        | j                                       t        | j                        dk  r0|j                  | d| j                          |j                         S |j                  | d       |j                  d      5  | j                  D ]  }|j                  | d        	 d d d        |j                  d	       |j                         S # 1 sw Y   *xY w)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])r6   get_name	writelinetyperB   __name__layoutget_aliasespformatget_mutationslenrG   indentgetrawvalue)rM   resultrK   users       rN   	debug_strzSchedulerBuffer.debug_strS   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! 1 JJ 1D$$vQZ011 S!!!##	1 1s   &F//F8c                6    | j                   j                         S rI   rB   rU   rL   s    rN   rU   zSchedulerBuffer.get_nameg       yy!!##rP   c                   | j                   J | j                   j                         sy | j                   j                         s| j                   j                         r4t        j
                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j
                  j                  j                  | j                  j                  t        j                  j                  | j                               j                   | j                          y t        j
                  j                  j                  | j                          y )Nargs)rB   should_allocateget_inputs_that_alias_outputget_mutation_namesr;   graphwrapper_codecodegen_allocationhasattrkernelrU   inplace_update_bufferscodegen_inplace_reuserA   name_to_bufrL   s    rN   allocatezSchedulerBuffer.allocatej   s    yy$$$yy((*99113tyy7S7S7UGG  33DII> AHHf%188#B#BBGG  66**HH33DMMOD$			 GG  33DII>rP   c                    | j                   J t        | j                   j                  t        j                        ry| j
                  D ]  }t        |j                   t              s y y)NFT)rB   
isinstancerY   r   
NoneLayoutrG   
OutputNode)rM   uses     rN   can_freezSchedulerBuffer.can_free   sV    yy$$$dii&&6:: 	C#((J/	 rP   c                ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y rI   )idrB   mergelistvaluesrG   )rM   rG   r`   rx   s       rN   	set_userszSchedulerBuffer.set_users   st    &( 	+C#((|v%'*yy3881E'Fr#((|$'*r#((|$		+
 &--/*
rP   c                R    | j                   J | j                   j                         S rI   )rB   ri   rL   s    rN   rZ   zSchedulerBuffer.get_aliases   s%    yy$$$yy5577rP   c                R    | j                   J | j                   j                         S rI   )rB   rj   rL   s    rN   r\   zSchedulerBuffer.get_mutations   %    yy$$$yy++--rP   Nreturnintr   strr   Noner   bool)rG   rF   r   r   r   zSequence[str])r   	List[str])rX   
__module____qualname____annotations__dataclassesfieldr}   rG   rO   rb   rU   rs   ry   r   rZ   r\    rP   rN   r?   r?   I   sR    
O""-K--dCE>C$$($?.+8.rP   r?   c                     e Zd ZU ded<   ded<   ded<   ded<   ded	<   d2d
Zd3dZd4dZd4dZd4dZd4dZ	d5dZ
	 	 	 	 	 	 d6dZd7dZd8dZd9dZd:dZ	 	 	 	 	 	 d;dZd5dZd<dZd<dZd5dZd5dZ	 	 	 	 d=dZd4dZd4dZd<dZd<d Zd>d!Zd?d"Zd@d#ZdAd$Zd9d%Zd9d&Z d9d'Z!d9d(Z"d9d)Z#dBd*Z$d9d+Z%d5d,Z&	 dC	 	 	 	 	 dDd-Z'dEd.Z(dFd/Z)dGd0Z*y1)HrC   z7Tuple[torch.device, Tuple[Tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writeszOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderc                    || _         y rI   rA   )rM   rA   s     rN   __init__zBaseSchedulerNode.__init__   s	    $-rP   c                >   || _         t               | _        t               | _        d| _        |j                         D cg c]  }t        | j                  ||        c}| _        | j                  D ci c]  }|j                         | c}| _
        y c c}w c c}w )NF)rA   rB   rD   )rB   r   	ancestors
last_usagewrittenget_outputsr?   rA   outputsrU   outputs_by_name)rM   rB   outputbufs       rN   _init_from_nodez!BaseSchedulerNode._init_from_node   s    ,0	*4, L 	  **,/
  .. /
 ,0<<<
$'CLLNC<
/
<
s   B4Bc                T    t        |       j                   d| j                         dS )Nz(name=)rW   rX   rU   rL   s    rN   __repr__zBaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAArP   c                H   | j                         }t               }|j                  | dt        |       j                   dt        t        | dd            j                   d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j                  |j                                # 	 ddd       |j                  d       	 |j                  | j                                |j'                         j)                         S # 1 sw Y   XxY w# t         $ r t"        j%                  dd       Y Lw xY w)#Longer form printout for trace logsrR   (rB   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rT   Ignoring error in debug_str()Texc_info)rU   r6   splicerW   rX   getattrr[   r   writesr   readsr^   r   rb   rV   debug_str_extra	Exceptionlogwarningr_   rstrip)rM   rK   r   outs       rN   rb   zBaseSchedulerNode.debug_str   s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   %5E25E> 2E;> F! F!c                     y)N r   rL   s    rN   r   z!BaseSchedulerNode.debug_str_extra       rP   c                   t        | j                  dd       }d}t        |t        j                  j
                  j                        r'd|j                  |j                         gdd      z   }nct        |t        j                  j
                  j                        r5d|j                  |j                         |j                         gdd      z   }|  | S )Ndatar   z, F)shorten	multiline)r   rB   ru   torch	_inductorr   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rM   
maybe_datadata_strs      rN   debug_str_shortz!BaseSchedulerNode.debug_str_short   s    TYY5
j%//"4"4">">?j33$$&'% 4  H 
EOO$6$6$@$@Aj33..0*2O2O2QR 4  H
 z""rP   c                p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rL   s    rN   log_detailszBaseSchedulerNode.log_details   s,    6####		
rP   c                     y rI   r   )rM   self_dep	other_deps      rN   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair   s     	rP   c                X    | j                  | j                  j                  |             y rI   )set_read_writesr   renamerM   renamess     rN   update_mutated_namesz&BaseSchedulerNode.update_mutated_names   s!    T--44W=>rP   c                X    | j                  | j                  j                  |             y rI   )r   r   	with_readrM   deps     rN   add_fake_depzBaseSchedulerNode.add_fake_dep   s!    T--77<=rP   c                B    t        d | j                         D              S )Nc              3  `   K   | ]&  }|j                         xs |j                          ( y wrI   )rZ   r\   ).0r   s     rN   	<genexpr>z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s-      
9<COO4!2!2!44
s   ,.)anyr   rL   s    rN   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation  s%     
@D@P@P@R
 
 	
rP   c                h    || _         | j                   j                  | _        | j                          y rI   )r   r   r   
prune_deps)rM   rws     rN   r   z!BaseSchedulerNode.set_read_writes  s(    "&"2"2"8"8rP   c           	         | j                         }t        |D cg c]  }|j                  ||       c}      }||z
  | _        y c c}w rI   )used_or_aliased_buffer_namesr   getr   )rM   future_used_buffersmutation_real_nameused_buffersks        rN   set_last_usagez BaseSchedulerNode.set_last_usage  sH     88:!"VA#5#9#9!Q#?"VW&)<< #Ws   Ac                F    | j                   D ]  }|j                           y rI   )r   rs   )rM   r   s     rN   mark_runzBaseSchedulerNode.mark_run  s    << 	CLLN	rP   c                    t        d t        j                  | j                  j                  | j                  j
                        D              S )Nc              3  4   K   | ]  }|j                     y wrI   rK   r   r   s     rN   r   z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s      
 HH
   )r   	itertoolschainr   r   r   rL   s    rN   used_buffer_namesz#BaseSchedulerNode.used_buffer_names  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rP   c                (   t               }t        j                  | j                  j                  | j                  j
                        D cg c]  }|j                   }}t        |      dkD  r|j                         }|j                  |       t        j                  j                  j                  |      rFt        j                  j                  |   j                         D ]  }||vs|j                  |        t        |      dkD  r|S c c}w Nr   )r   r   r   r   r   r   rK   r]   popaddr;   rk   name_to_bufferr   ri   append)rM   
used_namesr   depsaliass        rN   r   z.BaseSchedulerNode.used_or_aliased_buffer_names  s    &0l
 !t'7'7'='=t?O?O?V?VW
 HH
 
 $i!m((*CNN3ww%%))#.WW33C8UUW +EJ.E*+	 $i!m 
s   Dc                L     t         fd j                  D               _        y )Nc              3  f   K   | ](  }|j                   j                  j                  vr| * y wrI   )rK   rA   available_buffer_namesr   r   rM   s     rN   r   z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>-  s/      -
xxt~~DDD -
s   .1r   r   rL   s   `rN   r   zBaseSchedulerNode.prune_deps,  s#    ", -
..-
 #
rP   c                     d fdt        fd j                  j                  D              } j                   j                  j	                  |             y )Nc                    t        | t              syj                  j                  | j                     j
                  }|j                         t        j                  j                  v S NF)
ru   r(   rA   rr   rK   rD   rU   r;   rk   removed_operations)r   oprM   s     rN   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune5  sI    c7+++CHH5AAB;;=AGG$>$>>>rP   c              3  4   K   | ]  } |      s|  y wrI   r   r   r   r  s     rN   r   z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>;  s      
\#5FC
   r   r%   r   r   )r   r   r   r   remove_reads)rM   	to_remover  s   ` @rN   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps3  sN    	?  
++11
 
	 	T--::9EFrP   c                F    t        | || j                  j                         y rI   )_prune_redundant_depsrA   rr   )rM   name_to_fused_nodes     rN   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps@  s     	d$68R8RSrP   c                R    | j                   J | j                   j                         S rI   )rB   get_operation_namerL   s    rN   rU   zBaseSchedulerNode.get_nameE  r   rP   c                "    | j                         S rI   rU   rL   s    rN   get_first_namez BaseSchedulerNode.get_first_nameI  s    }}rP   c                B    t        d | j                         D              S )Nc              3  <   K   | ]  }|j                           y wrI   r"  )r   rB   s     rN   r   z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>M  s     Gd$--/G   )r   	get_nodesrL   s    rN   get_operation_namesz%BaseSchedulerNode.get_operation_namesL  s    Gdnn6FGGGrP   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrI   r"  r   r   s     rN   r   z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>P  s     AS#,,.Ar&  )r   r   rL   s    rN   get_buffer_namesz"BaseSchedulerNode.get_buffer_namesO  s    ADLLAAArP   c                    | gS rI   r   rL   s    rN   r'  zBaseSchedulerNode.get_nodesR  s	    vrP   c                    | j                   S rI   )r   rL   s    rN   r   zBaseSchedulerNode.get_outputsU  s    ||rP   c                     | j                   |   S rI   )r   )rM   buf_names     rN   
get_outputzBaseSchedulerNode.get_outputX  s    ##H--rP   c                R    | j                   J | j                   j                         S rI   )rB   
get_devicerL   s    rN   r3  zBaseSchedulerNode.get_device[  s%    yy$$$yy##%%rP   c                     yr  r   rL   s    rN   is_reductionzBaseSchedulerNode.is_reduction_      rP   c                     yr  r   rL   s    rN   is_split_scanzBaseSchedulerNode.is_split_scanb  r6  rP   c                     yr  r   rL   s    rN   is_templatezBaseSchedulerNode.is_templatee  r6  rP   c                     yr  r   rL   s    rN   	is_externzBaseSchedulerNode.is_externh  r6  rP   c                     yr  r   rL   s    rN   
is_foreachzBaseSchedulerNode.is_foreachk  r6  rP   c                     yr  r   rM   read_deps     rN   can_inplacezBaseSchedulerNode.can_inplacen  r6  rP   c                     yr  r   rL   s    rN   has_side_effectsz"BaseSchedulerNode.has_side_effectsq  r6  rP   c                	   ddl m} t        | t        f      rt        j
                  rt        j                  j                  | j                         t        j                        r{t        t        j                  t        j                  j                  j                   j"                        rt%        t        j                  dd      t'        t        j                  d      syt)        | j*                  j,                  d       }| j/                         D ]  }|j0                  }|J |j3                         rJ|j5                         s:|j7                         s*|j9                         t        j                  j:                  v ro|D ]@  }| j<                  j>                  jA                  |jB                        }|s6t        j                  jD                  jG                  ||       sat        |jH                  tJ              r||jL                  J |jL                  D cg c]4  }|j0                  j9                         | j<                  jN                  vr|6 }}tQ        |      dk(  s|d   jR                  s|d   j0                  | u s|j0                  t        |j0                  jU                         tV        jX                  tV        jZ                  f      rWt        |jH                  j0                  tV        j\                  tV        j^                  f      r(tQ        |j0                  j5                               dkD  r ||j0                         ||j0                        k(  st        j                  j`                  jc                  |j9                         |j9                                t        t        j                  t        j                  j                  j                   j"                        rnt        j                  jd                  jg                  |j9                                t        j                  jd                  jg                  |j9                                | jh                  jk                  |j9                                |j9                         t        j                  jl                  |j9                         <      yc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )buffer_reuse_key	mutationsNrg   c                    | j                   S rI   r   xs    rN   <lambda>z9BaseSchedulerNode.decide_inplace_update.<locals>.<lambda>  s
    QVV rP   keyr   )7codegen.wrapperrF  ru   SchedulerNoder   inplace_buffersr;   rk   has_featurer3  r!   INPLACE_BUFFERSro   r   r   codegensimd
SIMDKernelr   rn   sortedr   r   r   rB   rh   ri   rj   rU   removed_buffersrA   rr   r   rK   rl   	can_reuserD   NopKernelSchedulerNoderG   completed_operationsr]   rB  
get_layoutr   r+   MutationLayoutSHOULDREMOVEFallbackKernelr*   rg   make_inplacerG  r  r   discardrp   )	rM   rF  ordered_readsr   buf_noderead	input_bufrJ  remaining_usess	            rN   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_updatet  ss   
 	6 tm-.&&##DOO$5~7U7UVqxx)@)@)E)E)P)PQ188[$7C &)t//55;KL##% A	CxxH''',,.88:..0<<>QWW%<%<<% 67;~~7Q7Q7U7UII8	 ,,66y$G&y'<'<>TU$??666 "+&66??,DNN4W4WW &N & N+q0*1-99*1-22d:%NN6 *%NN557 " 4 4 " = =! ' ) 5 5 : :!#!2!2BNN C !$INN$O$O$Q RUV V,Y^^<+CHH56 2293E3E3GX%HHeoo&=&=&B&B&M&M HH..2293E3E3GHHH..223<<>B //	0B0B0DE &..0 77LLN m6A	*&s   9Sc                .   t         j                  sy |r| j                  ry | j                  J | j                  j	                         }g }|D ]  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	      d
   }|j                  d|j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       ! t        |      dk(  ry |j                  |       d| _        y )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rB   get_originsr  r  targetmetasplitreplacer]   
writelines)	rM   buffer	only_onceorigins	out_linesoop_info_strri  stack_trace_last_lines	            rN   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  s    $$yy$$$))'')	 	%AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $-	%0 y>Q 	)$rP   c                t   	
 t         t              ryt         t              rt         j                  t              ryddt         t
              r@ t         j                         d         t         j                         d         z        nt        d      t        j                  t              } j                  j                   j                  j                  z  D ]   }||j                     j!                  |       " t#        d  j                  j                  D              }t#        d  j                  j                  D              }d fdt         t$              rt#         fd|D              }||z
  }||z
  }d}||z  D ]  }t'        fd	||   D              	|t(        j*                  j,                  v rt(        j*                  j,                  |   }n;|t(        j*                  j.                  v rt(        j*                  j.                  |   }nd	
 fd

| 
|      z  } |S )aM  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size
        r   c                X    t         j                  j                  j                  | d      S )Nr   fallback)r;   rk   sizevars	size_hint)ss    rN   try_size_hintzEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.try_size_hint  s"    77##--a!-<<rP   r       eAc              3  4   K   | ]  }|j                     y wrI   r   r   s     rN   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>&  s     F388Fr   c              3  4   K   | ]  }|j                     y wrI   r   r   s     rN   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>'  s     HCHHHr   c                    j                   j                  |    j                  }t        d |D              }t	        |t        |      z
        dkD  S )Nc              3  4   K   | ]  }|j                     y wrI   )rB   r   ra   s     rN   r   zZBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized.<locals>.<genexpr>+  s     !>$))!>r   r   )rA   rr   rG   r   r]   )r   snodesrG   buf_usesrM   s       rN   is_materializedzGBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.is_materialized)  sG    NN..s399E!!>!>>Hx*V"44599rP   c              3  J   K   | ]  } |j                         r|  y wrI   r  )r   r   r  rM   s     rN   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>/  s#      )_S$++-N)s   ##c              3  "   K   | ]  }  y wrI   r   )r   r   
node_numels     rN   r   zABaseSchedulerNode.get_read_write_buffers_sizes.<locals>.<genexpr>7  s     $RCZ$Rs   c                   | syt        | j                  t              rj                  j                  | j                            j                  }d}|D ]x  }t        |j                  t              sJ t        |j                  j                  t              r5|j                  j                         D ]  }| |j                        z  } x y |S t        | j                  t        j                        r"t        fd| j                         D              S  	t        | j!                                     }t#        | j%                               t'        |      z  S )Nr   c              3  h   K   | ])  } t         j                  j                  |             + y wrI   )r;   rk   
get_buffer)r   mut_nameget_buf_bytess     rN   r   zXBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_bytes.<locals>.<genexpr>T  s-      $ &agg&8&8&BCs   /2)ru   rY   r+   rA   rr   rU   rG   rB   rC   r*   r   r   rv   sumrj   r:   r   r4   	get_dtypemin)
r   rG   totra   	sched_buf	buf_elemsbuf_accessed_elemsr  rM   r  s
         rN   r  zEBaseSchedulerNode.get_read_write_buffers_sizes.<locals>.get_buf_bytes@  s"    cjj*;< NN66s||~FLLEC % 	%)$))5FGGG%diinnkB-1YY-B-B-D E	 #}Y^^'D DE $%	% J

BMM: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rP   )r  z
sympy.Exprr   r   )r   r   r  Sequence[BaseSchedulerNode]r   r   )r   z(Optional[Union[ir.Buffer, ir.TensorBox]]r   r   )ru   rY  ExternKernelSchedulerNoderB   r*   rO  r:   
get_rangesr   collectionsdefaultdictr}   r   r   r   rK   r  r   FusedSchedulerNoder  r;   rk   r  graph_inputs)rM   buf_accessesr   r   r   rW  
node_bytesr0  r   r  r  r  r  r  s   `        @@@@@rN   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s   . d23d56:II{<
 	= dM*&doo/23 1! 456J
 SJ"..t4##))D,<,<,C,CC 	/C"))#.	/ Ft/?/?/E/EFFH0@0@0G0GHH	:
 d./( )%) O o-FO+E
 (	-H!$$R<;Q$R!R177111gg,,X6QWW111gg**84 < -,,JQ(	-T rP   c                L   | j                         d   j                         d   }|j                  j                         }|j                  j	                         }|j
                   t        |j
                  j                        syt        | j                        r<t        | j                  t        j                        sJ 	 t        | j                        S t!        | j                        ry	 t#               }t%        |      dz  }t        | t(              rt        | j                  t        j*                        sJ dt        | j                               t,        j/                  t1        | j                  dd      d      }|Xddlm} ddlm}	 t;        d	 | j                  j<                  D              ry |       5 }
 |	d
      5 }t?        j@                  | j                  jB                        5  t?        jD                  |
      5  ddl
m#} | j                  j<                  D cg c]  } ||d
       }}| j                  jH                  } |jJ                  |g|i | j                  jL                   d}|jO                         }| jQ                         }||z  |z  dz  }||z  }tS        ||      cddd       cddd       cddd       cddd       S yt        | tT              st        | j                  tV              r| jQ                         |z  S y# t        $ r}t        j                  |       Y d}~yd}~ww xY w# t&        $ r Y yw xY wc c}w # 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       n# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)zB
        Returns estimated op runtime in nanoseconds (ns)
        r   Nl    J)type(self.node)=python_kernel_namer   )FakeTensorMode)FlopCounterModec              3  f   K   | ])  }t        t        |j                                     d kD   + ywr   N)r]   r   	get_numelr   ns     rN   r   z:BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>  s.       -akkm<=As   /1F)displayr   )ir_node_to_tensor)guard_shapeg      ?r  ),r'  r   rB   r[  r  devicer8   rW   r7   ru   r   IRNoder$   
ValueErrorr   r   r9   r5   r3   r   r  ExternKernelkernel_name_to_opr   r   torch._subclasses.fake_tensorr  torch.utils.flop_counterr  r   inputsr;   set_current_nodefx_nodeset_fake_moder  	__class__process_kernelkwargsget_total_flopsr  maxr  r)   )rM   r   rY   dtypeegpu_memory_bandwidth	gpu_flopsr  r  r  	fake_modeflop_counter_moder  inputfake_inputsclsfactorcounted_flopscounted_bytescompute_timetransfer_times                        rN   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtimeb  s=    nnq!--/2$$&""$==$VFMM4F4F-G #dii3337		BB TYY
 	#4#6 )%069I d56dii9P>Nd499o=O;PP9"&&		#7<dB
 ~HD !YY--  #% <O!5 <&(:(:II%%)< ??	< 6 &*YY%5%5#! *%UC#K # ))--C&C&&rLKL499;K;KL !F$5$E$E$GM$($E$E$GM$*]$:Y$F##ML$14H$HM |];1< < < < <@  01ZII~6
 4469MMMM   	   		>#< < < < < < < < <@ A<@ s   L -M ?
N	*N3M0	M	(M9A?M	8	M0	N
	N	M%L??M	MMM	M$ M0'	N0M95N<	NN	
NN#c                     y rI   r   rL   s    rN   get_template_nodez#BaseSchedulerNode.get_template_node      rP   N)rA   r@   r   r   )rB   ir.Operationr   r   r   r   r   r&   r   r&   r   r   r   Dict[str, str]r   r   )r   r%   r   r   r   )r   r   r   r   r   OrderedSet[str]r   r  r   r   r   r  r  Dict[str, BaseSchedulerNode]r   r   r   r  )r   zSequence[SchedulerBuffer])r0  r   r   r?   r   torch.devicerA  zdependencies.Depr   r   )T)rv  r6   rw  r   r   r   r   )r   floatr   zOptional[ir.TemplateBuffer])+rX   r   r   r   r   r   r   rb   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  rU   r#  r(  r,  r'  r   r1  r3  r5  r8  r:  r<  r>  rB  rD  re  r}  r  r  r  r   rP   rN   rC   rC      sN   BB(('' NN.
&B*2#
!.7	
?>


=#2=HV=	=
 
GT">T	T
.HB.&Wt 9=*$*15*	*XgRWrrP   rC   c                  B    e Zd ZU g dZded<   ded<   d
dZddZddZy	)	WhyNoFuse)node1node2reasonrg   r   r  zTuple[Any, ...]rg   c                     || _         || _        y rI   )r  r  rM   r  r  s      rN   r   zWhyNoFuse.__init__  s    

rP   c                J    || _         || _        t        j                  |        y rI   )r  rg   
fusion_logdebug)rM   r  rg   s      rN   __call__zWhyNoFuse.__call__  s    	rP   c                    d| j                   j                          d| j                  j                          d| j                  | j                  z  z   S )Nzcannot fuse z with rR   )r  rU   r  r  rg   rL   s    rN   __str__zWhyNoFuse.__str__  sK    djj1134F4::;N;N;P:QQSTKK$))#
 	
rP   N)r  rC   r  rC   r   r   )r  r   rg   r   r   r   r   )rX   r   r   	__slots__r   r   r  r  r   rP   rN   r  r    s#     5IK


rP   r  c                    t        | t              rt        | t              } t	        j
                  | d      }d|v rdt        j                  |d       S |S )NrL     )r^   r       )ru   r   rV  r   pprintr[   textwrapr^   )objr`   s     rN   r[   r[     sM    #z"Sc"^^C*Fv~HOOFG4566MrP   c                  0    e Zd ZddZddZddZd	dZeZy)
rw   c                &    t        |g      | _        y rI   r  r   s     rN   r   zOutputNode.__init__  s    ",cU"3rP   c                     yr  r   rL   s    rN   r5  zOutputNode.is_reduction  r6  rP   c                     y)Nr   r   rL   s    rN   ri   z'OutputNode.get_inputs_that_alias_output  r   rP   c                     y)NOUTPUTr   rL   s    rN   rU   zOutputNode.get_name  s    rP   N)r   r'   r   r   r   r   r   )rX   r   r   r   r5  ri   rU   r   r   rP   rN   rw   rw     s    4 HrP   rw   c                    t        j                          j                  D ]X  }t        |t              r|j
                     j                  }|j                            j                         xx   dz  cc<   Z d fdt        fd j                  D              }|r? j                  |z
   _         j                   j                  j                  |             yy)am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   c                    t        | t              rN| j                     j                  j	                         }|   j	                            dkD  }|   k(  }|xs |S y)Nr   F)ru   r(   rK   rD   rU   )r   op_nameis_redundantis_self_deprr   name_to_dep_countr  rB   s       rN   r  z+_prune_redundant_deps.<locals>.should_prune  sh    c7#!#((+77@@BG,-?-H-Q-Q-STWXXL -W5=K.;.rP   c              3  4   K   | ]  } |      s|  y wrI   r   r  s     rN   r   z(_prune_redundant_deps.<locals>.<genexpr>  s      ,s2Cr  Nr  )r  r   r   ru   r(   rK   rD   rU   r   r   r   r  )rB   r  rr   r   r  deps_to_pruner  r  s   ```   @@rN   r  r    s     '2&9&9&;&& Q#w'SXX&22B0?HHJKqPKQ

 
  .. M "&"9"9M"IT--::=IJ rP   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmc                  8     e Zd Zd fdZddZddZddZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y rI   superr   r   r   get_read_writesrM   rA   rB   r  s      rN   r   z"ExternKernelSchedulerNode.__init__"  5    #T"T1134rP   c                V    | j                          dt        | j                  dd        S )Nz.node.kernel = r  )rU   r   rB   rL   s    rN   r   z)ExternKernelSchedulerNode.debug_str_extra'  s*    --/"/'$))EY[_2`1abbrP   c                     yNTr   rL   s    rN   r<  z#ExternKernelSchedulerNode.is_extern*  r  rP   c                    | j                   J t        | j                   d      xr | j                   j                         S )NrD  )rB   rn   rD  rL   s    rN   rD  z*ExternKernelSchedulerNode.has_side_effects-  s6    yy$$$tyy"45V$)):T:T:VVrP   rA   r@   rB   r  r   r   r   r   )rX   r   r   r   r   r<  rD  __classcell__r  s   @rN   r  r  !  s    5
cWrP   r  c                        e Zd Zd fdZ xZS )rY  c                    t         |   |       | j                  |       | j                  |j	                                y rI   r  r  s      rN   r   zNopKernelSchedulerNode.__init__3  r  rP   r  )rX   r   r   r   r  r  s   @rN   rY  rY  2  s    5 5rP   rY  c                       e Zd Z	 	 	 	 	 	 d fdZ	 	 d	 	 	 	 	 ddZ	 	 d	 	 	 	 	 ddZddZddZ	 	 	 	 	 	 ddZddZ	ddZ
dd	Zdd
ZddZddZddZ	 	 	 	 ddZddZed d       Zd!dZed"d       Z xZS )#rO  c                f    t         |   |       | j                  |       | j                          y rI   )r  r   r   _compute_attrsr  s      rN   r   zSchedulerNode.__init__:  s,    
 	#T"rP   c                ,   t        | j                  t        j                  t        j                  f      sJ | j                  j                  ||      \  | _        | _        | j                  j                  | j                  j                               j                  }| j                  j                          || j                        f| _        t        j                   xs' | j                  j                         j                  dk7  }t        | j                  t        j                        r,| j!                  | j                  j#                  |             y | j!                  t%        j"                  | j                  g| j                  d|i       y )Nextra_indexing_constraintsrecompute_sizes_body_funccuda	normalizer"  )ru   rB   r   r)   TemplateBuffersimplify_and_reorder_sizes_bodyrA   get_backendr3  group_fnr   r   loop_ordering_after_fusionrW   r   extract_read_writesr   )rM   r  r  r(  should_normalizes        rN   r  zSchedulerNode._compute_attrsC  sC   
 $))b&7&79J9J%KLLL"&))"@"@'A&? #A #
TZ
 >>--dii.B.B.DENNii**,ht{{.CD

 111 5yy##%**f4 	
 dii!2!23  		--8H-I   00JJ!%8HrP   c                *    | j                  ||       y )Nr  )r  )rM   r  r  s      rN   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_bodyc  s    
 	'A&? 	 	
rP   c                   | j                   j                  D ch c]  }t        |t        t        f      s| }}| j                  t        j                  | j                  g| j                  d|ij                  |             y c c}w )Nr"  )r   r   ru   r(   r'   r   r   r*  r&  r%  r   )rM   r"  r   	fake_depss       rN   refresh_dependenciesz"SchedulerNode.refresh_dependenciesm  s      ++11
ZgwEW5XC
	 
 	,,

![[4=i	"	

s
   BBc                    | j                   j                  |      | _         | j                   j                  | _        | j	                  d       y )NFr!  )r&  reorder_iter_loopssizesr%  r0  )rM   	new_orders     rN   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order|  s?    ZZ22

 jj&&!!E!2rP   c                   d }| j                   d   }t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|rPt        xj
                  dz  c_        t        j                  d| j                         |       | j                  |       y t        j                  d| j                                y )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
r%  r]   num_varsdecide_loop_order_to_matchr   num_loop_reorderingloop_ordering_logr  rU   r5  )rM   r   r   r4  
self_sizess        rN   r   z'SchedulerNode.reorder_loops_by_dep_pair  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##WrP   c                >   | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                  j	                         D ]g  }t        |t              r|j                  }t        j                  j                  |      }|j                  | dt        |j                                i t        | j                  t              rR|j                  d| d       |j                  t!        j"                  | j                  j%                         d	             | j&                  J t)        j*                  | j&                  j-                               r|j/                  t1        |              d
j3                  |      S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r   )rU   r   r%  r   reads_and_writesru   r(   rK   r;   rk   r  r  r[   rY   r&  r,   r  r^   rb   rB   r   	is_tritonr3  extenddebug_triton_codejoin)rM   rK   linesr   r0  r   s         rN   r   zSchedulerNode.debug_str_extra  sQ   }}f$TZZ]O4f'

17fIdkk]+

 ##446 	KCc7+88gg((2zGCJJ4G3HIJ		K
 djj(+LL6${34LL)=)=)?HIyy$$$<<		,,./LL*401yyrP   c                    | j                   S rI   )r%  rL   s    rN   r  zSchedulerNode.get_ranges      {{rP   c                    t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               S Nr  )ru   rB   r   r)   r#  rW   r   r   rL   s    rN   r5  zSchedulerNode.is_reduction  s`    II))2+<+<=
 	!d499o 	! 
 DII00233rP   c                L   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  t        j                        xr. t        | j                  j                  t        j                        S rF  )ru   rB   r   r)   r#  rW   r   	SplitScanrL   s    rN   r8  zSchedulerNode.is_split_scan  s~    II))2+<+<=
 	!d499o 	! 
 $))R%6%67 
JIINNBLL=
 	
rP   c                J    t        | j                  t        j                        S rI   ru   rB   r   r#  rL   s    rN   r:  zSchedulerNode.is_template  s    $))R%6%677rP   c                f    t        | j                  t        j                        r| j                  S d S rI   rJ  rL   s    rN   r  zSchedulerNode.get_template_node  s$    &tyy"2C2CDtyyN$NrP   c                f    | j                          | j                          | j                  |       y rI   )re  r   rS  )rM   
index_varss     rN   runzSchedulerNode.run  s#    ""$Z rP   c                &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S rI   )	r%  r  mapr]   dictzipr   r   from_iterable)rM   rM  r3  
var_rangess       rN   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rP   c                   | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w# t        $ r" t        j                  d| j                          w xY w)NzError in codegen for %s)rU  r;   set_ops_handlerr/   get_ops_handlerro   r  r&  r   r   fatalrB   )rM   rM  rT  s      rN   rS  zSchedulerNode.codegen  s    00<
	"" !2!2!4jA (xx((.( 

J'( ( ( ( ( (  	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                    | j                   \  }}t        j                  | j                  |t	        j
                  d      gt        |      z  g      S )zH
        Get the memory dependencies in the non-reduction axis.
        r   )hidden_args)r%  r   r*  r&  sympyIntegerr]   )rM   r3  reduction_sizess      rN   pointwise_read_writesz#SchedulerNode.pointwise_read_writes  sI    
 "&//JJU]]1-=,>_AU,U+V
 	
rP   c                   | j                         ryt        d | j                         D              ryt        | j                  j
                        dk(  rt        |t        j                        rt        t        | j                  j
                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFc              3  <   K   | ]  }|j                           y wrI   )rZ   r+  s     rN   r   z,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?Ss ?r&  r   ztype(write_dep)=)r:  r   r   r]   r   r   ru   r   r&   nextiterrW   indexsize)rM   rA  	write_deps      rN   rB  zSchedulerNode.can_inplace  s    ?D,<,<,>??t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXrP   c                   t               }t        | j                  t              r| j                  j	                         D ]  }|j
                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_add   r  rK      r   r   )r   ru   r&  r,   r'  r  rq  r  r]   rg   r  )rM   buffers_store_as_atomic_addrB   s      rN   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(+

,,. GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr +*rP   )rA   r@   rB   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)r  z*Optional[Tuple[Dict[Any, Any], List[Any]]]r  zOptional[Callable[..., Any]]r   r   )r"  r   r   r   )r4  zSequence[int]r   r   r  r   )r   Sequence[Sequence[sympy.Expr]]r   r  )rM  zSequence[sympy.Expr]r   r   )rM  rq  r   zDict[sympy.Expr, sympy.Expr])rM  rq  r   r   )r   r   r  r  )rX   r   r   r   r  r-  r0  r5  r   r   r  r5  r8  r:  r  rN  rU  rS  r0   r_  rB  ro  r  r  s   @rN   rO  rO  9  s    : 
	 RVBF$N $@ 
	D RVBF
$N
 $@
 
	

3!.7	( ,4
8O!
8	%	 
 
 + +rP   rO  c           	     n     j                   } j                  t        j                  j	                  |D cg c]  }|j
                   c}             t         fdt        j                  |D cg c]  }|j                   c} D               j
                  j                  z
   _        y c c}w c c}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrI   rK   r,  )r   r   group_snodes     rN   r   z2refresh_group_node_dependencies.<locals>.<genexpr>  s.      
xx{;;== 
   (+)
r  r   r   
ReadWrites
merge_listr   r   unionr   r   )ru  r  rJ  s   `  rN   refresh_group_node_dependenciesrz    s    F**6+JaAMM+JK
 	 
!'')O1!*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B-0B2r@   c                   t        | t        t        f      sJ || _        || _        d | _        t        j                  |D cg c]  }|j                  |j                   c} | _        t        |        t        d | j                  D              | _        t        d | j                  D              | _        | j                         D ci c]  }|j                         | c}| _        y c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wrI   r   r   rJ  s     rN   r   z"init_group_node.<locals>.<genexpr>-       HHr   c              3  4   K   | ]  }|j                     y wrI   )r   r~  s     rN   r   z"init_group_node.<locals>.<genexpr>.  r  r   )ru   r  GroupedSchedulerNoder  rA   rB   r   ry  r   rz  r  r   r  r   r   rU   r   )ru  rA   r  rJ  r   s        rN   init_group_noder    s    
 k$68L#MNNNK%KK&,,%	A!)@!++	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@# ##K 
B#s   C*C*	C/c                  t    e Zd ZU dZded<   e	 	 	 	 	 	 dd       Z	 	 	 	 	 	 ddZd fdZe	dd       Z
ddZe	d d	       Zd!d
ZddZddZ	 	 	 	 	 	 d" fdZe	d d       Ze	d d       Zd#dZddZe	d$d       Ze	d$d       Ze	d$d       Ze	d%d       Zd&dZe	d$d       Zd'dZd(dZd)dZddZ xZS )*r  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    List[BaseSchedulerNode]r  c                2   |j                   |j                   u sJ t        |t        t        f      sJ t        |t        t        f      sJ t	        t        j                  |j                         |j                                     } | |j                   |      S rI   )rA   ru   rO  r  r}   r   r   r'  )r  r  r  nodess       rN   fusezFusedSchedulerNode.fuse=  sz     %//111%-1C!DEEE%-1C!DEEEY__U__%68IJK5??E**rP   c                   | j                         ry d }| j                  D ]N  }t        |t              sJ |)||j                  d   k7  rt
        j                  d        y |j                  d   }P d }|J t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|s%t
        j                  d| j                                y t        xj                  dz  c_        t
        j                  d| j                         |       | j                  D ]%  }t        |t              sJ |j                  |       ' t        |        y )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)r:  r  ru   rO  r%  r:  r  r]   r7  r8  rU   r   r9  r5  rz  )rM   r   r   r;  snoder4  s         rN   r   z,FusedSchedulerNode.reorder_loops_by_dep_pairG  s>    
[[ 	)Ee]333%*Q*G!''G aJ	) 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[ 	2Ee]333&&y1	2 	(-rP   c                    t         |   |       t        | ||       g | _        t	        |d       j
                  | _        y )Nc                4    t        | j                               S rI   )r   r5  rI  s    rN   rK  z-FusedSchedulerNode.__init__.<locals>.<lambda>p  s    s1>>3C/D rP   rL  )r  r   r  rG   r  r   rM   rA   r  r  s      rN   r   zFusedSchedulerNode.__init__l  s8    #i0%'
%DEKK
rP   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w N_rA  r  rU   rM   rJ  s     rN   rU   zFusedSchedulerNode.get_namer  )    xxt{{;!;<<;   8c                <    | j                   d   j                         S r  r  rU   rL   s    rN   r#  z!FusedSchedulerNode.get_first_namev      {{1~&&((rP   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rI   r   ry  r  r,  r  s     rN   r,  z#FusedSchedulerNode.get_buffer_namesy  .    !L1!"4"4"6!LMM!L   9c                j    g }| j                   D ]!  }|j                  |j                                # |S rI   r  r?  r   rM   r`   rB   s      rN   r   zFusedSchedulerNode.get_outputs}  4    (*KK 	.DMM$**,-	.rP   c           
        t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}| j                  d   j                  }|?|j                         }t        j                  |      r|j                  t        |              t        j                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
r   r   r  )	enumerater  rU   rb   rB   r3  r   r>  r?  r@  r  r^   rA  r   )rM   irB   rB  r  s        rN   r   z"FusedSchedulerNode.debug_str_extra  s     %T[[1
4 }}xs%0@/AB
 
 {{1~""__&F||F#.t45tyy/668&AA
s   0Cc                h    | j                   D cg c]  }|j                          }}|  d| S c c}w )Nz
, snodes: )r  r   )rM   rB   
snodes_strs      rN   r   z"FusedSchedulerNode.debug_str_short  s9    9=Ed**,E
Ez*.. Fs   /c                    t         |   ||       t               }t        | j                        D ]/  }|j                  ||       |j                  |j                         1 y rI   )r  r   r   reversedr  updater   )rM   r   r   rB   r  s       rN   r   z!FusedSchedulerNode.set_last_usage  s\    
 	24FG 0:|T[[) 	8D 35GH&&t7	8rP   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rI   )r   ry  r  r   r  s     rN   r   z$FusedSchedulerNode.used_buffer_names  s.    !MA!"5"5"7!MNN!Mr  c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rI   )r   ry  r  r   r  s     rN   r   z/FusedSchedulerNode.used_or_aliased_buffer_names  s3    8<D1a,,.D
 	
Dr  c                    | j                   S rI   r  rL   s    rN   r'  zFusedSchedulerNode.get_nodes  rD  rP   c                T    t        |       j                   d| j                          dS )Nz(nodes=r   r   rL   s    rN   r   zFusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rP   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrI   )r5  r~  s     rN   r   z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     91>>#9r&  r   r  rL   s    rN   r5  zFusedSchedulerNode.is_reduction  s    9T[[999rP   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrI   )r8  r~  s     rN   r   z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :1??$:r&  r  rL   s    rN   r8  z FusedSchedulerNode.is_split_scan  s    :dkk:::rP   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrI   )r:  r~  s     rN   r   z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8q1==?8r&  r  rL   s    rN   r:  zFusedSchedulerNode.is_template  s    8DKK888rP   c                j    | j                   D ]$  }|j                         s|j                         c S  y rI   )r  r:  r  rM   rB   s     rN   r  z$FusedSchedulerNode.get_template_node  s5    KK 	0D!--//	0 rP   c                     | j                   d   S r  )r   rL   s    rN   r3  zFusedSchedulerNode.get_device  s    zz!}rP   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wrI   )r   r~  s     rN   r   z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/Er&  r  rL   s    rN   r   z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErP   c                    t         rI   NotImplementedErrorr   s     rN   r   z'FusedSchedulerNode.update_mutated_names      !!rP   c                    t         rI   r  rM   rK   s     rN   r   zFusedSchedulerNode.add_fake_dep  r  rP   c                    t         rI   r  r@  s     rN   rB  zFusedSchedulerNode.can_inplace  r  rP   c                P   | j                         }dj                  d | j                  D              }t               }|j	                  | dt        |       j                   d| d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j	                  |j                                # 	 ddd       |j                  d       	 |j	                  | j!                                |j)                         j+                         S # 1 sw Y   XxY w# t"        $ r t$        j'                  dd       Y Lw xY w)r   rS   c              3  F   K   | ]  }t        |      j                    y wrI   )rW   rX   r  s     rN   r   z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     FQQ 0 0Fs   !rR   r   r   r   r   r   r   z.outputs = [
            NrT   r   Tr   )rU   rA  r  r6   r   rW   rX   r[   r   r   r   r   r^   r   rb   rV   r   r   r   r   r_   r   )rM   rK   node_typestrr   r   s        rN   rb   zFusedSchedulerNode.debug_str  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   )5E69F 6E? F%$F%r  rC   r  rC   r   r  r  rA   r@   r  r  r   r   r   r  r   zList[SchedulerBuffer]r  r  r   r  r  r  )rK   r%   r   r   r  ) rX   r   r   __doc__r   classmethodr  r   r   r0   rU   r#  r,  r   r   r   r   r   r   r'  r   r5  r8  r:  r  r3  r   r   r   rB  rb   r  r  s   @rN   r  r  4  sn    $#+%+.?+	+ +#.!#..7#.	#.JL = =) N NB/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""*rP   r  c                  N    e Zd ZU dZ	 	 	 	 ddZ	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	e	 	 	 	 dd       Z
e	 	 	 	 dd       ZeZd	ed
<   e	 	 	 	 dd       Ze	 	 	 	 dd       ZddZddZddZddZd dZd!dZ	 	 	 	 d"dZ xZS )#ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    c                    |j                         D ]=  }|j                         | j                  v s | j                  |j                            c S  y rI   )r   rU   read_to_node)rM   producerr   s      rN   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  sL     '') 	9C||~!2!22((88	9 rP   c                   t               }|j                  j                  D ]  }|j                  | j                  j
                  vr&| j                  j
                  |j                     j                  j                         }|| j                  v sp|j                  | j                  |           t        |      dk(  rt        t        |            S y Nr   )setr   r   rK   rA   rr   rD   rU   name_to_noder  r]   rb  rc  )rM   consumer	producersrd	node_names        rN   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for   s     E	&&,, 	<Bwwdnn88822277;GGPPRID---d//	:;	< y>QY((rP   c                   t        |      }j                         r|j                         rt        j                  t              t        j                  t        |      }t        j                        t        |j                        k(  }|s |d       |xr2 t        fdt        j                  |j                        D              S |j                         rkj                         r	 |d       yt        j                  t        |      }|j                        }||j                  j                  |      S  |d       yj                         rk|j                         r	 |d       yt        j                  t              j                  |      }|j                  j                  ||      S  |d       yt        d      )	Nzforeach do not have same lengthc              3  \   K   | ]#  \  }}j                   j                  ||       % y wrI   )rA   can_fuse)r   lrr  s      rN   r   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s0      )Aq ""++Aq1)s   ),zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r>  typingcastr  r]   r  allrR  r5  r  rA   r  r  AssertionError)r  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rN   r  z#ForeachKernelSchedulerNode.can_fuse  s   (+ X%8%8%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    "$$&n {{#=xHH'@@J+))228=MNNGH  "$$&n {{#=xHH'@@J+))223CXNNGHf
 	
rP   c                
   |j                         s|j                         sJ |j                         r3t        j                  t        |      }|j                  }|j
                  }n2t        j                  t        |      }|j                  }|j
                  }d }d }|j                         r|j                         r|t        j                  t        |      }t        j                  t        |      }t        |j                  |j                        D cg c]  \  }}t        j                  ||       }	}}n/|j                         rt        j                  t        |      }|j                  |      }
g }	|}d }|j                  D ]A  }||
u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C n|j                         rt        j                  t        |      }|j                  |      }g }	|}d }|j                  D ]A  }||u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C nt        d       | |j                  |	||||      S c c}}w )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r>  r  r  r  r  r  rR  r  r  r  r  r  r  r  rA   )r  r  r  r  r  r  r  r  r  fused_nodesr  rB   new_noder  s                 rN   r  zForeachKernelSchedulerNode.fuseB  sZ    ""$(;(;(=== {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O X%8%8%:{{#=xHH{{#=xHH  AAq #''1-K    "{{#=xHH'@@JK"KK  -++166tXFH"*K&&x0&&t,-   "{{#=xHH'@@JK"KK  -++166xFH"*K&&x0&&t,- !f  &?##+
 	
Ks    I?c                    i  _         i  _        ||qt           ||       |D ]Z  }|j                  j
                  D ]  }| j                   |j                  <    |j                         D ]  }	| j                  |	<    \ n| _        | _	        d  _
        g  _         j                  t        j                  j                  |j                  |j                  g             t!         fdt!        j"                  |j$                  |j$                        D               j                  j&                  z
   _        t)        |j*                  |j*                  g       _        t-        |j.                  |j.                  g       _        |j1                         rt3        |t4              sJ ||}}
nt3        |t4              sJ ||}}
|
j6                   _         j6                  j9                  |j6                         |
j                   _        |j                         D ]  }	| j                  |	<    | _        |d   j=                         t?        j@                  d      fff _!        t!                _"        | _#        y )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wrI   rt  r  s     rN   r   z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  s0       xxt'<'<'>>	 rv  r   combo_kernel)$r  r  r  r   r   r   rK   r(  rA   r  rB   rG   r   r   rw  rx  r   ry  r   r   r  r   r  r   r>  ru   r  r   r  r  r3  r\  Exprr   rx  r  )rM   rA   r  r  r  r  r  rB   rb  rK   foreach_node
other_noder  s   `           rN   r   z#ForeachKernelSchedulerNode.__init__  sW    +"5GY/ 3 ,,22 8D37D%%dii08 !446 3D.2D%%d+3	3 'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%'!+/IJJJ+6j!+/IJJJ+6j)33DNNN!!*"6"67 , 9 9D"668 5*4!!$'5 *C&Qi**,

>0J/L.NO
2<,.rP   c           	        |D cg c]  }t        |t              s| }}|rSt        j                  dt	        |      |D cg c])  }|j
                  |j
                  j                         + c}       |D cg c]  }t        |t        t        f      s| }}|D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              r| }}|D cg c]  }|j                         s| }}|r t        j                  dt	        |      h       |D cg c]	  }||vs| }}|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz,ComboKernels: %d template nodes are filtered)
ru   r  r   r  r]   rB   rp  rY  r  r:  )r  r  rJ  externrB   filtered_nodesforeach_nodestemplate_nodess           rN   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes  ss    #Oj4M&N!OOIIAF5;UTtyy?T&&(U 
a"8:S!TU 
 
 &
A7Q)RA
 
 IICSEWX%
Z;U-VA
 
 &4Gq}}!GGII>^AT@U &4Oq7N!OO5 P
 V




 H
 PsL   EEEE:EE#5E# E(6E( E-E-	E2E2c           
         | j                         }g }d}|D ];  }|j                  t        dt        |      |      D cg c]
  }||||z     c}       = |S c c}w )zS
        Returns a list of lists of nodes that are to be grouped together.
           r   )_topological_sort_nodesr?  ranger]   )rA   sorted_nodesgrouped_nodesmax_num_nodesr  r  s         rN   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  sw     !88:! 	E   #1c%j-@ !a-/0	 s   A
4Callable[[Scheduler], List[List[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    | t         _        y rI   r  r  )custom_group_algorithms    rN   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#DrP   c                ,    t         j                  |       S rI   r   r   s    rN   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVrP   c                    t         rI   r  rL   s    rN   r   z#ForeachKernelSchedulerNode.mark_run  r  rP   c                    t        | j                  t        j                        sJ dt	        | j                                | j                  j                          | j                  j                                       y rF  )ru   rB   r   r)   rW   get_store_functionmake_loaderrL   s    rN   rS  z"ForeachKernelSchedulerNode.codegen  s\    $))R%6%67N<LDO;M9NN7&		$$&'>tyy'<'<'>'@ArP   c                     yr  r   rL   s    rN   r>  z%ForeachKernelSchedulerNode.is_foreach  r  rP   c                ,    t        | j                        S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r}   r  rL   s    rN   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DKK  rP   c                t    t        t        j                  j                  d | j                  D                    S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  <   K   | ]  }|j                           y wrI   )r'  r~  s     rN   r   z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s     1UA!++-1Ur&  )r}   r   r   rS  r  rL   s    rN   r'  z$ForeachKernelSchedulerNode.get_nodes  s(     IOO111U1UUVVrP   c                <    | j                   d   j                         S r  )r  r#  rL   s    rN   r#  z)ForeachKernelSchedulerNode.get_first_name  s    {{1~,,..rP   c                    t        | || j                  j                         | j                  D ]  }|j	                  |        y rI   )r  rA   rr   r  r  )rM   r  rB   s      rN   r  z/ForeachKernelSchedulerNode.prune_redundant_deps"  s=     	d$68R8RSKK 	:D%%&89	:rP   )r  rC   r   Optional[BaseSchedulerNode])r  rC   r   r  r  rC   r  rC   r   r   )r  rC   r  rC   r   r  )NNF)rA   r@   r  r  r  r   r  r  r  r  r  r   r   r   r  r  r   r  )rA   r@   r   List[List[BaseSchedulerNode]])r  r  r   r   r   r   r   r  r  r   r  )rX   r   r   r  r  r  r  r  r  r   r  staticmethodr  r  r   r  r  r   rS  r>  r  r'  r#  r  r  r  s   @rN   r  r    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %@/@/ (@/ $(	@/
 1@/ 1@/ @/ 
@/D +	  > 	& * 	/ & ( / 
 T
	
 
 WW	&W W
"B!
W
/:">:	:rP   r  c                       e Zd ZU dZded<   edd       Zd fdZddZddZ	e
dd       Zdd	Ze
dd
       ZddZddZedd       Z xZS )r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r  c                    |d   j                   t        fd|D              sJ  | |      }|D ]  }|j                  |j                         <   ! |j                  |j                         <   |S )Nr   c              3  :   K   | ]  }|j                   u   y wrI   r   )r   rB   rA   s     rN   r   z.GroupedSchedulerNode.create.<locals>.<genexpr>:  s     B44>>Y.Bs   )rA   r  r  rU   )r  r  grouped_snoder  rA   s       @rN   createzGroupedSchedulerNode.create7  sy    1I''	B6BBBBIv. 	KE=JI(()9:	KAN	$$]%;%;%=>rP   c                >    t         |   |       t        | ||       y rI   )r  r   r  r  s      rN   r   zGroupedSchedulerNode.__init__A  s    #i0rP   c                   | j                   D ])  }|| j                  j                  |j                         <   + | j                  j                  | j                         = | j                  j	                  | j                         S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  rA   r  rU   
fuse_nodes)rM   r  s     rN   unpackzGroupedSchedulerNode.unpackE  se    
 [[ 	HEBGDNN--enn.>?	HNN--dmmo>~~((55rP   c                    | j                  | j                  j                  |             | j                  j	                  |       y rI   )r   r   r   r   r  )rM   fake_deps     rN   r   z!GroupedSchedulerNode.add_fake_depO  s5    T--77AB##H-rP   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w r  r  r  s     rN   rU   zGroupedSchedulerNode.get_nameS  r  r  c                <    | j                   d   j                         S r  r  rL   s    rN   r#  z#GroupedSchedulerNode.get_first_nameW  r  rP   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w rI   r  r  s     rN   r,  z%GroupedSchedulerNode.get_buffer_namesZ  r  r  c                j    g }| j                   D ]!  }|j                  |j                                # |S rI   r  r  s      rN   r   z GroupedSchedulerNode.get_outputs^  r  rP   c                    | j                   S rI   r  rL   s    rN   r'  zGroupedSchedulerNode.get_nodesd  rD  rP   c                     yr  r   )r  r  r  s      rN   r  zGroupedSchedulerNode.can_fuseg  s     rP   )r  r  r   r  r  r  )r   r%   r   r   r   r  r  r  r  )rX   r   r   r  r   r  r  r   r  r   r0   rU   r#  r,  r   r'  r  r  r  s   @rN   r  r  +  s~     $# 16. = =) N N  rP   r  c           
          t         j                  d fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                t   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }t        ||           }}D cg c]  }t        ||          }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r   sl_asl_bs      rN   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  )      
)3tDAI$$
   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r*  s      rN   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  r-  r.  rk  )r1   absr  rR  )	abslstride_len_astride_len_ba_firstb_firstr3  stride_lengthss	          rN   	index_cmpz"pick_loop_order.<locals>.index_cmpw  s    8q=E!HMuQx1}eAh!m44 .<<rBqE
<<-;<rBqE
<<  
7:<7V
 
  
7:<7V
 
 WW 1ay# =<s   B0	B5r   rL  )r1  r   r2  r   r   r   )		functools
cmp_to_keyr}   r  r  r]   r   pick_loop_orderssort)r8  r3  priority_idxr9  orderpis   ``    rN   pick_loop_orderrA  m  s      4 %N1$5 6789E
<17CD.,D

y
!L Es   Bc                  T    e Zd ZU ded<   dZded<   dZded<   ddZddZdd	Zdd
Z	y)NodeUser$Union[BaseSchedulerNode, OutputNode]rB   Fr   rB  is_weakc                v    t        | j                  j                         | j                  | j                  f      S rI   )rJ   rB   rU   rB  rE  rL   s    rN   rO   zNodeUser.__hash__  s+    TYY'')4+;+;T\\JKKrP   c                    t        |t              xrW | j                         |j                         k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rI   )ru   rC  rU   rB  rE  rM   others     rN   __eq__zNodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
rP   c                6    | j                   j                         S rI   rd   rL   s    rN   rU   zNodeUser.get_name  re   rP   c                    | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S rI   )rB   rC  rB  rE  rH  s     rN   r|   zNodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rP   Nr   )rI  objectr   r   r   )rI  rC  r   rC  )
rX   r   r   r   rB  rE  rO   rJ  rU   r|   r   rP   rN   rC  rC    s3    
..K GTL
$
rP   rC  c                  z    e Zd ZU ded<   d8dZd8 fdZd9dZd:dZd;dZd<dZ	d:d	Z
d:d
Zd:dZ	 	 	 	 d=dZd>dZd?dZd:dZd:dZd=dZd:dZ	 	 	 	 d@dZd:dZdAdZ	 	 	 	 	 	 dBdZ	 	 	 	 d=dZdCdDdZdEdZ	 	 	 	 dFdZ	 	 	 	 	 	 dBdZ	 	 	 	 	 	 dBdZ	 	 	 	 	 	 	 	 dGdZ	 	 	 	 	 	 dBdZdBdZ 	 	 	 	 	 	 dBd Z!	 	 	 	 	 	 	 	 dHd!Z"dId"Z#	 	 	 	 	 	 dJd#Z$dKd$Z%	 	 	 	 	 	 dLd%Z&	 	 	 	 dMd&Z'	 	 	 	 dNd'Z(d:d(Z)d:d)Z*d:d*Z+dOd+Z,dOd,Z-d:d-Z.dPd.Z/dQd/Z0dQd0Z1dRd1Z2d:d2Z3d:d3Z4	 	 	 	 dSd4Z5dTd5Z6dUd6Z7d:d7Z8 xZ9S )Vr@   zDict[Dep, int]_Scheduler__dep_size_hint_cachec                f    t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)NzScheduler.__init__)r   _initrM   r  s     rN   r   zScheduler.__init__  s,    ./ 	JJu	 	 	s   '0c                N
    t                    i  _         t        j                  _        i  _        t        t               _	        t                _        t        g t        j                  j                  j                         t        j                  j                  j                         t        j                  j                  j                                _        |D cg c]  } j#                  |       c} _         j'                           j                   j)                  t        j                  j                  j                                 j$                  D ]  }|j+                            j$                  D ci c]  }|j-                         | c} _         j$                  D ci c](  }|j1                         D ]  }|j-                         | * c}} _         j.                  j5                          _        i  _        i  _         j=                           j?                   j$                         _         jA                           j$                  D ci c]  }|j-                         | c} _         jC                          tD        jF                  r:tI        jJ                   j$                   j2                   j6                         _        tL        xjN                  tQ         j$                        z  c_'        t        jR                  jU                   j$                         tQ         j$                         _+         jY                           j?                   j$                         _        t                _-        tD        j\                  $tE        j\                   j$                         _         j_                   j$                         _         ja                           jc                          tD        jF                  r$tI        jd                   j$                         _        tD        jf                  r ji                  d         jk                           jm                          t        jR                  jo                   j$                         t        jR                  jq                   j$                          js                          d  _:        t                _;        i  _<        t{        d      j}                   fd       y c c}w c c}w c c}}w c c}w )N)num_ck_nodesgraph_statsc                 ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr]   r  rL   s   rN   rK  z!Scheduler._init.<locals>.<lambda>  s'     33+/+>+>*-djj/ rP   )?r  r   rO  r;   rk   rA   backendsrb  _post_grad_graph_counterrZ  r   rZ  r  keys	constantstorchbind_constantsr  create_scheduler_noder  update_zero_dim_cpu_tensorr  r   rU   r  r   rr   copyr  r   mutation_renamescompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr    reorder_for_compute_comm_overlapr   decide_global_ordering_of_commsr   ir_nodes_pre_fusionr]   r  ir_pre_fusionr[  create_foreach_nodeslogged_slow_fusion_pre_fusion_custom_passr  merge_loopsfinalize_multi_template_buffers$reorder_compute_and_comm_for_overlapcombo_kernelscreate_combo_kernel_nodesprocess_grouped_nodescompute_last_usageir_post_fusiongraph_diagramdebug_draw_graphcurrent_devicebuffer_names_to_freeorigin_to_indexr   add_row)rM   r  r  rB   r   r  s   `    rN   rQ  zScheduler._init  s   %'" <>"&'?"@5?\!&0%%**,""'') ,,113'
# >CCd003C
'')##**177+<+<+A+A+CDJJ 	DOO	 &*ZZ;
 !AJJL!O;
 -1JJ8
$($BRBRBT8
;>CLLNC8
8
 AE@Q@Q@V@V@X 35 13!!#33DJJ?
""$<@JJ"Gq1::<?"G 22>>

  ''DJ 	##s4::6#	djj)!$**o!!#33DJJ?
?I|))577

CDJ__TZZ0
,,.22CCDJJODJ***=""$!	tzz*	djj) 7;5?\! :<'//	
S D;
8
2 #Hs   #TT2-T#T"c                8    | j                   x}r|S t        d      )NzNo current device)rz  RuntimeErrorrM   r  s     rN   get_current_device_or_throwz%Scheduler.get_current_device_or_throw&  s$    (((6(M233rP   c                    t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr   r  r  r  )rM   r  s     rN   ry  zScheduler.debug_draw_graph,  s1    ::>>:DASH+6 IrP   c                    t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)r   isEnabledForloggingINFOr   r  r   )rM   labelrB   s      rN   debug_print_nodeszScheduler.debug_print_nodes3  sF    GLL)HHUE"

 #  "# *rP   c                6   |j                         J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      rt        | |      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)rp  is_no_oprY  ru   r   r)   r#  rO  r  r  r  r  s     rN   ra  zScheduler.create_scheduler_node9  s    *	@?	@*==?)$55r00"2C2CDE t,,boo.,T488%d++rP   c                   t               }g }| j                  j                         }t        j                  j
                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        j                  dkD  }t        | |d|      }|j                  |       |D ]  }|| j                  |<     | j                  D 	cg c]  }	|	j!                         |vs|	 c}	t#        |      z   | _        y c c}w c c}w c c}	w )Nr   Fr  r  )r   r  r^  r;   rk   listsr~   ru   r  rY  r  r   combo_kernels_autotuner  r  r  rU   r}   )
rM   removed_node_namesfe_nodeskept_node_namesnamesrK   r  r  fe_noderB   s
             rN   rm  zScheduler.create_foreach_nodesF  sN   .8l11668WW]]))+ 	8E "?*"4#4#4T#:<RS E  %%e,:?@$d''-@F@$;;a?O0*/ /	G OOG$ 807''-81	88 "ZZ
4==?BT+TD
N
5 A
s   *D<EE#Ec                ^     t        d      } G fddt        |         t        j                         j                  D ]  }|j                         D ]}  }|j                         }|j                         D ]X  }|v r=|v r9|   }|   }||z   }j                         D ]  }	|	   |u s|	   |u s||	<    D|v r	|   |<   Q|   |<   Z   d  fd 	 	 d	 	 	 	 	 	 	 	 	 d fd}
i }t        j                  j                  j                         D ]6  \  }}t        |t        j                        s!|j                   D ]  }d||<   	 8  j                  D ]  }t"        j%                  d|j&                         |j&                  J t)        |j&                  j+                         d 	      }|D ]6  }t        |t        j,                        sJ ||vs$|j                         ||<   8 t)        |j&                  j/                         d
 	      }|D ]d  }||v sJ | d|        ||   x} j0                  |   j                         D ]*  }|j3                  t5        |j                                      , f t7        |j8                  j:                        dk(  rGt=        t?        |j8                  j:                              x}rt        |t@              r|jB                  }nd}|j                         D ]  }t7        |jE                               dk  sJ |jE                         D ]  }  |      } |
||       |j3                  t5        ||             |   j                  D ]  }|j                         |j                         k(  r%t        |j&                  tF              sJ |j&                  jI                         D ]?  }  |      }|j3                  tK        ||j                                       |
||d       A    |j8                  jL                  D ]6  }t        |tJ              r |
|jN                  ||jQ                  |             8 |jS                   jT                         |j                         D ]  }|jE                         D ]y  }|j                          jT                    |      <   |j                          jT                  |<    jV                  jY                  ||       jV                  |j                         <   {   t        j                  j[                         D ]3  }t"        j%                  d|        |
|t]        t5        |                   5 t        j                  j^                  D ]  }|j/                         D ]|  }||v sJ | d|j                                 ||   x}s) j0                  |   jI                         D ]4  }t"        j%                  d||        |
|t]        t5        |                   6 ~   jT                  D ]  }|t        j                  j                  v rE |
|t]        t5        |                   t        j                  j`                  jc                  |       d|t        j                  jd                  v s |
|t]        t5        |                    tg        t        j                  j                  j                               D ci c]  \  }}||
 }}}t        j                  j`                  D cg c]  }||   	 c}t        j                  _4         j                  D ]C  }|j                         D ].  }|jk                  |j                            j                         0 E yc c}}w c c}w )zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        Tc                  >    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	 fdZy)
1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nc                @    |xs g | _         |xs
 t               | _        y rI   )itemsr   
membership)rM   r  r  s      rN   r   z:Scheduler.compute_dependencies.<locals>.DedupList.__init__|  s    
 #[b
","<
rP   c                    || j                   v ry | j                  j                  |       | j                   j                  |       y rI   )r  r  r  r  )rM   	node_users     rN   r  z8Scheduler.compute_dependencies.<locals>.DedupList.append  s5    /

!!),##I.rP   c                    t        j                  | j                  |j                        }| j                  |j                  D cg c]  }|| j                  vs| c}z   } ||      S c c}w rI   )r   ry  r  r  )rM   rI  new_membershiprJ  	new_items	DedupLists        rN   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{*at.FA* 	 !N;;*s   A+A+rp  )r  zOptional[List[T]]r  zOptional[OrderedSet[T]]r   r   )r  r  r   r   )rI  DedupList[T]r   r  )rX   r   r   r  r   r  r  )r  s   rN   r  r  r  s;     ,06:=(= 4= 	=/<rP   r  c                N    | j                   v r j                   |          S | S rI   )rd  )r  r   rM   s    rN   r   z.Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677HrP   c                P     |          j                  t        |||             y rI   )r  rC  )used_by_name	user_noderB  rE  name_to_usersr   s       rN   add_userz0Scheduler.compute_dependencies.<locals>.add_user  s)     &./66K9rP   Nzscheduling %sc                    | j                   S rI   r   rI  s    rN   rK  z0Scheduler.compute_dependencies.<locals>.<lambda>  
    AFF rP   rL  c                    | j                   S rI   r   rI  s    rN   rK  z0Scheduler.compute_dependencies.<locals>.<lambda>  r  rP   z not in r   )rj  )mutating_bufT)rE  zscheduling output %sz+scheduling output %s for unbacked symint %s)r  r   r   r   )FF)
r  r   r  rD  rB  r   rE  r   r   r   )6r   r	   r  r  r  r   rU   rZ   r^  r;   rk   r  r  ru   r\  r  free_symbolsr   r  rB   rV  get_unbacked_symbol_defsSymbolget_unbacked_symbol_usesr  r   r'   r]   r   r   rb  rc  r&   rj  r\   rC   r,  r(   r   rK   rB  r   rd  r   r   get_output_namesrw   graph_outputsmutated_inputsr  r_  r  mutated_input_idxsr   )!rM   r  rB   buf1	buf1_name	buf2_namelist1list2combinedrM  r  unbacked_symbol_to_origin_noderK   valfsunbacked_symbol_defsr  unbacked_symbol_usesr  r   r   	node_modealt_namera   
other_namerb  r0  r   rd  	inp_namesr  r  r   s!   `                             @@@rN   re  zScheduler.compute_dependenciesj  s    CL	<
 	<> @K?V?V@
 JJ 	LD((* L MMO	!%!1!1!3 LI M1i=6P -i 8 -i 8#(5=#0#5#5#7 >C -c 2e ;#0#5#>5=c 2> #m33@3Ki03@3Ki0LL	L(	 !&!			;	 	 		
 	 MO&
 --335 	>ID##uzz*** >B9=226>	>
 JJ J	HDIIotyy1 99(((#)		224:J$  * H!!U\\222 ::8<215H $*		224:J$  * C77BS!? @AB77::AG#003??A C))'#,,.*ABCC D$$++,1 d&6&6&=&=!>??S?sI.HH	 	 '') E3,,./1444 # 1 1 3 EH%h/HXt,%%ghY&GH -h 7 = = E==?dmmo=$)$))5FGGG*.))*D*D*F EJ)/
);J -- '
 P %ZtDEEEE, ((.. F!$0TYYd.>.>t.DEF %%d&;&;< '') H # 1 1 3 HH>AllnD))&*:;69llnD))(3 //33HhG ++HHIJ	HZ 002 	>HII,h7Xz'(*;<=	>
 77(( 
	JC113 	J77IS!?!D!D!F GHI76q9919$($5$5a$8$I$I$K J		I8UV !:gh6G+HI	J	J
	J )) 	:Dqww+++z'$-89&&**40***z'$-89	: ,5QWW5I5I5N5N5P+Q
'E4D%K
	 
 )*(>(>&
 $IdO&
"
 JJ 	CD'') CmCLLN;AABC	C
&
s   ^$/^*c                   g }t        | j                        D ]!  }ddd}|j                         D ]  }t        fd|j                  D              }|r\t
        j                  d|j                                t        j                  j                  j                  |j                                d} |j                          xr | }|s|j                  |       t
        j                  d|j                                t        j                  j                  j                  |j                                $ t        t        |            | _        | j                  D ]  }|j!                           y)	z0
        Remove any nodes without users
        c                r    | j                   xs* | j                         t        j                  j                  v S rI   )rE  rU   r;   rk   r  )ra   s    rN   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_userB  s&    ||Tt}}!'':T:T'TTrP   Fc              3  .   K   | ]  } |        y wrI   r   )r   ur  s     rN   r   z2Scheduler.dead_node_elimination.<locals>.<genexpr>G  s     #Ma$6q$9#Ms   zremoved dead buffer: %sTzremoved dead operation: %sN)ra   rC  r   r   )r  r  r   r  rG   r   r  rU   r;   rk   rW  r  rD  r  r  r}   r  )rM   updated_nodesrB   active_buffersr   can_eliminater  s         @rN   rg  zScheduler.dead_node_elimination8  s-    TZZ( 	@DU #N'') * ##M399#M M II7HGG++//?%)N* !% 5 5 77N<NM $$T* 		6H**..t}}?+	@. (=12
 JJ 	#D  "	#rP   c                    t               t               g dfd|D ]  }|j                         D ]  }||<   	  |D ]
  } |        S )z?
        Ensure nodes is in topologically sorted order
        c                    | vrdj                  |        t        | j                  d       D ]&  }|j                  vr |j                            ( j	                  |        y y )Nc                    | j                   S rI   r   )ds    rN   rK  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>j  s
    aff rP   rL  )r  rV  r   rK   r  )r  r   r  r`   seenvisits     rN   r  z2Scheduler.topological_sort_schedule.<locals>.visitg  se    }!!"6"6<LM 2Cxx|3 ,sxx01	2
 a  rP   )r  rC   r   r   )r   rQ  r,  )rM   r  rB   rK   r  r`   r  r  s       @@@@rN   rf  z#Scheduler.topological_sort_schedule]  sr     /9l59V*,	! 	!  	*D--/ *%)T"*	*  	D$K	rP   c                j    t               }t        |t        t        t        t
        f      r-|j                  D ]  }|j                  |j                          nt        dt        |       d       fd|D        }t        |D ch c]  } j                  |j                            ! c}      S c c}w )Nz+get_unmet_dep_nodes is not implemented for .c              3  P   K   | ]  }j                   |   j                    y wrI   )rr   rD   r  s     rN   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>  s"     Qs))#.::Qs   #&)r  ru   rO  r  rY  r  r   r  rK   r  rW   r}   r  rU   )rM   r  
unmet_depsr   unmet_dep_opsr  s   `     rN   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodesx  s    U
)&"	
 // )sxx() =d5k]!L  RjQMRqT,,QZZ\:RSSRs   $B0c                z   g }t         j                  | j                  d      }i }| j                  D ]P  }| j                  |      }t	        |      ||<   |D ]*  }|j                  |g       }|j                  |       |||<   , R |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|j                  |
       |
D ]7  }|j                  |g       D ]  }||xx   dz  cc<    |j                  |       9 |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|rJ d       |S c c}	}w c c}	}w )zU
        Sort nodes by their topological order, return a list of node lists.
        r   r   zTopological sort failed!)	rQ  fromkeysr  r  r]   r   r  r  r  )rM   r?  r  childrenrB   r  r   cr  vzero_deg_nodesra   s               rN   r  z!Scheduler._topological_sort_nodes  sF    djj!,#%JJ 	"D,,T2Dd)E$K "LLb) !"	" ).@1a!@@LL(# $LLB/ %D$K1$K%		! -2KKMDDAqQ!VaDND  444y A Es   D1%D1D7D7c                ~   i }| j                   D ]  }t               }|j                  D ]L  }| j                  |j                     j
                  j                         }|j                  |       |||   z  }N |||j                         <   ||_         t        | j                         D ]  \  }}||_
        ||_         y)z.
        Populate each node.ancestors
        N)r  r   r   rr   rK   rD   rU   r  r   r  r   r   )rM   name_to_ancestorsrB   r   r   dep_node_namer?  s          rN   rh  zScheduler.compute_ancestors  s    
 9;JJ 	'D)3I.. > $ 0 0 : F F O O Qm,.}==	> 2;dmmo.&DN	' %TZZ0 	#KE4"DN"DN	#rP   c                   | j                   D ]  }t        j                  st        |t        t
        f      r0|j                         j                  dk7  rt        j                  dk7  r[|j                         D ]o  }t        |t              r|j                         r$|j                  j                         |_        |j                  j                  |_        |j                  d       q  y )Nr   halideTr!  )r  r   r)  ru   rO  r  r3  rW   cpu_backendr'  r:  r&  rp  r3  r%  r0  )rM   rB   r  s      rN   rp  zScheduler.merge_loops  s    JJ 	;D44 d]4F$GH!&&&0V5G5G85S) ;!%75;L;L;N#kk557${{00
 **T*:;	;rP   c                *   t        d      D ]  }t        |      }t        j                  d|dz   |       | j	                  |      }t        |      }t        j                  d|dz   ||       ||k(  s|dk(  sjt        j                  d|dz           |S  |S )zB
        Combine eligible nodes into FusedSchedulerNodes.
        
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====)r  r]   r  r  fuse_nodes_once)rM   r  r  old_lennew_lens        rN   r  zScheduler.fuse_nodes  s     r 	A%jGAA
 ((/E%jGPA	 '!W\  !NPQTUPUV%	$ rP   c                    g }| j                   D ]4  }|j                  t        |t              r|j	                         n|g       6 || _         y)zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r?  ru   r  r  )rM   	new_nodesrB   s      rN   ru  zScheduler.process_grouped_nodes  sJ     .0	JJ 	D!+D2F!GdV	 
rP   c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }|j	                  |      S 
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   )r]   r3  rz  r'  benchmark_fused_nodes)rM   r  r  backends       rN   r  zScheduler.benchmark_fused_nodes  sO     5zA~~q$$&$""6*,,U33rP   c                ^   	 	 	 	 	 	 dd}t        | j                        D ]
  \  }}t        |t              st        |j                  t
        j                        s=|j                  }|j                         \  }}t        |t        j                  j
                  j                        r|j                  j                  |       |j                         }|j                  }t        |t
        j                        sJ |j                  }	t        |	t
        j                        sJ |j                   |	_         |||	       | j#                  |	      }
|
| j                  |<   |
| j$                  |j'                         <   |
| j(                  |j'                         <   t+        |
j-                         |j-                               D ]3  \  }}|| j.                  |j'                         <   |j0                  |_        5 |j2                  |
_        |j4                  |
_        |j6                  |
_         y )Nc                   |j                         }| j                         }t        |t              rt        |t              sJ |j                         }| j                         }t        |t              rt        |t              sJ t        j
                  j                  |= ||_        t        j
                  j                  |= ||_	        t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   y rI   )rU   ru   r   r   r;   rk   r  rK   
name_to_opoperation_namebuffersrd  remove
operations)	orig_noder  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rN   replace_operation_bufferzKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer	  sW    !) 1 1 3%..0MmS1jARTW6XXX'::<$779LlC0Z@PRU5VVV&&'89)HM""#34&2H#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,rP   )r  zir.MultiTemplateBufferr  zir.OperationBufferr   r   )r  r  ru   rO  rB   r   MultiTemplateBufferget_min_choicer   r   TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferrY   ra  r  rU   r  rR  r   rr   rG   r   r   r   )rM   r  r  rB   
multi_nodemin_node_unfusedr  out_tensorboxout_storage
out_buffernew_scheduler_nodenew_outold_outs                rN   rq  z)Scheduler.finalize_multi_template_buffers
	  s   	8-	89K	8	86 !, $	@GAt$.:		2114 "YY
&0&?&?&A# !$OO&&?? II778HI 0 < < >+00!+r}}===(--
!*b.@.@AAA$.$5$5
!(Z@%)%?%?
%K" 2

15G!!$--/2;M''8(+&224d6F6F6H) 2$GW <CD$$W%5%5%78$+MMGM	2 04~~",/3~~",04"-I$	@rP   c                &    t        d |D              S )Nc              3     K   | ]q  }t        |j                  d       xrU |j                  duxrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   s yw)r   Nscatter_moderk  )rn   rB   r   r  r  s     rN   r   z,Scheduler._any_atomic_add.<locals>.<genexpr>M	  so      

 	 AFFF# 9d"9^49 ((L89
s   A7A9)r   rM   	node_lists     rN   _any_atomic_addzScheduler._any_atomic_addL	  s     

 
 
 	
rP   c                   j                         xr( t        j                         t        j                        }t
        j                  s|syj                         r(t        j                         t        j                        r j                         sj                         ryj                         }|d   j                         }|j                  dk(  ryj                         }t        t        j                  ||            }| j                  |      ryddlm} t%              }	dfd}
t        t&              r\t        j(                  t        j                        r7j(                  }|j*                  }|j-                         \  }| j/                  |      \  t1        d      }d}d}t3        |j5                         d 	      D ]  \  }}t        |t6        j8                  j                  j:                        s5|z   k\  r n]|d
z  }|t
        j<                  kD  r nCj(                  j?                  |      5  | j/                  |      \  }|k  r}|}ddd         |
|       |z   k  r|j(                  jA                  |       yy	 | j/                  |      \  tC        jD                        r	 |	d       y| j/                  |      \  tC        jD                        r	 |	d       y| j/                  |      \  tC        jD                        r	 |	d       y	  |
       tI        d      rWz   k\  rOf| jJ                  vr?| jJ                  jM                  f       tO        d      jQ                  fd       z   k  S # 1 sw Y   xY w# |$ r}dtG        |      v rY d}~y d}~ww xY w)
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        Tr   cpuCompilationErrorc           
     t   t         j                  t        j                        r| ||z   k  rFt         j	                  dj                         j                         t        ||z   | z  d             y t         j	                  dj                         j                         t        | ||z   z  d             y y )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  r,  r-   r.   )ms_fusedms1ms2r  r  s      rN   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion	  s    &&w}}5cCi'$$S..0..0"sSyH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rP   infNc                    | d   S r  r   rI  s    rN   rK  z-Scheduler.speedup_by_fusion.<locals>.<lambda>	  s
    ad rP   rL  r   Fz%register spilling of the first kernelz&register spilling of the second kernelz%register spilling of the fused kernelLoop-carried variableslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r"  r#  r!  path1path2
path_fuseds   rN   rK  z-Scheduler.speedup_by_fusion.<locals>.<lambda>	  s(    $)'*$)'*)3,4'/39'= rP   )r!  r  r"  r  r#  r  r   r   ))r:  ru   r  r   r  r   benchmark_fusionTritonTemplateBufferr>  r'  r3  rW   r}   r   r   r  triton.compiler.errorsr  r  rO  rB   choice_timingsr  r  r  rV  r  r   r   r   max_epilogue_benchmarked_choicesswap_as_triton_callerr  mathisinfr   r   rn  r  r   r}  )rM   r  r  is_multi_templatenode_list_1r  node_list_2node_list_fusedr  r  r$  r  r7  r  min_ms_fusedms_fused_choicetriton_choiceschoiceunfused_timer  r"  r#  r!  r1  r2  r3  s    ``                 @@@@@@rN   speedup_by_fusionzScheduler.speedup_by_fusionU	  s    "--/ 
J##%r'='=5
 &&/@ u668":Q:QR!! oo'Q**, ;;%oo'y{KHI
 0;u%	" e]+
JJ..1
 J'66N..0FAs33K@JC <L"ON(.$$&N) 1$ "&%//*<*<*U*UV39,!#!F$K$KK ZZ55f= 1"&"<"<_"MKHa,.'/*01 11, |S#. sSy)o.I

44_E!77D
U::c??@ !77D
U::c?@A '+'A'A/'R$*::h'?@  ( 	8S#&#M2C#I%d&=&==##''7]+33 
 #)##o1 1< $ *c!f4s6   N5*1O 1O 1O 5N?	OOOOc                &   t        |      }t        j                  t        j                        rBt        j                  d       |D ](  }t        j                  d|j                         z          * | j                  |      D ]V  \  }}| j                  |j                            }| j                  |j                            }| j                  ||      sT| j                  ||      rg| j                  ||      szt        j                  d|j                         |j                                |j                         }| j                  |      j!                  ||      }|j#                  |       |j#                  |       |j%                  |       | j                  j'                  |j)                         D ci c]  }|j                         | c}       Y t+        |d       }| j-                  |      }| j/                  |       |S c c}w )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  zfusing %s with %sc                    | j                   S rI   r}  rI  s    rN   rK  z+Scheduler.fuse_nodes_once.<locals>.<lambda>
  s
    !++ rP   rL  )r   r  r  r  r   r  r   get_possible_fusionsr  r#  r  will_fusion_create_cyclerE  rU   r3  r'  r  r  r  r  r'  rV  rf  r  )	rM   r  r  rB   r  r  r  node3r  s	            rN   r  zScheduler.fuse_nodes_once	  s    !'""7==1;<# @  (<(<(>!>?@ 55e< 	LE5++E,@,@,BCE++E,@,@,BCE}}UE*43P3Pu4 --eU;  ')95>>;K
 ))+((055eUC""5)""5)&''..27//2CDQQZZ\5(D%	* {(=>..u5!!%( Es   :H
c                   t        | j                        }d}t        | j                        }t        j	                  d|       t        t        j                  |             D ]  \  }}t        j                  |      }t        |      dk  r+|||kD  r n| j                  |      st        j	                  d|       \|dz  }t        j                  dkD  }t        |d   j                  |d|      }t        j                  d	t        |      |       |D ]  }	|j                  |	        |j                  |       | j                   j#                  |j%                         D 
ci c]  }
|
j'                         | c}
       ! t)        |d
       | _        | j+                  | j                        | _        t        j                  d||t        | j                               | j-                  | j                         yc c}
w )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %d...rm  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                    | j                   S rI   r}  rI  s    rN   rK  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>6
  s
    q{{ rP   rL  zEGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodels)r  r  r]   r   r  r  r  r  r  speedup_by_combo_kernelr   r  rA   r   r  r  r  r  r'  rU   rV  rf  r  )rM   rT  r  countnum_nodes_orignumr  r  ru  rB   r  s              rN   rt  z#Scheduler.create_combo_kernel_nodes
  s    $**oTZZ		FU'&DDTJ
 	NC 3CCINI9~!'EL,@//	:		EsKQJE$;;a?O4!&&*. /	K HHBI
 " )""4()OOK(##**4?4I4I4KLq{*L7	< K-BC
33DJJ?
S

O		
 	!!$**- Ms   !G=
c                H    |D ]  }|j                  | j                          y rI   )r  r  )rM   r  rB   s      rN   r  zScheduler.prune_redundant_deps@
  s%     	?D%%d&=&=>	?rP   c                |   	
 g 	t               
d	
 fd}t        j                  t              }|D ]+  }|j	                         D ]  }||   j                  |        - |j                         D ]
  } ||        t        j                  rat        j                  t              }|D ]&  }t        |dd      }|s||   j                  |       ( |j                         D ]
  } ||         j                  	      		j                   j                  d       t        j                  dt        	             	S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                P   t        |       D ]  \  }}| |dz   d  D ]  }||f}|v rj                  |       j                  ||      rj                  |       A|j	                         s|j                         sbj                  ||      suj                  ||f         y r  )r  r  r  r  r:  r>  )r  node1_indexr  r  rM  possible_fusionsr  rM   s        rN   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsM
  s    &/&6 @"U";?#45 @E %.Cd{ HHSM}}UE2(//4++-1A1A1CuJ )//?@@rP   r   NT)rM  reversezfound %d possible fusionsr  r  r   r   )r   r  r  r}   r   r  r~   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr=  score_fusion_keyr  r  r]   )rM   r  rV  buffer_names_groupingrB   r   node_groupinggroup_groupingr   rU  r  s   `        @@rN   rH  zScheduler.get_possible_fusionsD
  sE    HR	@  !, 7 7 = 	8D--/ 8%c*11$78	8 399; 	+MM*	+ ##(44T:N 7gt4"5)0067 "0!6!6!8 /./  JJ
 	$"7"7F4c:J6KLrP   c                    t               d fd|j                         j                  j                         |j                         j                  j                         z  |j                  j                  j                         |j                  j                  j                         z  z
  t         fdD              }|r t        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                   t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3  H   K   | ]  } j                   |           y wrI   r  r   r  
found_pathrM   s     rN   r   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>
  s+      H #4#:#:1#=>H   ")ru   r  r  r(  issubsetr   r   r   )rB   combined_ancestorscombined_namesrd  rM   visiteds    rN   rd  z6Scheduler.will_fusion_create_cycle.<locals>.found_path~
  s    $ 23G8KD!++-667IJ !   ?@ C H!%2D!DH E  rP   c              3  H   K   | ]  } j                   |           y wrI   rb  rc  s     rN   r   z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>
  s!     WqJt66q9:Wre  zwill create cycle)rB   rC   r   r   )r  r(  _dictr^  r   r   r  )rM   r  r  cyclerg  rh  rd  ri  s   `   @@@@rN   rI  z"Scheduler.will_fusion_create_cyclet
  s     ,/5	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWW#IeU#$78rP   c                    t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r0  r   r   )rM   r  r  proximity_scores       rN   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory
  sE    * %//12%//12
 ##rP   c                   i }|j                   j                         D ci c]  }|j                  | }}|j                   j                         D ci c]  }|j                  | }}|D ]\  }t        j                  j                  |      }	||   }
||   }|
j                         |j                         k7  r(d|
j                          d|j                          ||<   vt        |
j                        t        |j                        k7  rd||<   t        |
t              rt        |t              sdt        |
       dt        |       ||<   |
j                         }|j                         }||k7  rd| d| ||<   |
j                         |j                         k(  rd|
 d| ||<   Ed|
 d| d|	j                   ||<   _ t        |      S c c}w c c}w )	z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        zdifferent numel: z v.s. 	broadcastznot MemoryDep: zdifferent offset: zMismatch loop orders: zUnknown reason: z
. Layout: )r   r=  rK   r;   rk   r  r  r:   re  ru   r&   rW   
get_offsetnormalize_with_stride_orderrY   r   )rM   r  r  common_buf_namesreasonsr   node1_name2depnode2_name2depr0  r   lhs_deprhs_deplhs_offrhs_offs                 rN   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason
  s    383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX( )	RH''$$X.C$X.G$X.G  "g&7&7&99 ((9(9(;'<F7CTCTCVBWX   W\\*mGLL.II$/!gy1GY9W &d7m_F4=/J  ((*G((*G'! '9	y$Q! 3356689 '=WIVG9$U!
 #7)6'*SZZLQ O)	RV 7|] YXs   GGc                z   t         j                  rt        d ||fD              ry|j                  j	                         }|j                  j	                         }||z  }|sy|j                  j                         D ci c]  }|j                  | }}|j                  j                         D ci c]  }|j                  | }}g }	|D ]y  }
||
   }||
   }|j                         |j                         k(  s/|	j                  t        j                  j                  j                  |j                         d      ||f       { t        |	      dk(  ryt        |	dd       d   \  }}}|j                   |j                   k7  r!|j#                         |j#                         k(  S |j%                         s|j'                  ||       nV|j%                         s|j'                  ||       n3t(        j+                  d|j-                         |j-                                | j/                  ||      dkD  S c c}w c c}w )	z
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatibile with node1 if that's more efficient.
        c              3  V   K   | ]!  }|j                         j                  d k(   # yw)r  N)r3  rW   r  s     rN   r   zBScheduler.has_shared_data_after_reordering_loop.<locals>.<genexpr>  s&      8
-.ALLN5(8
s   ')Fr   r  Tc                    | d   S r  r   rI  s    rN   rK  zAScheduler.has_shared_data_after_reordering_loop.<locals>.<lambda>'  s    QRSTQU rP   )rW  rM  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r)  r   r   buffer_namesr=  rK   rt  r  r;   rk   r  r  r  r]   rV  r7  r"  r5  r   r:  r  rU   score_fusion_memory)rM   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr   rw  rx  
candidatesbuffer_namery  rz  numels                 rN   %has_shared_data_after_reordering_loopz/Scheduler.has_shared_data_after_reordering_loop
  s+    00C 8
38%.8
 5
 "..;;="..;;=03EE"383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX 
. 	K$[1G$[1G3356689 !!((2273D3D3FQR2S	 z?a #)T~"V#
w w///
 $$&'*;*;*=== !!#++GW=##%++GW=##Q   ''u599_ YXs   >H30H8c                R    u ryt              }t        t              st        t              r	 |d       yt        t        t        f      rj                         s	 |d       yt        t        t        f      rj                         s	 |d       yj                         j                  z  r	 |d       yj                         r	 |d       yj                         r9j                         s j                         st        j                  s	 |d       yj                         j                         z  t        j                  j                  z  r	 |d       yj!                         }j!                         }||k7  r |d	||       y~ j#                        d
k(  }|r j%                         }t&        j)                  dj+                         j+                         |rdnd       |rt        j,                  r j                         sj                         rt/        d      rlj0                  j3                         j0                  j3                         z  t5              d
kD  r)t7        d      j9                   fd        |d       y |d       yj;                         s]j;                         sMt5        j=                               t5        j=                               z   t        j>                  kD  r	 |d       yj                         j                  z  r4 jA                        sy jC                  |      jA                        S  jE                        r	 |d       y jC                  |      jG                        S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        Fz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2z!templates can only fuse epiloguesztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)r   z%s and %s has%s shared dataz nor   'fusion_failure_due_to_indexing_mismatchc                 B   t         j                  j                  t         j                  j                  j	                         j	                         t        j                               t        j                               t               j                         dS )N)pre_grad_graph_idrZ  
node1_name
node2_namenode1_debug_strnode2_debug_strr  failure_reason)	r;   rk   rW  rZ  rU   r    rb   r}   r}  )ru  r  r  rM   s   rN   rK  z$Scheduler.can_fuse.<locals>.<lambda>  su    121A1A23''2L2L*/..*:*/..*:/9%//:K/L/9%//:K/L378H3I.2.L.L %u.>/! rP   z'no shared data due to indexing mismatchzno shared datazexceeds max fusionzwill increase peak memory)$r  ru   r  r  rY  r:  r(  r   r   r5  r   epilogue_fusionr,  r;   rk   no_fuse_buffer_namesr3  r  r  r:  r  rU   rY  r   r   r  r]   r   r}  r>  r'  max_fusion_sizecan_fuse_verticalr'  rp  can_fuse_horizontal)rM   r  r  r  r  device2no_shared_dataru  s   ```    @rN   r  zScheduler.can_fuse@  sF    E>u%e12j'7
 ABu8:PQR%%'()u8:PQR%%'()$$&8,-34**,!!#))12 ""$u'='='??GG(() 56!!#""$W,fg>11%?1D!%!K!Ku" N 	)NNNN#E		
 ((E,>,>,@EDVDVDX&'PQ%%224u7H7H7U7U7WW ! '(1,$%NOWW AB  !   "$$&EOO%&U__->)??&BXBXX$%$$&8))%7##F+==eUKK33E5A/0##F+??uMMrP   c                   |j                         }|j                         }t               }t        ||      }|j                  j
                  D ]H  }t        |t              s|j                  D ]&  }| j                  ||      s|j                  |       ( J |j                  D ]8  }	t        |	t              s| j                  |	||      s(|j                  |	       : t        d |j                  |z
  D              }
|
|z  r	 |d       y|
D ]O  }| j                  |   j                  j                         }|| j                   |   j"                  z  sG |d        y y)a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  4   K   | ]  }|j                     y wrI   r   r   s     rN   r   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
CHH$
r   zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r,  r(  r   r  r   r   ru   r&   r   fusable_read_and_writer  r(   fusable_weak_deprr   rD   rU   r  r   )rM   r  r  node1_buf_namesnode1_op_namescomputed_depsr  cdr  r   remaining_depsrK   r  s                rN   r  zScheduler.can_fuse_vertical  sb     002224)3u%##** 	*Bb),.. *..r26!%%b)*	* ++ 	'C#w'D,A,A#ue,T!!#&	' $ $
 % 8 8= H$
 
 O+
 +," 	D&&t,88AACG 7 7 @ J JJ>?		 rP   c                    |j                   |j                         vry|j                  j                  D cg c]  }|j                   |j                  k(  r| }}t        |      dk7  ry|d   t        t              sJ t        j                  t        j                        ry| j                  |j                     }|j                  j                  D cg c]  }|j                   |k(  s| }}t        fd|D              S c c}w c c}w )NFr   r   c              3     K   | ]q  }t        |t              xr[ t        |j                  t        j
                         xr4 |j                  j                  k(  xr |j                  j                  k(   s y wrI   )ru   r&   r   rd  r   TMPre  )r   rb  writes     rN   r   z-Scheduler.fusable_weak_dep.<locals>.<genexpr>  sm      

 	 tY' ('

DHH==(

ekk)( 		UZZ'(
s   A7A:)rK   r,  r   r   r  r]   ru   r&   r   rd  r   r  r   r   r  )	rM   weak_depr  r  r  mutating_writes	real_namerb  relevant_readss	       `    rN   r  zScheduler.fusable_weak_dep  s
    == 6 6 88 **11
zzX222 
 

 1$"%+++u{{DHH5++H,A,AB	"..44
		Y8ND
 
  

 '
 
 	
#

s   "DD,Dc                8   t        |t              rn|j                  |j                  k(  r|j                  y| j                  j	                  |j
                  |j
                        }||j
                  k7  sHt        |j                  t        j                        s$t        |j                  t        j                        ryt        j                  r9|j                  |j                  k7  r |j                         }|j                         }|j                  |j                  k(  xr\ t        |j                        t        |j                        k\  xr/ |j                  d t        |j                         |j                  k(  S t        |t               r| j                  j	                  |j
                  |j
                        }| j                  j	                  |j
                  |j
                        }|j                  |j                  k(  r|j                  ||k(  ryy)NTF)ru   r&   rj  rd  r   rK   r   rd  r   r  r   r)  r7  r"  r]   re  r'   )rM   rb  r  	read_name
write_names        rN   r  z Scheduler.fusable_read_and_write  s|   dI&yyEJJ&5::+A--11$))TYYGI UZZ'&tzz488<&u{{DHH=00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rP   c                d   | j                  ||      }t        t        |j                  |j                  z
        t        |j                  |j                  z
               }|j                         t        j                  k(  xr |dkD  |j                         |j                         k(  xr |dkD  ||fS )a\  
        Assign a score (higher comes first) to the fusion of node1
        and node2.  When different fusions conflict with each other,
        this is the way we decide what order to run them in.

        Our current score is based on:
        - Estimate of the saved memory operations
        - Fusions closer together in original order
        r   )	r  r  r0  r   r   r:  r   epilogue_fusion_firstr5  )rM   r  r  memory_scorero  s        rN   score_fusionzScheduler.score_fusion)  s     //u=%//12%//12
 

 6#?#??TLSTDT E$6$6$88M\A=M	
 	
rP   c                    d}|| j                   vr2	 |j                         s|j                         }|| j                   |<   |S | j                   |   }|S # t        $ r Y -w xY wr  )rO  has_unbacked_symbolsnumbytes_hintKeyError)rM   r   ress      rN   dep_size_hintzScheduler.dep_size_hintA  sz    d000//1++-C /2D&&s+ 
 ,,S1C
   	s    A 	A A c                @    t        |j                  j                        t        |j                  j                        z   }t        |j                  j                        t        |j                  j                        z   }t	        ||      dz  t        ||      kD  r||kD  r|}|}|}g }|j                  j                  |j                  j                  z  D ]D  }||j                  j                  v s||j                  j                  v s4|j                  |       F t         fd|D              S |j                  j                  |j                  j                  z  |j                  j                  |j                  j                  z  z  }t         fd|D              S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        r  c              3  @   K   | ]  }j                  |        y wrI   r  r  s     rN   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>g  s     ?3t))#.?   c              3  @   K   | ]  }j                  |        y wrI   r  r  s     rN   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>l  s     Is4%%c*Ir  )r]   r   r   r   r  r  r  r  )	rM   r  r  node1_dep_lennode2_dep_lentmpr  r   common_memory_depss	   `        rN   r  zScheduler.score_fusion_memoryQ  sd    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT }m,q03}m3TT},D((..1B1B1I1II %%++111SE<M<M<T<T5TKK$% ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIrP   c                   t        |      dk(  r|S i }|D ]  \  }}|j                         |j                         k(  sJ |j                         }t        | j                  |      j	                  ||            }||vr	||fg||<   p||   j                  ||f        t        |j                         t        j                  d            d   }t        |      dkD  sJ |S )Nr   rL  r   )
r]   r3  r   r'  get_fusion_pair_priorityr  r  r  operator
itemgetter)rM   rU  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           rN   rZ  z4Scheduler.get_possible_fusions_with_highest_priorityn  s   
  A%##  	+ - 	LE5##%)9)9);;;;%%'F#&  (AA%O$  $+MMENL23GH 33GHOOEN	 25.446H<O<OPQ<R2

2. 9:Q>>>55rP   c                0    |\  }}| j                  ||      S )z-
        Shim for list.sort(key=...)
        )r  )rM   r  r  r  s       rN   r[  zScheduler.score_fusion_key  s      u  ..rP   c                    t        t        j                  j                               }t	        | j
                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   r;   rk   r  r  r  r   r   r  r   )rM   r   rB   s      rN   rv  zScheduler.compute_last_usage  s]    
 0:!'':R:R:T/UTZZ( 	8D 3T5L5LM&&t7	8rP   c                   t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   j                  }t        |t        j                        r|j!                         sJ t        j                  j
                  j                  |j                          | j                  j#                          y)z*Free any buffers that are no longer neededN)rV  r{  r;   rk   rW  rl   freedrr   ry   codegen_freerB   r  r   ru   r   r	  is_input_bufferclear)rM   rK   r   storages       rN   free_bufferszScheduler.free_buffers  s   %%gg%%&gg""(()
 	@D
 t'''&&t,<<>GG((55chh?---''..t499!'2==9g>U>U>WWW$$11',,?	@ 	!!'')rP   c                    t         fdt        j                  j                  D              }g t        j                  j                  D ]t  }| j                  vrj                  |       # j                  |   j                  }|J t        d |D              }|j                  |      sdj                  |       v d fd}t        t        |            D ]  }|t        j                  j                  j                  v rt        j                  j                  j                  |   }t        |t              r|j                  d      rrt        fd|j                   D              }|r j#                  |       t        j                  j$                  j'                  |       ͉ j)                  |        y)zr
        Any buffers that are both created and have a last use in the
        same kernel can be removed.
        c              3     K   | ]9  }|j                   v r)j                   |   j                  j                          ; y wrI   )rr   rD   rU   )r   r   rM   s     rN   r   z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>  sA      &
d&&& S!--668&
s   ?ANc              3  V   K   | ]!  }|j                   r|j                          # y wrI   )rE  rU   r  s     rN   r   z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>  s     U4t}}Us   ))c                    | t         j                  j                  vxrF | t         j                  j                  j                  vxr | j
                  vxr | j                  vS rI   )r;   ro   must_keep_buffersrg   input_buffersrd  r   )r  rM   s    rN   remove_filterz<Scheduler.remove_kernel_local_buffers.<locals>.remove_filter  s]    333 5QXX]]8885T2225 T444	rP   REMOVEDc              3  &   K   | ]  }|v  
 y wrI   r   )r   r  names_to_removes     rN   r   z8Scheduler.remove_kernel_local_buffers.<locals>.<genexpr>  s     KaQ/1Ks   )r  r   r   r   )r   r;   ro   store_buffer_namesrr   r  rG   rf  r}   filterrg   rP  ru   r   
startswithr  other_namesremove_inplace_bufferinplaced_to_remover  remove_buffer)	rM   fused_node_namesout_bufrG   r  rK   r   r  r  s	   `       @rN   remove_kernel_local_buffersz%Scheduler.remove_kernel_local_buffers  sk    & &
xx22&
 

 xx22 		0Gd...&&w/$$W-33E$$$U5UUE~~./&&w/		0	 vm_EF# 
	)Dqxx}}444hhmm33D9c3'CNN9,EK3??KK..t4++//5""4(
	)rP   c                    t         j                  d|       dt        j                  j                  j
                  |<   t        j                  j                  j                  |       y )Nzremove_buffer(%r)r  )r   r  r;   ro   rg   output_buffersrW  r  r  s     rN   r  zScheduler.remove_buffer  sC     			%t,-6$$T*	  $$T*rP   c                R   t         j                  d|       t        j                  j                  j
                  |   j                  }|j                  dd      t        j                  j                  j
                  |<   t        j                  j                  j                  |       y )Nzremoving_inplace_buffer(%r)
in_out_ptrr  )
r   r  r;   ro   rg   rP  
inner_namert  rW  r  )rM   rK   r  s      rN   r  zScheduler.remove_inplace_buffer  sq    		/6XX]]2248CC
.8.@.@)/
%%d+ 	
  $$T*rP   c                    | j                   j                         D ]  }|j                           | j                          y rI   )r\  r~   flushr  )rM   r  s     rN   r  zScheduler.flush  s3    }}++- 	GMMO	rP   c                   t        |t              sJ t        d   dxx   dz  cc<   t        j                  t        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)Ninductorextern_callsr   F)increase_kernel_countztype(node)=)ru   r  r   r;   set_kernel_handlerr#   re  r   rB   r   r  rW   rS  rk   rl   r  )rM   scheduler_noderB   s      rN   codegen_extern_callzScheduler.codegen_extern_call  s    .*CDDD
 	^,1,!!&u"EF 	&002##%	& ""$0B[T$ZM2BB0QWW))*	& 	&s   !C""C+c                6   t        |j                        r|j                  
J | d       t        j                  j                  |       t        |j                        }|t        d|j                         t               s|j                  dk(  r`t        j                  j                  |      x}j                  dk  r2t        d|j                   d|j                   d|j                         t        |j                        rt        d       ||       S )	Nz( should have been normalized in loweringzUnsupported device type: r      zFound z which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability r  zCannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton)r8   rW   rd  r;   rk   add_device_infor"   r  r   r   r   get_device_propertiesmajorrK   minor)rM   r  device_schedulingdevice_propss       rN   create_backendzScheduler.create_backend  s5   v{{#v||'?	?X=>	??	'5fkkB$!:6;;-HII|v%%*ZZ%E%Ef%MM\TTWXX"\../  0j  kw  k}  k}  j~  ~  @L  @R  @R  S  T  $" N  !&&rP   c                x    || j                   vr| j                  |      | j                   |<   | j                   |   S rI   )r\  r  r  s     rN   r'  zScheduler.get_backend  s6    &$($7$7$?DMM&!}}V$$rP   c                    d fd}|j                         D ci c]8  }|j                  *|j                  j                         D ]  } ||      |fd  : }}}t        |j	                               }|rMt        |t        j                  d            \  }}t        j                  j                  j                  |       y y c c}}w )Nc                    | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w rI   )r|  r  r  rk   r  )r  r  rM   s     rN   	get_orderz*Scheduler.enter_context.<locals>.get_order$  s\    ,,,$$++i>V,WdaQT,WX''** -Xs   A+
r   rL  )r  ztorch.fx.Noder   r   )r'  rB   rp  r}   r^  r  r  r  r;   rk   rl   enter_context)rM   rB   r  r  r  rx  r  lasts   `       rN   r   zScheduler.enter_context#  s    	+ ^^%
vv!VV'')	
  q\1t#

 
 w||~&'x':':1'=>GAtGG  ..t4 
s   =Cc                d    t        d      5  | j                         cd d d        S # 1 sw Y   y xY w)NzScheduler.codegen)r   _codegenrL   s    rN   rS  zScheduler.codegen5  s)    -. 	#==?	# 	# 	#s   &/c                
   t         j                  rdd l}t        j                         }t               }t        |      D ]  }|j                  dk(  r/|j                  |j                  j                  j                  k(  r nQ|j                  |j                  f}||vs"J d|j                   d|j                   d       |j                  |        | j                  D ]  }	 t        j!                  d|j#                         |j%                                | j)                  |       t+        |t,              s|j/                         x}r|| j0                  k7  s |j3                         s|j5                         r| j7                          || j0                  k7  r| j0                  rGt9        | j0                  j:                        r(t<        j>                  j@                  jC                          t9        |j:                        rF|jD                  J d       t<        j>                  j@                  jG                  |jD                         || _        | jH                  jK                  |jL                         |j5                         r5|jO                         ^}}	| jQ                        jS                  ||	       n|j3                         r,tU        jV                  tX        |      }| j[                  |       n|j]                         rqtU        jV                  t^        |      }| jQ                        }
d	d
l0m1} d	dl2m3} t+        |
||f      r|
}nti        dt;        |             |jk                  |       nYt+        |tl        tn        f      r!| jQ                        jq                  |       n"t+        |t,              sJ |js                          t         jt                  jv                  r| jQ                        jy                          | jz                  jK                  |j}                                | j~                  jK                  |j                                t+        |t,              r|j/                         }|| jQ                  |      j                         s| j7                           | j0                  rGt9        | j0                  j:                        r(t<        j>                  j@                  jC                          | j7                          y # t&        $ r/}t        j!                  d|j#                                Y d }~4d }~ww xY w)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   CUDACombinedSchedulingSIMDSchedulingztype(self)=)Br   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr  r  rK   filename_dynamoconvert_frame__file__linenor  r  r   r  rU   r  r   r   ru   rY  r3  rz  r<  r:  r  r2   rW   r;   rk   rl   codegen_device_guard_exitrd  codegen_device_guard_enterr{  r  r   r'  r'  codegen_templater  r  r  r  r>  r   codegen.cuda_combined_schedulingr  codegen.simdr
  r  codegen_combo_kernelr  rO  codegen_noder   tritondebug_sync_kernelcodegen_syncr  r,  rZ  r(  ready_to_flush)rM   r   stackr  framerM  rB   r  r  epiloguebackend_r  r
  r  s                 rN   r  zScheduler._codegen9  s   44.++-E5D!%  JJ"22%--*E*E*N*NN~~u||4$ ,U^^,<Aell^ LJ J
  JJ E	!D
		KMMO..0 t$d$:;//+++ d111~~''')JJLT000**/@++000 ,,FFH(5%||7V9VV7,,GGU*0D'%%,,T__=!"&.."2x  (99$I!{{#<dC((."{{#=tD++F3T8h9O(PQ&G(KDJ=)9::,,T2D#5}"EF  (55d;!$(>???}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;*%$*:*:6*B*Q*Q*SJJLKE	!N #4T5H5H5M5M#N GG  ::<

K  		LMMO s   !3T	U$UUc                    |d   j                         }| t        j                  _        || _        | j                  |      }|j                  |      S r  )r3  r;   rk   rA   rz  r'  benchmark_combo_kernel)rM   r  r  r  s       rN   r$  z Scheduler.benchmark_combo_kernel  sK     1((* $""6*--i88rP   c                   t         j                  sy|}|d   j                         }|j                  dk(  ryddlm} dg }}t        |      D ]  \  }}|j                         }	| j                  |	      rt        j                  d       	 | j                  |	      \  }
}t        j                  |
      rt        j                  d|        y	 ||
z  }|j                  |        	 | j                  |      \  }}}||z
  dk  xs |dk  }t        j!                  t"        j$                        rP||kD  s|r%t        j                  dt'        ||z  d             n$t        j                  dt)        ||z  d             ||z
  |k  xs |S # |$ r.}d	t        |      v rt        j                  d
       Y d}~ y d}~ww xY w# |$ r-}d	t        |      v rt        j                  d
       Y d}~y d}~ww xY w)r  Tr   r  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr'  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r$  r3  rW   r6  r  r  r'  r  r  r  r  r:  r;  r   r  r  r  r   r-   r.   )rM   r  subkernel_nodesr  r  r"  
path1_listr  r  r  mspathr  r#  	ms2_clone
path2_listsmall_kernels                    rN   rM  z!Scheduler.speedup_by_combo_kernel  s   
 ,, #..0 ;;%;rZ!/2 	$HAu)I ##I.  R55i@D::b>$$U ! " 2ICd#7	$:
	)-)D)D_)U&CJ Y,9c	""7==1SyL  E#)C2
   Ic	#0
 Y$44M $ *c!f4$$]     	&#a&0  Y 	s<   	?F$F= F:"F54F55F:=G/"G*)G**G/c                p    | j                   |   }|j                  J |j                  j                         S rI   )rr   rB   r[  )rM   r0  r   s      rN   get_buffer_layoutzScheduler.get_buffer_layout  s5    x(xx###xx""$$rP   c                X   | j                   D ]  }|j                         st        |j                         j                        s9|j                  j
                  D ]  }t        j                  j                  j                  |j                        }|s9|j                         sJ|j                         j                  dk(  sht        |j                  t              r|j                         g k(  st        j                  j                  j!                  |j                           y )Nr  )r  r3  r8   rW   r   r   r;   rk   r  r   rK   ru   rY   r+   r   zero_dim_cpu_tensor_listr  )rM   rB   rb  rv  s       rN   rb  z$Scheduler.update_zero_dim_cpu_tensor  s    JJ 	HD VDOO,=,B,B%C ,,22 	HDWW3377		BF"--/"--/44= *6==:K L"OO-388<<TYYG	H	HrP   )r  zList[ir.Operation]r   r   r  r   )r  r   r   r   )rB   r  r   rC   r  )r  rC   r   r  )r   r  r  r  r   zTuple[float, str])r  r  r   r   r  rC   r  rC   r   r   rI   )rT  zOptional[int]r   r   rX  )r  r  r   1List[Tuple[BaseSchedulerNode, BaseSchedulerNode]])r  rC   r  rC   ru  zTuple[str, ...]r   r   )r  r(   r  rC   r  rC   r   r   )rb  r%   r  r&   r   r   )r  rC   r  rC   r   Tuple[bool, bool, int, int])r   r%   r   r   r  rC   r  rC   r   r   )rU  r3  r   r3  )r  z+Tuple[BaseSchedulerNode, BaseSchedulerNode]r   r4  )rK   r   r   r   )r  r  r   r   )r  r  r   BaseScheduling)rB   rC   r   r   r  r  r   zTuple[float, float, str])r  r  r   r   )r0  r   r   z	ir.Layout):rX   r   r   r   r   rQ  r  ry  r  ra  rm  re  rg  rf  r  r  rh  rp  r  ru  r  rq  r  rE  r  rt  r  rH  rI  rp  r}  r  r  r  r  r  r  r  r  rZ  r[  rv  r  r  r  r  r  r  r  r'  r   rS  r  r$  rM  r.  rb  r  r  s   @rN   r@   r@     s   ))_
B47#,"HLC\##J,	 6T(4#&;B0	404	4@@D
P$&P$/@P$	P$d','	 'R..`?. ,. 	:. `,&,/@,	,\$&$/@$	$69 9 !9 *	9
 
9vF:&F:/@F:	F:PrNh*&*/@*	*X

(9
BS
	
J"H
&
/@
	$
0 J&J/@J	J:6 Q6	:6@/@/	$/	8*$+)Z++
'2%
5$#aF949	!9I5V%
HrP   c                      e Zd Zedd       Z	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 ddZ	 	 	 	 	 	 ddZ	ddZ
ddZdd	Zdd
Z	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 ddZy)r6  c                     y)z0Return a set of .codegen.common.BackendFeature()r   r   )r  r  s     rN   get_backend_featuresz#BaseScheduling.get_backend_features	  s     rP   c                    t         )zO
        Check whether node1 and node2 can be vertically fused or not.
        r  r  s      rN   r  z BaseScheduling.can_fuse_vertical  
     "!rP   c                    t         )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r  r  s      rN   r  z"BaseScheduling.can_fuse_horizontal  r<  rP   c                    |j                         s|j                         rt        j                  ||      S t        j                  ||      S )z 
        Fuse two nodes
        )r>  r  r  r  r  s      rN   r  zBaseScheduling.fuse  sA     !1!1!3-225%@@%**5%88rP   c                    t         )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r  )rM   r3  s     rN   r(  zBaseScheduling.group_fn)  r<  rP   c                    t         )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r  )rM   template_nodeepilogue_nodess      rN   r  zBaseScheduling.codegen_template1  s
     "!rP   c                    t         )zD
        Generate a kernel given a list of pre-fused nodes.
        r  r  s     rN   r  zBaseScheduling.codegen_node>  
     "!rP   c                    t         )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r  rL   s    rN   r  zBaseScheduling.codegen_syncD  rD  rP   c                     y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   rL   s    rN   r  zBaseScheduling.ready_to_flushJ  s    
 rP   c                    t         )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r  rL   s    rN   r  zBaseScheduling.flushQ  rD  rP   c                    t         )r  r  rR  s     rN   r  z$BaseScheduling.benchmark_fused_nodesW  
     "!rP   c                     y)z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r  s      rN   r  z'BaseScheduling.get_fusion_pair_priority`  s     rP   c                    t         )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r  r  s     rN   r$  z%BaseScheduling.benchmark_combo_kerneli  rI  rP   N)r  r  r   zSequence[BackendFeature]r2  r  )r3  rq  r   z"Tuple[Tuple[sympy.Expr, ...], ...])rA  rC   rB  r  r   zOptional[str])rB   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   r   r1  r5  r7  )rX   r   r   r  r:  r  r  r  r(  r  r  r  r  r  r  r  r$  r   rP   rN   r6  r6    s     "&"/@"	""&"/@"	"	9&	9/@	9		9"3"	+""(" 4" 
	"""""0"	"&/@	"4"	!"rP   r6  c                   g }| j                         }|t        |t        j                        sJ |r0|j                  $|j                  | j                          d       |S ddlm} ddl	m
} t        | t              r| fn| j                  }|d   j                         }| j                  j                  |      }t        |||f      sJ |t         j"                  j                  _        t&        j(                  }|j+                  |      j-                         }	|t&        _        |j                  | j                          d       |j                  t/        j0                  |	d             |S )Nz" Unfinalized multi template bufferr   r  r   r	  z Triton code:r  )r  ru   r   r  make_kernel_renderr  rU   0torch._inductor.codegen.cuda_combined_schedulingr  r  r
  rO  r  r3  rA   r'  r;   rk   rz  r   generated_kernel_countgenerate_kernel_code_from_nodesstripr  r^   )
rB   rB  multi_templater  r
  r  r  r  old_generated_kernel_counttriton_codes
             rN   r@  r@  s  s3   E++-N!Z@V@V%WWW.;;C((JKL. L+	
 	1&t];$%%'..,,V4'N4J#KLLL+1(
 &-%C%C"==fEKKM)C&(67X__[&9:LrP   )r  r   r   r   )rB   rC   r  r  rr   zDict[str, SchedulerBuffer]r   r   )ru  rC   r   r   )ru  rC   rA   r@   r  r  r   r   )r   )r8  zList[List[int]]r3  zList[sympy.Expr]r>  zTuple[int, ...]r   z	List[int])rB   z(Union[SchedulerNode, FusedSchedulerNode]r   r   ){
__future__r   r  r   r:  r   r  r:  r  r  r  r  r  r  r   r   r   r   r   r	   r
   r   r   r   r   r   r   r\  r   torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   torch.utils._tritonr   r   r   r   r   r   r   	codecacher    codegen.commonr!   r"   r#   comm_analysisr$   r%   r&   r'   r(   r)   r*   r+   	loop_bodyr,   runtime.runtime_utilsr-   r.   r  r/   utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   virtualizedr;   	getLoggerrX   r   _logginggetArtifactLoggerr  r:  	dataclassr?   rC   r  r[   rw   r  opsatenconvolutionmmbmmaddmmr  r  rY  rO  rz  r  r  r  r  rA  rC  rN  r]  r@   r6  r@  r   rP   rN   <module>rn     s   "        	           $ 6 M G / ? * 6 6 ! M M ; : : > >  7 &     g!^^--hA
NN44XO  Q. Q. Q.h^ ^B
 
,  &K
&K4&K ,&K 
	&KV #()).."<"<**))..,,!IINN00	 W 1 W"5. 5R+% R+j " $ 
	,z** z*zw:!3 w:t	?, ?J %'+#++ "+ 	+\ 
 
 
> +9??, GH GHT:h" h"VrP   