
    sgH=                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZ d dlmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ d dl0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8 ddl6m9Z9 ddl:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZE ddlFmGZG ddlHmIZImJZJmKZKmLZL ddlMmNZNmOZOmPZP er
d dlQZQddlRmSZS  eL       j                  ZUeej                  ej:                  eWf   ZXd=dZYd>dZZd?dZ[d@dZ\eeWe]f   Z^eeee]ej4                  f   d f   ee^gee]d f   f   f   Z_	 dA	 	 	 	 	 	 	 	 	 dBd!Z`ej                   G d" d#             Zbd$Zc G d% d&      Zd G d' d(      Zeej                   G d) d*ee             Zfej                   G d+ d,ee             Zgej                   G d- d.ee             Zh G d/ d0ee      Ziej                   G d1 d2ee             Zjej                   G d3 d4ej             Zkej                   G d5 d6ej             Zlej                   G d7 d8ej             Zm G d9 d:ej      ZneWZo G d; d<eI      Zpy)C    )annotationsN)count)
AnyCallableDictIteratorListOptionalSetTupleTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)ConvertIntKeyDivideByKeySymTypes)_get_qualified_name)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)ReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str)V   )maybe_hipify_code_wrapper)CodeGenDeferredLineIndentedBufferPythonPrinter)	config_ofshould_unwrap_unspec_argsignature_to_meta)GraphLoweringc                    | j                         | j                         t        t        j                  j
                  j                  | j                  j                                     fS N)	
get_device	get_dtyper)   r*   graphsizevarssimplifylayoutstorage_size)nodes    R/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyr@   I   sL     	!''""++DKK,D,D,FGH     c                   ddl m}m} t        | j                        }|dk(  r.| j
                  | j
                  j                  rd| dS d| dS ||v r||   }|S |j                         D ]Q  \  }}t        j                  |dz   |      }t        |      dk(  s.|d   }||v sJ d	| d
|        ||   }	| d|	 dc S  t        d|       )Nr+   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorzat::&z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cpprC   rD   repr	real_type
alias_infois_writeitemsrefindalllenAssertionError)
argrC   rD   python_typecpp_typepy_containercpp_containercontainer_matchcontained_typecpp_contained_types
             r?   convert_arg_typer[   T   s   ; s}}%Kh>>%#..*A*A+a((+g..m# - (?'D'D'F <#m**\4F%FT1$,Q/N-/Xl^+F~FVWX/!.~!>#_A&8%9;;< 2;-@
AArA   c                    t        | j                        }ddd}|j                  |d       }|
J d|        |dk(  r| j                  |dz  }|S )Nz
at::Tensorzstd::vector<at::Tensor>)rE   zList[Tensor]zNYI return type: rE   rF   )rJ   rK   getrL   )retrT   python_to_cpprU   s       r?   convert_return_typer`   s   sk    s}}%K1M
   d3HB#4[M!BB h3>>#=COrA   c                   | j                   j                  }| j                   j                  }t        |      }|dkD  sJ d       |dk(  rt	        |d         }n3|dkD  r.dj                  |D cg c]  }t	        |       c}      }d| d}|D cg c]  }t        |       d|j                    }} ddj                  |       d	S c c}w c c}w )
Nr   z#must have at least one return valuer+   , zstd::tuple<rH    ())_schema	argumentsreturnsrQ   r`   joinr[   name)	kernelargsrh   num_returnscpp_return_valuertuple_returnsrS   cpp_arg_types	            r?   get_cpp_op_schemarr      s    >>##Dnn$$Gg,K?AAA?a.wqz:	q		7"Ka#6q#9"KL(q9EIJc',-Qsxxj9JLJq<!8 9;;	 #L Ks   ,C	!C.c                    t               dd	 	 dfd}dd fd}d  } |d| d       r4t        j                  j                  rj                  j                         nt        j                         }j                         5  |5  t        |      dk(  r ||d         \  }}	 |d	| d	|	        nt        |      dkD  sJ t        |      t        |      k(  sJ t               }
t        ||      D ]  \  }}|j                  j                         D  cg c]  \  } }d
|  d|  }} }dj                  |      } ||      \  }}	d| d| }||
v ra|
j                  |        ||d| d|	         d d d        d d d        |j                         fS c c}} w # 1 sw Y   )xY w# 1 sw Y   -xY w)Nc                d    t        | t        j                        r| S t        j                  |       S r6   )
isinstancesympyr   Integer)items    r?   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s#    !$

3tLt9LLrA   c                    t        |       r| | fS t        fd| D              }j                  |      t        j                  j
                  r$j                  t        fd|D                    fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]  } |        y wr6    ).0gry   s     r?   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     C1!4Cs   c              3  T   K   | ]  }j                  |t        |             ! y wr6   generate_example_arg_valuetype)r}   r~   wrappers     r?   r   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s'      GHG66q$q'B   %()callabletuplecodegen_shape_tupler   tritonautotune_at_compile_time)grid
sympy_gridry   r   s     r?   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htn:CdCC
''
3 }}55 '' LV 	
 		
 	
 		
rA   c                    j                  |        rJt        j                  j                  r/j                  vr j
                  j                  |xs |        y y y y r6   )	writeliner   r   r   kernel_autotune_nameskernel_autotune_calls)lineexample_gridrj   outputr   s     r?   r   z3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rA   grid_wrapper_for_zdef z(meta):r+   r   zreturn zmeta['z'] == z and if z	: return )rx   zUnion[int, sympy.Expr]return
sympy.Expr)r   
TritonGridr6   )r   strr   Optional[str])r/   r   r   r   r   indent
contextlibnullcontextrQ   setzipkwargsrN   ri   addgetvalue)rj   configsgridsr   r   r   fn_namekernel_autotune_calls_indentr   r   seencvalguards	statementry   r   s   `  `           @@r?    user_defined_kernel_grid_fn_coder      s    FM

2J J "$(GWIW%& v}}== 	%%,,.##% !
 
 L6 Lu:?!/a!9D,v&',(@Au:>!>u:W---5Dug. LaFGhhnnFVWsF4&se4WW f-%3D%9"l!&4&9	$#)s6()L>%JKLL L$ FOO%%% XL L L Ls8   F6BF*"F$4AF*F6$F**F3	/F66F?c                  (    e Zd ZU ded<   ded<   d Zy)SymbolicCallArgr   innerr   
inner_exprc                ,    t        | j                        S r6   )r   r   selfs    r?   __str__zSymbolicCallArg.__str__   s    4::rA   N)__name__
__module____qualname____annotations__r   r|   rA   r?   r   r      s    JrA   r   i  c                  6     e Zd Z fdZddZddZddZ xZS )MemoryPlanningStatec                l    t         |           t        j                  t              | _        d| _        y Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    r?   r   zMemoryPlanningState.__init__   s/     ##D) 	 12(rA   c                L    t        | j                  j                  |d             S r6   )boolr   r]   )r   keys     r?   __contains__z MemoryPlanningState.__contains__   s    DOO''T233rA   c                \    | j                   |   j                         }|j                  rJ |S r6   )r   pop	is_reusedr   r   rx   s      r?   r   zMemoryPlanningState.pop  s+    s#'')>>!!rA   c                \    |j                   rJ | j                  |   j                  |       y r6   )r   r   appendr   s      r?   pushzMemoryPlanningState.push  s&    >>!!##D)rA   )r   ReuseKeyr   r   )r   r   r   FreeIfNotReusedLine)r   r   rx   r   r   None)r   r   r   r   r   r   r   __classcell__r   s   @r?   r   r      s    24
*rA   r   c                      e Zd Zy)WrapperLineNr   r   r   r|   rA   r?   r   r         rA   r   c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
EnterSubgraphLineWrapperCodeGenr   r4   r9   c                b    | j                   j                  | j                   j                         y r6   )r   push_computed_sizescomputed_sizesr   s    r?   __post_init__zEnterSubgraphLine.__post_init__  s    (()D)DErA   c                n    | j                   j                  | j                         |j                          y r6   )r   push_codegened_graphr9   	do_indentr   codes     r?   codegenzEnterSubgraphLine.codegen  s"    ))$**5rA   Nr   r   r   r/   r   r   r   r   r   r   r   r   r|   rA   r?   r   r     s    FrA   r   c                  (    e Zd ZU ded<   ddZddZy)ExitSubgraphLiner   r   c                V    | j                   j                         | j                   _        y r6   )r   pop_computed_sizesr   r   s    r?   r   zExitSubgraphLine.__post_init__"  s    &*ll&E&E&G#rA   c                X    | j                   j                          |j                          y r6   )r   pop_codegened_graphdo_unindentr   s     r?   r   zExitSubgraphLine.codegen%  s    ((*rA   Nr   r   r   r|   rA   r?   r   r     s    HrA   r   c                  *    e Zd ZU ded<   ded<   ddZy)EnterDeviceContextManagerLineint
device_idxzOptional[int]last_seen_device_guard_indexc                ~   t         j                  j                  r
|j                  d       t         j                  j                  ri| j
                  =t        j                  r|j                  d       y |j                  t        d             y | j
                  | j                  k(  s|J d       | j
                  I|j                  t        j                  rd| j                   dnt        d| j                   d             y |j                  d| j                   d       y y |j                  d	t         j                  j                  j                  | j                         d
       |j                          |j                  t         j                  j                  j                  | j                               y )N
z<AOTICudaStreamGuard stream_guard(stream, this->device_idx_);zcat::cuda::CUDAStreamGuard stream_guard(at::cuda::getStreamFromExternal(stream, this->device_idx_));z4AOTInductor only supports running on one CUDA devicezAOTICudaGuard device_guard(z);z!at::cuda::CUDAGuard device_guard(zdevice_guard.set_index(with :)r*   r9   cpp_wrapperr   aot_moder   r   abi_compatibler,   r   
device_opsdevice_guardr   
set_devicer   s     r?   r   z%EnterDeviceContextManagerLine.codegen/  sd   77NN4 ww 44<,,Z 5!a 99T__LNMNL 44<NN!00 6doo5FbI6??PPRS NN%<T__<MR#PQ M  NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrA   Nr   )r   r   r   r   r   r|   rA   r?   r   r   *  s    O"//'KrA   r   c                      e Zd ZddZy)ExitDeviceContextManagerLinec                Z    t         j                  j                  s|j                          y y r6   )r*   r9   r   r   r   s     r?   r   z$ExitDeviceContextManagerLine.codegenZ  s     ww"" #rA   Nr   )r   r   r   r   r|   rA   r?   r  r  Y  s    rA   r  c                  0    e Zd ZU ded<   ddZddZd	dZy)
MemoryPlanningLiner   r   c                    | S )zFirst pass to find reuser|   r   states     r?   planzMemoryPlanningLine.planc  s    rA   c                     y)zSecond pass to output codeNr|   r   s     r?   r   zMemoryPlanningLine.codegeng  s    rA   c                r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        r   =rd   rb   re   )dataclassesfieldsrj   getattrr   r   r    Bufferget_namer   ri   )r   rl   fieldr   s       r?   r   zMemoryPlanningLine.__str__j  s      ''- 	EzzY&$

+CKK::,a%**		2IsST		 t*%%&a		$'8::rA   Nr  r   r   r  r   r   r   )r   r   r   r   r  r   r   r|   rA   r?   r  r  _  s    );rA   r  c                  (    e Zd ZU ded<   ddZddZy)AllocateLine	ir.Bufferr>   c           	        | j                   j                         t        j                  j                  v rt        | j                        S t        | j                         }t        j                  rG||v rC|j                  |      }d|_        t        | j                  |j                   | j                         S | j                   j                         j                  dk(  rh| j                  j                  | j                         }|A|xj                   t#        t%        j&                  t(        j*                  |d            z  c_        | S )NTcpur+   )r>   r  r*   r9   removed_buffersNullLiner   r@   r   allow_buffer_reuser   r   	ReuseLiner7   r   static_shape_for_buffer_or_noner   r   	functoolsreduceoperatormul)r   r  r   	free_linestatic_shapes        r?   r  zAllocateLine.plan}  s    99177#:#::DLL)) tyy)$$		#I"&IT\\9>>499EE99!&&%/<<GG		RL'11S$$X\\<C6 1 rA   c                    | j                   j                         t        j                  j                  vsJ | j
                  j                  | j                         }|j                  |       y r6   )r>   r  r*   r9   r  r   make_buffer_allocationr   )r   r   r   s      r?   r   zAllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=trA   Nr  r   )r   r   r   r   r  r   r|   rA   r?   r  r  y  s    
O(rA   r  c                  6    e Zd ZU ded<   dZded<   d	dZd
dZy)r   r  r>   Fr   r   c                   t        | j                  j                               dkD  r| S t        | j                  j                  t
        j                        r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r%|j!                  t#        | j                        |        | S r   )rQ   r>   get_inputs_that_alias_outputru   r<   r    MultiOutputLayoutr   r  r*   r9   r  r  r   r   r  r   r@   r  s     r?   r  zFreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<=K>>!!99177#:#::DLL))$$JJ'		2D9rA   c                    | j                   j                         t        j                  j                  vsJ | j
                  s5|j                  | j                  j                  | j                                y y r6   )	r>   r  r*   r9   r  r   r   r   make_buffer_freer   s     r?   r   zFreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rA   Nr  r   )r   r   r   r   r   r  r   r|   rA   r?   r   r     s    
OIt
ErA   r   c                  @    e Zd ZU ded<   ded<   dZded<   d
dZddZy	)r  r  r>   	reused_asTr   
delete_oldc                p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S r6   )r>   r  r*   r9   r  r-  r  r   r  s     r?   r  zReuseLine.plan  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrA   c                p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y r6   )
r>   r  r*   r9   r  r-  r   r   make_buffer_reuser.  r   s     r?   r   zReuseLine.codegen  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rA   Nr  r   )r   r   r   r   r.  r  r   r|   rA   r?   r  r    s!    
OJ
rA   r  c                      e Zd Zy)r  Nr   r|   rA   r?   r  r    r   rA   r  c                      e Zd ZdZ fdZdddZdedZdedZeded       Z	eded       Z
dfdZedgd	       Zded
ZdedZdedZdedZdhdidZd Zd Zd Zd Zd ZdjdZdkdZdedZdldZdmdZdmdZd Zd Z	 	 	 	 	 	 	 	 dndZ	 	 	 	 	 	 dodZ d Z!d Z"	 	 	 	 dp	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dqd Z#d! Z$d" Z%d# Z&d$ Z'd% Z(drd&Z)drd'Z*	 	 	 	 dsd(Z+dtd)Z,d* Z-d+d,dud-Z.dvd.Z/dwd/Z0dxd0Z1dxd1Z2djd2Z3	 dh	 djd3Z4d4 Z5d5 Z6d6 Z7d7 Z8d8 Z9	 dy	 	 	 	 	 dzd9Z:d: Z;dhd{d;Z<d< Z=d= Z>d> Z?d? Z@d@ ZAdA ZBdB ZC	 	 d|	 	 	 	 	 	 	 d}dCZDdD ZEd~dEZFdF ZG	 	 	 	 	 	 	 	 	 	 d	 ddGZHdH ZIdI ZJdJ ZKdhdKZLdL ZMdM ZNddNZOdO ZPddPZQddQZRddRZSdS ZTddTZUdU ZVdhdVZWdW ZXddXZYdY ZZdZ Z[d[ Z\d\ Z]d] Z^d^ Z_e`d_        Zae`d`        Zbe`da        Zce`db        Zde`dc        Ze xZfS )r   zB
    Generate outer wrapper in Python that calls the kernels.
    c                H    t                    t                _        t	                _        t	                _        t	                _        t	                _        t	                _	        t	                _
        t	                _        t                _        i  _        t                _        g  _        d _        d _        d _        d _        d _        d _        d _        d _        d _        d _        d  _        d _        t:         _        i  _        t                _         d  _!        i  _"        t                _#        g  _$        g  _%         jM                           jO                           jQ                          tR        jT                  jV                  sBtR        jT                  jX                  j[                         D ]  \  }} j]                  ||        t                _/        t                _0        i  _1         te        jf                  d        jh                         _4        te        jf                  d       d fd	       }| _5        i  _6        t                _7        tq                _9        tu        tv        jx                  jz                  
       _>        y )N []#r   zsize()zstride()Tc                    j                   j                  |        t        j                  j                  rj
                  j                  |        y y r6   )importsr   r   r   r   r   )r   r   s    r?   add_import_oncez0WrapperCodeGen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rA   )debug_printer_level)r   r   r   r   )?r   r   r   _names_iterr/   r:  headerprefixsuffixwrapper_callkernel_autotune_defsr   r   r   src_to_kernelkernel_numel_exprlinesdeclaredeclare_maybe_referenceendingopen_bracketclosed_bracketcomment	namespacenone_strsizestrider   supports_intermediate_hookspexprexpr_printeruser_defined_kernel_cacheunbacked_symbol_declsallow_stack_allocationstack_allocated_buffersr   codegened_graph_stackcomputed_sizes_stackwrite_headerwrite_prefix!write_kernel_autotune_defs_headerr*   r9   r   constant_reprsrN   write_constant	allocatedfreedreusesr  	lru_cachewrite_get_raw_streamr;  _metas
_meta_varsr   multi_kernel_stater   r   aot_inductor debug_intermediate_value_printerdebug_printer)r   rj   hashedr;  r   s   `   r?   r   zWrapperCodeGen.__init__  s.   */'%'$&$&$&*,$2$4!%3%5"/2u" .0ADCE
')$!	 ;?)+/(27QS&/2u"6:#DF$14 &("$&!..0ww ! 6 6 < < > 2f##D&12 +.%&)e
 57$=I$7$7$=%%%
! 
		T	"	; 
#	;
  /&($'E"2"4 1 & 3 3 T T
rA   c                D    | j                   j                  | d|        y )Nz = None  # )r>  r   )r   rj   ri  s      r?   r]  zWrapperCodeGen.write_constant  s    k&:;rA   c                0   t         j                  j                  j                         }d}||j                  d|j                   }| j
                  j                  d| dt        j                   dd       | j                  j                  dd       y )	Nr5  z
# AOT ID: z
                a!  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from z import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                from torch._inductor.codegen.multi_kernel import MultiKernelCall
            Tstripa  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )
torch_guardsTracingContexttry_getaot_graph_namer:  splicer   r   r>  )r   contextaot_config_comments      r?   rY  zWrapperCodeGen.write_header  s    --..6687#9#9#E#-g.D.D-E!F#$ % $,,- ." % 	 	
( 	  	 	
rA   c                ^    | j                   j                  dt        j                   d       y )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from z import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
            )rB  rs  r   r   r   s    r?   r[  z0WrapperCodeGen.write_kernel_autotune_defs_headerH  s3    !!((
 $,,- .		
rA   c                    dt         j                   d}| j                  j                  |d       t        j
                  j                  r| j                  j                  |       | j                          y )NzU
            import triton
            import triton.language as tl
            from zV import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph
            Trl  )	r#   r   r:  rs  r   r   r   r    write_get_raw_stream_header_once)r   
import_strs     r?   write_triton_header_oncez'WrapperCodeGen.write_triton_header_onceV  sg     $,,- .

 	Jd3==11&&--j9--/rA   c                B   | j                   j                  t        j                  j                  j                  d             t        j                  j                  rC| j                  j                  t        j                  j                  j                  d             y y )Nget_raw_stream)
r:  r   r*   r9   r   import_get_raw_stream_asr   r   r   r   r   s    r?   rx  z/WrapperCodeGen.write_get_raw_stream_header_onceb  sn    GG778HI	
 ==11&&00"";;<LM 2rA   c                   t        |      }|| j                  vrdt        | j                         }|| j                  |<   | j                  j	                  | d|        t
        j                  j                  r;| j                  j	                  | d|        | j                  j                  |       | j                  |   S )Nmeta = )rJ   rc  rQ   r>  r   r   r   r   r   rd  r   )r   r  vars      r?   add_meta_oncezWrapperCodeGen.add_meta_oncel  s    Dzt{{"T[[)*+C #DKKKK!!SETF"34}}55**44uCv5FG##C({{4  rA   c                    t         j                  j                  D cg c]  }|j                  | j                         c}S c c}w r6   )r*   r9   graph_outputscodegen_referencerA  r   xs     r?   get_output_refszWrapperCodeGen.get_output_refsw  s1    @A@U@UV1##D$5$56VVVs   "Ac                     y r6   r|   r   s    r?   mark_output_typezWrapperCodeGen.mark_output_type{      rA   c           
        t         j                  j                  j                         D ]  \  }}t	        |t
        j                        r!t        |j                               dk(  r>| j                  |j                               }| j                  |j                               }| j                  j                  d| d| d| d        y )Nr   zassert_size_stride(rb   re   )r*   r9   graph_inputsrN   ru   rv   r   r(   get_sizer   
get_strider?  r   )r   rj   bufrN  rO  s        r?   codegen_input_size_assertsz)WrapperCodeGen.codegen_input_size_asserts~  s    --335 		SID##uzz* S\\^,1++CLLN;D--cnn.>?FKK!!$7vRvRxq"QR		SrA   c                T   | j                   j                  d       t        j                  j                  j                         D ]b  \  }}t        |t        j                        r!d| d}| j                   j                  |       d| d}| j                   j                  |       d y )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r?  r   r*   r9   r  rN   ru   rv   r   )r   rj   r  r   s       r?   codegen_input_nan_assertsz(WrapperCodeGen.codegen_input_nan_asserts  s    HI--335 	(ID##uzz* &;<DKK!!$' &;<DKK!!$'	(rA   c                   | j                   j                  d       | j                   j                         5  t        j                  j
                  rA| j                   j                  t        j                  j                  j                                t        j                  j                  rdj                  t        j                  j                        }t        t        j                  j                        dk(  r|dz  }| j                   j                  | d       | j                   j                  d       | j                  | j                   t        j                  j                         t        j                   r| j#                          t        j$                  r| j'                          d d d        y # 1 sw Y   y xY w)Nzs

            async_compile.wait(globals())
            del async_compile

            def call(args):
            rb   r+   ,z = argszargs.clear())r?  rs  r   r   r   debug_sync_graphr   r*   r9   r   synchronizer  ri   graph_input_namesrQ   codegen_inputssize_assertsr  nan_assertsr  )r   lhss     r?   rZ  zWrapperCodeGen.write_prefix  s)   	
 [[! 	1}}--%%agg&8&8&D&D&FGww##ii 9 9:qww001Q63JC%%Wo6%%n5QWW-A-AB""//1!!..0	1 	1 	1s   E7F66F?c                ^    | j                          d| }| j                  | d| d       |S )Nstream = get_raw_stream(re   )rx  r   )r   r   r9   rj   s       r?   rb  z#WrapperCodeGen.write_get_raw_stream  s9    --/
|$$1*Q?@rA   c                     | j                   d   S )N)rW  r   s    r?   get_codegened_graphz"WrapperCodeGen.get_codegened_graph  s    ))"--rA   c                :    | j                   j                  |       y r6   )rW  r   )r   r9   s     r?   r   z#WrapperCodeGen.push_codegened_graph  s    ""))%0rA   c                6    | j                   j                         S r6   )rW  r   r   s    r?   r   z"WrapperCodeGen.pop_codegened_graph  s    ))--//rA   c                P    ddl m} | j                  j                   ||            S )Nr   )deepcopy)copyr  rX  r   )r   r   r  s      r?   r   z"WrapperCodeGen.push_computed_sizes  s!    !((//0HIIrA   c                6    | j                   j                         S r6   )rX  r   r   s    r?   r   z!WrapperCodeGen.pop_computed_sizes  s    ((,,..rA   c                .    t        | j                         S r6   )nextr=  r   s    r?   next_kernel_suffixz!WrapperCodeGen.next_kernel_suffix  s    t''()*rA   c                8   | j                  t        || j                               t        j                  j
                  r| j                          | j                  j                  dt        j                  j                  j                  |       d       | j                  j                          | j                  j                  t        j                  j                  j                  |             | j                  j                  d| d| d       || _        y )Nr   r   r  r  re   )r   r   r   r   r   r   rz  r   r*   r9   r   r   r   r   )r   r   s     r?   codegen_device_guard_enterz)WrapperCodeGen.codegen_device_guard_enter  s    )*d6W6WX	
 ==11))+&&00**77
CDAF &&002&&00""--j9 &&00$6zl!D -7)rA   c                    | j                  t                      t        j                  j                  r| j
                  j                          y y r6   )r   r  r   r   r   r   r   r   s    r?   codegen_device_guard_exitz(WrapperCodeGen.codegen_device_guard_exit  s6    356==11&&224 2rA   c                    |r1| j                   j                  ddj                  |      z   dz          y | j                   j                  d       y )Nzreturn (rb   , )z	return ())rA  r   ri   )r   output_refss     r?   generate_returnzWrapperCodeGen.generate_return  s@    ''
TYY{5K(Ke(ST''4rA   c                     y r6   r|   r   results     r?   generate_before_suffixz%WrapperCodeGen.generate_before_suffix  r  rA   c                     y r6   r|   r  s     r?   generate_endzWrapperCodeGen.generate_end  r  rA   c                (    | j                  ||       y r6   )generate_extern_kernel_alloc)r   fallback_kernelrl   s      r?   generate_fallback_kernelz'WrapperCodeGen.generate_fallback_kernel  s    ))/4@rA   c           
        t        |j                  t        j                        }|j	                         }|j                         }|j                         }| j                  }t        j                  r	d|v rd| }|r5| j                  | j                   | ddj                  |       d|        y | j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                   d| d       y y y y )Nview_as_complexz.clone()rd   rb   re   r  inductorintermediate_hooksr+   zrun_intermediate_hooks()ru   r<   r    
NoneLayoutr  get_origin_nodeget_kernel_namerH  r   memory_planningr   rF  ri   rP  generate_intermediate_hooksr   rj   )r   extern_kernelrl   	no_returnoutput_nameorigin_nodekernel_namerH  s           r?   r  z+WrapperCodeGen.generate_extern_kernel_alloc  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rA   c                    |j                  d|r|n|        | j                  | ddj                  |       d       y )Nzout=rd   rb   re   )r   r   ri   )r   rk   outout_viewrl   s        r?   generate_extern_kernel_outz)WrapperCodeGen.generate_extern_kernel_out  s>     	dx8S9:;&499T?"3156rA   c                Z   t        ||||       \  }}|j                  d      D ]  }	| j                  |	        |D 
cg c]  }
| j                  |
       }}
|D cg c])  }t	        |d      r|j                         n
t        |      + }}| j                  |||||       y c c}
w c c}w )N)r   r   r8   )grid_fn	arg_typesraw_args)r   splitr   val_to_arg_strhasattrr8   r   generate_kernel_call)r   r  r  r   r   triton_meta
constexprsr  r   r   vrl   rS   r  s                 r?   #generate_user_defined_triton_kernelz2WrapperCodeGen.generate_user_defined_triton_kernel  s     9$

 JJt$ 	!DNN4 	! 1991##A&99  
  'sK8CMMOd3iG
	 
 	!!w)h 	" 	
 :
s   B#.B(c                    | ddj                  t        t        |             }|j                  d      r|dj                  dg|z         z  }n|r|dt	        |       z  }|dz  }| j                  |       y )Nrd   r  zaten.scatter_reducerb   r5  z	, reduce=re   )ri   mapr   
startswithrJ   r   )	r   r   inputscpp_kernel_namepython_kernel_namesrc_is_tensorr  r   r   s	            r?   generate_scatter_fallbackz(WrapperCodeGen.generate_scatter_fallback0  s{     %%QsxxC0@'A&BC(()>?DIIrdVm,,D)DL>22trA   c                    | j                    dj                  |       | j                   }||||g}| j                  | j	                  ||             y )Nrb   )rI  ri   rJ  r   wrap_kernel_call)r   rk   r  indicesvalues
accumulateindices_strrl   s           r?   generate_index_put_fallbackz*WrapperCodeGen.generate_index_put_fallbackC  sS    **+DIIg,>+?@S@S?TU;
3t,,VT:;rA   c           	     V    | j                  | d| ddj                  |       d       y )Nr  rd   rb   re   )r   ri   )r   buf_namer  r  codegen_argscpp_op_schemacpp_kernel_keycpp_kernel_overload_nameop_overloadr  outputss              r?   6generate_extern_kernel_alloc_and_find_schema_if_neededzEWrapperCodeGen.generate_extern_kernel_alloc_and_find_schema_if_neededH  s0     	(3'9&:!DIIl<S;TTUVWrA   c                f    t        d      5  | j                  |      cd d d        S # 1 sw Y   y xY w)NzWrapperCodeGen.generate)r   	_generate)r   is_inferences     r?   generatezWrapperCodeGen.generateW  s,    34 	0>>,/	0 	0 	0s   '0c                   t         j                  r| j                          t               }|j	                  | j
                         |j                  d       |j	                  | j                         t        j                  j                  r>t        j                  j                  r$t        j                  j                  r
t               }t        j                         5 }|j                  | j                   j#                                t         j$                  r| j'                  |       t         j                  r| j)                          |r(t         j*                  r| j-                          d| _        n| j1                          t         j2                  j4                  r| j7                          | j8                  D ]I  }t;        |t<              r|j?                  | j                          /| j                   j                  |       K | jA                         }| jC                          t         j2                  jD                  rA| j                   j                  t        j                  jF                  jI                                t         j                  r| jK                          t         j2                  j4                  r| jM                          t         j2                  jN                  r| jQ                          | jS                  |       d d d        | jU                          |j	                  | jV                         |j#                         5  |j	                  | j                          d d d        | jY                  |       |j	                  | jZ                         | j]                  |       | j_                  |       |ja                         S # 1 sw Y   xY w# 1 sw Y   sxY w)Nr5  F)1r   profile_bandwidthrz  r/   rs  r:  r   r>  r*   r9   r   r   is_const_graphr   	ExitStackenter_contextrA  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphr  memory_planrU  memory_plan_reuser   store_cubin!generate_reset_kernel_saved_flagsrE  ru   r   r   r  r  r  r   r  generate_end_graph generate_save_uncompiled_kernelsr   generate_and_run_autotune_blockr  finalize_prefixr?  r  r@  r  add_benchmark_harnessgetvaluewithlinemap)r   r  r  stackr   r  s         r?   r  zWrapperCodeGen._generate[  s   ##))+!dll#dkk" 77 3 38N8N#%F!!# &	.u 1 1 8 8 :;0088?''))+  6 6  ".3+&&(}}((668

 6dK0LL!2!23%%//5	6 ..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((557}}55446  -M&	.P 	dkk"]]_ 	-MM$++,	- 	##F+dkk"&!""6*))++k&	. &	.V	- 	-s   H'O
O
OOc                   | j                   j                  d       i }| j                   j                         | j                  j                         z   }t        j
                  t        j                  k(  rlt        j                  t               dd      5 }|j                  |j                  d             |j                  }ddd       t	        j                  d|       t        ||       y# 1 sw Y   -xY w)z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        z.pyF)dirr@  deletezutf-8NzB
Compile-time auto-tuning code: 
%s
Auto-tuning code written to %s)rB  rs  r   r   r!   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoderj   debugexec)r   scopetuning_codef	file_paths        r?   r  z.WrapperCodeGen.generate_and_run_autotune_block  s    
 	!!((	
 %%..043M3M3V3V3XX 	   GMM1 ,,Ke #**734FF		#
 !!W 	[% # #s   -C--C6c                \    ddl m}  ||       j                  | j                        | _        y )Nr+   )MemoryPlanner)r  r  r  rE  )r   r  s     r?   r   zWrapperCodeGen.memory_plan  s     2"4(--djj9
rA   c                &   t         j                  j                         }| j                  rt	        | j                  d   t
              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt	        | j                  d   t
              r&| j                  d   j                  j                  |vrit               g}g }t        t        | j                              D ]  }| j                  |   }t	        |t
              r"|j                  |d         | j                  |<   Dt	        |t              r|j                  t                      nt	        |t              s|j                  |j                                 |j                  |j                                t        |      dk(  sJ t!        d |D              }| j"                  duxr t$        j"                  xr	 |t&        k  | _        y )Nr  r   c              3  4   K   | ]  }|j                     y wr6   )r   )r}   ss     r?   r   z3WrapperCodeGen.memory_plan_reuse.<locals>.<genexpr>  s      *
./A))*
s   F)r*   r9   get_output_namesrE  ru   r  r>   rj   r   r   rangerQ   r  r   r   r   sumrU  r   MAX_STACK_ALLOCATION_SIZE)r   	out_namesplanning_statespast_planning_statesir   r   s          r?   r  z WrapperCodeGen.memory_plan_reuse  s   GG,,.	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 /01!s4::' 	CA::a=D$ 23 $		/"*= >

1D"34&&':'<=D"23$++O,?,?,AB	C 	##O$7$7$9:?#q((( '* *
3G*
 '
#
 ''u4 I--I+/HH 	#rA   c           	     z    |j                  | j                   | d| d| j                   | j                          y )Nz_size = .)r   rF  rN  rH  r   r   rj   s      r?   codegen_input_size_var_declz*WrapperCodeGen.codegen_input_size_var_decl  s3    $,,vXdV1TYYK}UVrA   c           	     z    |j                  | j                   | d| d| j                   | j                          y )Nz
_stride = r)  )r   rF  rO  rH  r*  s      r?   codegen_input_stride_var_declz,WrapperCodeGen.codegen_input_stride_var_decl  s6    ||nTF*TF!DKK=N	
rA   c                X    t        j                  d       fd       }t        j                  d       fd       }t               }d t        t	        |j                                     }t        t	        fd|j                                     }|D ]d  \  }}	t        |	t        j                        s!|	|vs&j                   j                   |	 d|  j                          |j                  |	       f |D ]  \  }}
|
j                         }t        |      D ]n  \  }}	t        |	t        j                        s!|	|vs&j                   j                   |	 d ||       d| d j                          |j                  |	       p  |D ]  \  }}
|
j                         }t        |      D ]n  \  }}	t        |	t        j                        s!|	|vs&j                   j                   |	 d ||       d| d j                          |j                  |	       p  y)	z$Assign all symbolic shapes to localsNc                2    j                  |        |  dS )N_size)r+  rj   r   r   s    r?   sizeofz-WrapperCodeGen.codegen_inputs.<locals>.sizeof  s    ,,T48V5>!rA   c                2    j                  |        |  dS )N_stride)r-  r1  s    r?   strideofz/WrapperCodeGen.codegen_inputs.<locals>.strideof  s     ..tT:V7##rA   c                <    t        | d   t        j                        S )Nr+   )ru   rv   r   )r  s    r?   is_exprz.WrapperCodeGen.codegen_inputs.<locals>.is_expr  s    adEJJ//rA   c                     |        S r6   r|   )r  r7  s    r?   <lambda>z/WrapperCodeGen.codegen_inputs.<locals>.<lambda>  s    ^ rA   r  r6  r7  )r  ra  r   r   filterrN   ru   rv   Symbolr   rF  rH  r   r  	enumerater  )r   r   r  r2  r5  
bound_varsgraph_inputs_exprgraph_inputs_tensorsrj   shapevalueshapesdimr7  s   ``           @r?   r  zWrapperCodeGen.codegen_inputs  s   
 
		T	"	" 
#	" 
		T	"	$ 
#	$
 ),
	0 !1C1C1E!FG#+\-?-?-AB 
 - 	&KD%%.5
3J$,,wc$}MNu%	&
 0 	*KD%^^%F'/ *
UeU\\2uJ7NNN<<.s6$<.#a}U NN5)*	* 0 	*KD%%%'F'/ *
UeU\\2uJ7NNN<<.s8D>2B!C5$++W NN5)*	*rA   c                   t        |t        j                        rt        |t        j
                        r|| j                  v ry | j                  j                  |       t        j                  j                  j                  |   }| j                  | j                   | d| j                  |       | j                          y y y Nr  )ru   rv   r;  r   r   PRECOMPUTED_SIZEr   r   r*   r9   r:   inv_precomputed_replacementsr   rF  rR  rH  )r   symexprs      r?   ensure_size_computedz#WrapperCodeGen.ensure_size_computed!  s    c5<<(^CAVAV-Wd)))##C(77##@@EDNN<<.S):):4)@(A$++O .X(rA   c                     y r6   r|   r   s    r?   r  zWrapperCodeGen.finalize_prefix+  s    rA   Tr;   c                   t        ||      S )NrL  )rQ  )r   r  r;   s      r?   codegen_python_sizevarz%WrapperCodeGen.codegen_python_sizevar.  s    Q**rA   c                $    | j                  |      S r6   )rN  r  s     r?   codegen_sizevarzWrapperCodeGen.codegen_sizevar1  s    **1--rA   c                    | d| dS )Nr6  r7  r|   )r   basenamerj   indexs       r?   codegen_tuple_accessz#WrapperCodeGen.codegen_tuple_access4  s    1UG1%%rA   c                    t        t        | j                  |            }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj	                  |       dS )Nr   z()r+   rd   r  rb   re   )r   r  rN  rQ   ri   )r   r@  partss      r?   codegen_python_shape_tuplez)WrapperCodeGen.codegen_python_shape_tuple7  s^    S44e<=u:?u:?uQxj$$499U#$A&&rA   c                $    | j                  |      S r6   )rW  )r   r@  s     r?   r   z"WrapperCodeGen.codegen_shape_tuple?  s    ..u55rA   c                    dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            S )Nzalloc_from_pool({})rb   )formatri   rQ  r   r   )r   rj   offsetr   r@  rO  s         r?   codegen_alloc_from_poolz&WrapperCodeGen.codegen_alloc_from_poolB  sS    $++II&MJ,,U3,,V4

 
	
rA   c                   ||j                   j                  k(  rk||j                   j                  k(  rR||j                   j                  k(  r9|&||j                  k7  rd|j                          d| dS |j                          S | j                  |      }| j                  |      }| j                  |      }|/||j                  k7  r d|j                          d| d| d| d| dS d|j                          d| d| d| d	S )Nzaten.view.dtype(rb   re   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r<   rN  rO  r[  r   r  r   rP  )r   datarN  rO  r[  writerr   s          r?   codegen_reinterpret_viewz'WrapperCodeGen.codegen_reinterpret_viewO  s    DKK$$$$++,,,$++,,, Udjj%8)$--/):"UG1EE--/*+++D1D--f5F))&1F Udjj%8<T]]_<MRPTvUWX^W__abhaiilmrlsstuu *$--/):"TF"VHBvhVWXrA   c                2    | j                  | d| d       y )Nz.copy_(re   r   )r   srcdsts      r?   codegen_device_copyz"WrapperCodeGen.codegen_device_copyf  s    #gcU!,-rA   c                `    | j                  | j                   | d| | j                          y rE  )r   rF  rH  )r   rj   rA  s      r?   codegen_multi_outputz#WrapperCodeGen.codegen_multi_outputi  s)    $,,vS}EFrA   c                   d |j                   D        \  }t        |j                        dk(  r#| j                  |j                   d| d       nkt        |j                        dk(  r@t        |j                  d   t              r#| j                  |j                   d| d       nt        |j                        dk(  rt        |j                  d   t              r| j                  |j                   d| d       | j                  d	|j                   d
|j                  d   j                   d|j                   d|j                  d   j                   d	       | j                  |j                   d|j                   d|j                  d   j                          nt        d|j                         | j                  |j                          d       y )Nc              3  <   K   | ]  }|j                           y wr6   )r  )r}   ts     r?   r   z8WrapperCodeGen.codegen_dynamic_scalar.<locals>.<genexpr>m  s     >Q1&&(>s   r   r  .item()r+   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // zunrecognized keypath z = None)r  rQ   keypathr   rH  ru   r   r   divisorrR   r  )r   r>   r^  s      r?   codegen_dynamic_scalarz%WrapperCodeGen.codegen_dynamic_scalarl  s   >$++>t||!NNdhhZs4&89!#
4<<?M(RNNdhhZxv^DE!#
4<<?K(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rA   c           
          fd}fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        t        j                  j                        d	kD  r^j                  d
       t        j                  j                  j                         D ]"  \  }}j                  d|         |||       $ t        j                  j                  j                         D ]d  \  }}t        |t         j"                        rCt        t        j                  j$                  j&                  j)                  |d       t*              rdt        |t         j,                        r3 ||t        j                  j$                  j/                  |d             |j1                         D cg c]-  }t        j                  j$                  j/                  |d      / }	}|j3                         D cg c]-  }t        j                  j$                  j/                  |d      / }
} |||	|
|j5                         |j7                                g ddj9                  t        j                  j                  j;                                d}j                  d|        j                  d       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nc                    j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(rb   
, device='	', dtype=re   )r   rW  )rj   r@  rO  devicer   r   r   s        r?   add_fake_inputz@WrapperCodeGen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rA   c                2    j                  |  d|        y rE  rb  )rj   r   r   s     r?   add_expr_inputz@WrapperCodeGen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rA   c                Z    dd l }j                  |  d|j                  |      d       y )Nr   z = pickle.loads(re   )pickler   dumps)rj   rA  ry  r   s      r?   add_torchbind_inputzEWrapperCodeGen.benchmark_compiled_module.<locals>.add_torchbind_input  s,    v%5fll56I5LANOrA   )r5  r5  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Trl  zglobal r   zimport pickle*   fallbackzcall([rb   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))
writelinesr   rs  r*   r9   	constantsrN   r   rN  rO  rt  r   rQ   torchbind_constantsr  ru   rv   r;  r:   
var_to_valr]   r   r   	size_hintr  r  r7   r8   ri   keys)r   r   ru  rw  r{  rj   rA  torchbind_objr  r@  rO  call_strs   ``          r?   benchmark_compiled_modulez(WrapperCodeGen.benchmark_compiled_module  s   		0	P
 	K	
 ]]_ :	YMM     !ww00668 e   74&!12%**,ekk	 177../!3  1+,77+F+F+L+L+N ='D- $$wtf%56'm<	=  !ww3399; eeU\\2zGG$$//33E4@,8 eUZZ0
 #4)9)9)C)CETV)C)WX "'!1 ((221r2BE  "'!1!1!3 ((221r2BF  #((*)/>  		!''*>*>*C*C*E FGrJH}XJ78WXu:	Y :	YPY:	Y :	Ys+   HM92M+M?2M1BM
MMc                    t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  ddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)r5  r5  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessr  r  r   r&   )r   r   s     r?   r  z$WrapperCodeGen.add_benchmark_harness  ss     ''&&v.@A]]_ 	X,-?-A,BB_`	 	 	s    A//A8c                    |r| dnd}d| | d| }| j                   j                  |       t        j                  j                  r| j
                  j                  |       y y )Nr   r5  z

r  )r>  rs  r   r   r   rB  )r   rj   rk   metadatacudametadata_commentbodys          r?   define_kernelzWrapperCodeGen.define_kernel  sd     /7hZr?B&'vS94 ==11%%,,T2 2rA   c           	     >	   ! ddl m}  |        |j                  }ddlm}m}m} g }	i }
g }g }t        |j                        D ]m  \  }}||vr||   }||j                  v r||
|<   %|j                  |       t        |t        j                        r7|	j                   |||j                         |j                                      t        |t        j                         rV|	j                   |||j"                  j                         |j                         |j$                  j&                               |	j                   |||             t        |t(        t*        j,                  f      s1t.        j0                  j2                  j5                  |d      s]|j                  |       p d}t7        |	||      t9        j:                  t.        j0                  j<                  j?                               i |
t@        jC                  |d      tE        |	|	      gd
}tG        |jH                        g}tK        |      dkD  rQ|jM                         D ]>  }t        |t        j                  t        j                   f      r.|j                  |       @ |j                  tO        |             tQ        |      }|| jR                  v r| jR                  |   S | dtK        | jR                         }||f| jR                  |<   tU               jW                  d|d       ddl,m-}m.} j_                   |              d|i|ja                         }|D cg c]&  }|jb                  |jd                  |jf                  d( }}j_                  d|d|d|d       j_                  |jh                  d       ddl,m5 ddl6m7 |h  !fd! !|       t.        j0                  j<                  j?                         }jW                  d|jp                   d       ts        jt                  |jH                        \  }}ts        jv                  |jH                        }d| d| }| jy                  |j{                         |       ||fS c c}w )Nr   )patch_triton_dtype_reprr+   )KernelArgTypeSizeArg	TensorArg)rj   bufferr   )rj   r  r   r[  ztl.int32)
size_dtyper  )r  )	signaturert  r  r   _zasync_compile.triton(z, ''')gen_common_triton_importsTritonKernelr  )r   	num_warps
num_stageszG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            Trl  )JITFunction)	constexprc           	        t        j                  | j                        D ch c]  }|j                  dk(  r|j                   }}| j                  j
                  j                  di       }| j                  j                  j                  D ]  }|v r	|| j                  j
                  v s"| j                  j
                  |   }t        |
      rXj                          j                  d       j                  |j                  d       j                  |        |       t        |t        t         t"        f      rj                          t        |      rd|j$                  d}n|}|j                  |      x}rMd}t        |t&              rd	|j(                   d
|j*                   }	nd	|}	j                  | |	 d|        nj                  | d|       j                  |       t||v sz|dk7  st-        |d      s|j(                  j/                  d      sj                  d|j(                   d|j*                   d|        j                  |        y c c}w )NLOAD_GLOBALr   z@triton.jitTrl  ztl.constexpr(re   r5  : r)  r  tlr   r   zfrom z import z as )disBytecodefnopnameargval__globals__r]   __code__co_namesru   newliner   rs  rc  r   r   r   r   rA  r   r   r   r  r  )
cur_kernelinstunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotion_codeannotation_coder  compile_wrapperr  symbols_includedtraverses             r?   r  zBWrapperCodeGen.define_user_defined_triton_kernel.<locals>.traverseu  sa     LL7!;;-/ ! !
 ",!:!:!>!>?PRT!U)}}55>> .:"22*--";";;']]66{CF!&+6'//1'11-@'..vzz.F(,,[9 (#FS#tY,GH'//1%fi8+88H)JJ,2:J);)?)?)LL:L,.M)*d;&()>)>(?qATAT@U$V !0 57zn2E+55#.-/@J< P ,55S
6ST(,,[9#'88'4/#FL9 #--88B
 (11#F$5$5#6hv>OtT_S`a ),,[9].:!s   "I<z''', device_str='z')z# Original path: r   )>torch.utils._tritonr  r   commonr  r  r  r<  	arg_namesr  r   ru   r    r  r  r8   r"   r^  r<   r[  r   rv   rw   r*   r9   r:   statically_known_equalsr3   r$   create	schedulerget_current_device_or_throwdictfromkeysr1   idr  rQ   r  r   r   rS  r/   r   r   r  r  rs  inductor_meta_commonr   r  r  rc  r  triton.languager  r   inspectgetsourcelinesgetsourcefiler  r   )"r   rk   r   r   r  original_namer  r  r  r  r  non_constant_indicesequal_to_1_arg_idxidxr   rS   index_dtyper  	cache_keyrj   r  r  inductor_metar   current_devicer  linenosrcfiler  r  r  r  r  r  s"                                @@@@@r?   !define_user_defined_triton_kernelz0WrapperCodeGen.define_user_defined_triton_kernel  se   ?!==)+	$&	!(*!&"2"23 #	7HC& +Cf'''!$	#$++C0c299-$$!!$#&<<>"%--/  R%7%78 $$!!$#&88#4#4#6"%--/#&::#4#4	 $$WS#%67!c5==1''**BBQ +11#6G#	7H !*&,
 '--!!==?-- 2A6
 0)
: !#699	w<!}} *!#		23E3E'FG$$S)* 	[)*)$	66611)<<#d&D&D"E!FG59;4G&&y1(*!!$9-9J%"PQC8:; 4
//1
 "
 	 !--#--$//
 
 	  $,/ 0(O ,			
 	vzz6 	'- *?9	: 9	:v 	**FFH!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	

 [  K
s    +Rc                   | d|j                    d}||d| z  }|t        j                  f| j                  vrs| j                  j	                  |t        j                  f       | j                  | j                   | d| j                  |j                         | j                          n;| j                  | d| j                  |j                         | j                          t        ||j                        S )Nr  numelr  )r?  r*   r9   rD  r   r   rF  rR  r  rH  r   )r   r  treer@  rI  s        r?   generate_numel_exprz"WrapperCodeGen.generate_numel_expr  s    a}E2axL D!''?$"8"88""&&agg7NN<<.c$*;*;DJJ*G)HV NNdV3t'8'8'D&Edkk]ST tTZZ00rA   c                    | j                  d|t        j                  |fd      }| j                  |       |r| j                  d| j                          y y )N	workspace)r+   )r@  rO  zworkspace.zero_())make_allocationrn  uint8r   rH  )r   nbytesrt  	zero_fillr   s        r?   generate_workspace_allocationz,WrapperCodeGen.generate_workspace_allocation  sV    ##VId $ 
 	tNN.t{{m<= rA   c                H    | ddj                  |       d| j                   S )Nrd   rb   re   )ri   rH  )r   rj   	call_argss      r?   r  zWrapperCodeGen.wrap_kernel_call  s'    q9-.a}==rA   c                    | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)rA  r   r*   r9   graph_idr  r   )r   r
  s     r?   r  z2WrapperCodeGen.generate_profiler_mark_wrapper_call  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rA   c                :    | j                   j                  d       y )Nzstart_graph())rA  r   r   s    r?   r  z#WrapperCodeGen.generate_start_graph  s    ##O4rA   c                ^    | j                   j                  dt        j                  d       y )Nz
end_graph(re   )rA  r   r   profile_bandwidth_outputr   s    r?   r  z!WrapperCodeGen.generate_end_graph  s'    ##j1P1P0SST$UVrA   c                ^    | j                   j                  dt        j                   d       y )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            rA  rs  r#   r   r   s    r?   r  z0WrapperCodeGen.generate_reset_kernel_saved_flags  s2      ''8'A'A&B C	
rA   c                ^    | j                   j                  dt        j                   d       y)a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   s    r?   r  z/WrapperCodeGen.generate_save_uncompiled_kernels  s4     	  ''8'A'A&B 	C	
rA   c                    |S r6   r|   )r   r  r   r  grid_callablegrid_extra_kwagss         r?   generate_default_gridz$WrapperCodeGen.generate_default_grid  s	     rA   c                      fd}|D cg c]
  } ||       }}|4t         j                  j                  j                         }|j                  }||fS c c}w )Nc                   t        | t              rt        |       r| dz   S | S t        | t        t        t
        t        f      rt        |       S j                  t        j                  j                  j                  |             S )Nrk  )ru   r   r2   r   floatr   r   rR  r*   r9   r:   r;   )rS   r   s    r?   wrap_argz;WrapperCodeGen.prepare_triton_kernel_call.<locals>.wrap_arg  se    #s#*B3*GsYPSPC#udO!DE3x(()9)9)B)B3)GHHrA   )r*   r9   r  r  rS  )r   device_indexr  r  rS   r  s   `     r?   prepare_triton_kernel_callz)WrapperCodeGen.prepare_triton_kernel_call  s[    	I /88sXc]8	8WW..JJLN)//LY&& 9s   Ac                    t        |t              rt        j                  j	                  |      "|}t        j                  j                  |      }n|J d       d| }|}t        j                  j                  j                  |j                         t        j                        }t        j                  j                  j                  |j                         t        j                        }|j                         }	|j                         }
t        j                  j                  j                  |j                  j                   t        j                        }d| d| d|	 d|
 d| d} j"                  j%                  | d	|        |S t'        |t(        j*                        st        |t,              rt        |t.              r| j0                  v r|S |y
|}t        |t,              r|j2                  }|t        j                  j                  j4                  v r't        j                  j                  j4                  |   }t/        t        j                  j                  j                  |t        j                              S t        |t.        t6        t8        t:        f      rt/        |      S t        |t<              rddj?                   fd|D               dS tA        dtC        |             )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r}  zgenerate_example_value(rb   z, 'z', re   r  r   r6  c              3  T   K   | ]  }j                  |t        |             ! y wr6   r   )r}   ar   s     r?   r   z<WrapperCodeGen.generate_example_arg_value.<locals>.<genexpr>\  s#      ZQR!@!@DG!L Zr   r7  zUnsupported type )"ru   torch_dtyper*   r9   try_get_buffer
get_bufferr:   
size_hintsr  r   unbacked_symint_fallbackr  r7   r8   r  r<   r[  r   r   
issubclassrv   Basicr   r   rd  r   rG  r   r  r   r   ri   NotImplementedErrorr   )r   rS   arg_typeraw_argrS  r  r  rN  rO  rt  r   r[  rA  s   `            r?   r   z)WrapperCodeGen.generate_example_arg_value*  s   h,ww%%c*6gg((- 'XWX'%eW-77##..88 / D WW%%00 88 1 F ^^%FMMOEWW%%//

!!88 0 F .dV2fXSE7RTU[T\\]^E&&00H:S1HIO%++.*S/2R#s#$//)J?!#/nnagg&&CCCgg&&CCCH  **#<< +   c3t45s8OT"tyy ZVY ZZ[[\]]%(9$s)&EFFrA   c                z     t        |t              r ddj                   fd|D              z   dz   S t        |      S )Nr6  rb   c              3  @   K   | ]  }j                  |        y wr6   _grid_dim_strr}   rx   r   s     r?   r   z/WrapperCodeGen._grid_dim_str.<locals>.<genexpr>c  s     RT 2 24 8R   r7  )ru   r   ri   rQ  )r   grid_per_dims   ` r?   r  zWrapperCodeGen._grid_dim_str`  s<    lD)diiR\RRRUXX &&rA   c           
         |rƉ j                  ||      \  }}dj                  |      } j                  |t        j                        }|rW j                          ||	}n+dj                   fd|D              }|r| d| }|	 d| d} j                  | d| d| d| d       t        j                  j                  r| j                  vr|t        |      t        |      k(  sJ d	       i }g }|dgt        |      z  }nt        |      t        |      k(  sJ d
       t        t        |||            D ]  \  }\  }}}d}t        |t              r!dt        |      v r|j!                  d      \  }}t        |t"              r$||vr j%                  ||||      }|||<   n||   }n j%                  ||||      }|j'                  ||n| d|         ||	}n+dj                   fd|D              }|r| d| }|	 d| d} j(                  j                  | ddj                  |       d| d| d        j(                  j                  ddj                  d |j+                         D               d        j                  j-                  |       yd| d} j                  | d| d| d| d       yyy j                   j/                  ||             y)a7  
        Generates kernel call code.

        cuda: Defines whether the backend is GPU. Otherwise the backend is CPU.

        triton: Defines whether the GPU backend uses Triton for codegen.
                Otherwise it uses the CUDA language for codegen.
                Only valid when cuda == True.
        rb   Nc              3  @   K   | ]  }j                  |        y wr6   r  r  s     r?   r   z6WrapperCodeGen.generate_kernel_call.<locals>.<genexpr>  s     (Sd););D)A(Sr  rd   re   z.run(z, grid=z	, stream=z$call_args and arg_types do not matchz#call_args and raw_args do not matchr  c              3  T   K   | ]  }j                  |t        |             ! y wr6   r   )r}   r~   r   s     r?   r   z6WrapperCodeGen.generate_kernel_call.<locals>.<genexpr>  s'      -LMD;;AtAwG-r   del c              3      K   | ]  }|  y wr6   r|   )r}   rS   s     r?   r   z6WrapperCodeGen.generate_kernel_call.<locals>.<genexpr>  s     (M(M   r   z	c_void_p(r)  )r  ri   rb  r*   r9   rz  r   r   r   r   r   rQ   r<  r   ru   r   r  r  r   r   r   r  r   r  )r   r  r  r   r  r  r   r  r  r  r  autotune_configsgrid_extra_kwargscall_args_strstream_namegrid_strtensor_argsall_argsr'  rS   r  r   r   arg_str
stream_ptrs   `                        r?   r  z#WrapperCodeGen.generate_kernel_callh  ss   0 *.*I*Ii+'L- !IIm4M33L!''JK--/<&H#yy(Sd(SSH(&.Zr2C1D#E")!H:Q7H"m5wxj	R]Q^^_` MM::#4+E+EE %0S^s!H 6 >=>  #%K!H'$(6C	N#:"8}%1   A@A   8AIy(;8 X33C7 #%c3/C3s8O'*yy~HC%h<"+5*.*I*I$'7A+" 4;C 0*5c*:&*&E&E #Xw'G !3;se1WIDVW)X, |#*#'99 -QU- $ -*226G5H'IH&-Yaz#;..88&-uTYYx-@,A
R[\g[hhij ..88tyy(M8J8J8L(MMNbQ ..22;?(Q7
"m1[M=/J<qQu F ;~ NN400iHIrA   c                :    | j                   j                  |       y r6   )rE  r   )r   r   s     r?   r   zWrapperCodeGen.writeline  s    

$rA   c                4    |D ]  }| j                  |        y r6   rb  )r   rE  r   s      r?   r  zWrapperCodeGen.writelines  s     	!DNN4 	!rA   c                L    | j                   j                  t        |             y r6   )rE  r   r'   )r   ctxs     r?   r  zWrapperCodeGen.enter_context  s    

+c*+rA   c                    ddl m}m}  |       rdd l}t	        |t
              rt        |j                  j                        S t	        |t        j                        rt        |      S t	        |t        t        f      rAt        j                   G d d             t         t!        |       fd|D                    S t	        |t"        j$                  j&                        rt)        |      S t	        |t*        j,                  t.        f      r|j1                         S  |       r(t	        |j2                  j4                        r ||      S t        |      S )Nr   )dtype_to_stringhas_triton_packagec                      e Zd ZU ded<   d Zy)+WrapperCodeGen.val_to_arg_str.<locals>.Shimr   refc                    | j                   S r6   )r   r   s    r?   __repr__z4WrapperCodeGen.val_to_arg_str.<locals>.Shim.__repr__  s    88OrA   N)r   r   r   r   r"  r|   rA   r?   Shimr    s    $rA   r#  c              3  L   K   | ]  } j                  |              y wr6   )r  )r}   r  r#  r   s     r?   r   z0WrapperCodeGen.val_to_arg_str.<locals>.<genexpr>  s!     HT%8%8%; <Hs   !$)r  r  r  r   ru   r   rQ  r>   rI  rv   r   r   r   r  	dataclassrJ   r   rn  _ops
OpOverloadr   r    r  r"   r  languager   )r   r  type_r  r  r   r#  s   `     @r?   r  zWrapperCodeGen.val_to_arg_str  s    Ka"%%5::&8OE4=)""$ $ #$ QHaHHII5::001&q))BII78&&((!jFOO4I4I&J"1%%7NrA   c                    |j                         }|j                         }t        |j                               }t        |j	                               }| j                  |j                         ||||      S r6   )r7   r8   r   r  r  r  r  )r   r  rt  r   r@  rO  s         r?   r%  z%WrapperCodeGen.make_buffer_allocation  sb    ""$  "foo'(v((*+##FOO$5vueVTTrA   c           
        |j                   dv r9| d|j                    d| j                  |       d| j                  |       d| d
S | d| j                  |       d| j                  |       d|j                    d| d
S )	N)r  r  xpuz = empty_strided_rd   rb   re   z = empty_strided(rr  rs  )r   r   )r   rj   rt  r   r@  rO  s         r?   r  zWrapperCodeGen.make_allocation  s    ;;00 &)&++a++E232++F34B' f%''./r''/0 1{{m9UG16	
rA   c           	     `    | j                    | d| | j                   d| j                   d| 	S )Nr    rc   )rF  rH  rK  )r   new_nameold_namerK  s       r?   make_tensor_aliasz WrapperCodeGen.make_tensor_alias  s6    ,,zXJt{{m2dll^STU\T]^^rA   c                (    d|j                          S )Nr  )r  )r   r  s     r?   r+  zWrapperCodeGen.make_buffer_free  s    foo'())rA   c                8    ddj                  d |D               S )Nr  rb   c              3      K   | ]  }|  y wr6   r|   )r}   rj   s     r?   r   z4WrapperCodeGen.make_free_by_names.<locals>.<genexpr>  s     >>r  )ri   )r   names_to_dels     r?   make_free_by_namesz!WrapperCodeGen.make_free_by_names  s    dii>>>?@@rA   c           	     `    | j                    | d| | | j                   d| j                   d	S )Nr  r.   reuse)rG  rH  rK  )r   r0  r/  del_lines       r?   codegen_exact_buffer_reusez)WrapperCodeGen.codegen_exact_buffer_reuse  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrA   c                   |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  rQ|j                         |j                         k(  r0|| j                  v r|| j                  |<   | j                  |||      S | j                  ||j                         |j                         d| j                        }|| j                  v r|| j                  |<   | j                   | d| | d| j                   dS )N;z; r   r  r.  r8  )r8   r  r*   r9   r   r+  r  r  rV  r:  r`  rA  rG  rK  )r   oldnewr.  r0  r/  r9  reinterpret_views           r?   r1  z WrapperCodeGen.make_buffer_reuse   sE   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T47779<,,X6228XxPP88!11d6G6G
 t;;;58D((2../z=M<NxjXZ[_[g[gZhhnoorA   c                    | j                  t        || j                   | d|j                  j	                          | j
                   d| j                   d             y )Nr  r.  z alias)r   r.   rG  viewr  rH  rK  )r   rj   r<   s      r?   codegen_deferred_allocationz*WrapperCodeGen.codegen_deferred_allocation4  s\    //0c&++:W:W:Y9Z[_[f[fZggi<<.(	
rA   c                   |j                         }|t        j                  j                  v s|| j                  v ry | j                  j                  |       t        |j                         t        j                  t        j                  f      ry |j                         }t        |t        j                        ry t        |t        j                        ry t        |t        j                        r>t        |j                  t        j                         s*J dt#        |j                         d|j                          t        |j                  j$                  t        j&                        s$J t#        |j                  j$                               t        |j                  j$                  j$                  t        j(                        s$J t#        |j                  j$                               | j+                  |j                  j$                  j$                         | j-                  ||       y | j/                  t1        | |             y )Nzunexpected r  )r  r*   r9   r  r^  r   ru   get_defining_opr    ExternKernelAllocMultiOutput
get_layoutMutationLayoutSHOULDREMOVEr  NonOwningLayoutrA  r"   r   r^  
StorageBoxr  codegen_allocationrB  r   r  )r   r  rj   r<   s       r?   rK  z!WrapperCodeGen.codegen_allocation=  s    177***ddnn.D4 ""$!!2>>2
 ""$fb;;<fbmm,fb001R// @T&++./r&++?@  fkk..>VV[[EUEU@VV>fkk..33RYY?WfkkFVFVAWW?##FKK$4$4$9$9:,,T6:|D&12rA   c                *   |j                         }t        |t        j                        r!| j	                  | j                  |             y | j                  |      sy | j                  j                  |       | j	                  t        | |             y r6   )
r  ru   r    InputBufferr   r+  	can_reuser_  r   r   )r   r  rj   s      r?   codegen_freezWrapperCodeGen.codegen_freeZ  sk      fbnn-NN40089~~f%

t*489rA   c                l   |j                         }|t        j                  j                  v xs |t        j                  j                  v xsh |t        j                  j
                  v xsJ |t        j                  j                  v xs, |t        j                  j                  v xs || j                  v  S r6   )	r  r*   r9   r  r  r  r  never_reuse_buffersr_  )r   input_bufferoutput_bufferrj   s       r?   rN  zWrapperCodeGen.can_reuseh  s    $$&AGG+++ "qww+++"qww(((" qww222" qww222	"
 tzz!
 	
rA   c                    |j                         | j                  v xr. | j                  |j                            |j                         k(  S r6   )r  r`  )r   r  reused_buffers      r?   	did_reusezWrapperCodeGen.did_reuses  sC     OO, KFOO-.-2H2H2JJ	
rA   c                   t        |      t        |      k(  sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y r6   )	r@   rK  r_  r   r  r^  r`  r   r  )r   rR  rS  s      r?   codegen_inplace_reusez$WrapperCodeGen.codegen_inplace_reuse{  s    -1A-1PPPP-

|,,./=11340<0E0E0GM**,-y|]CDrA   c                    t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S r6   )r   rT  r   rF  )r   r  rj   s      r?   codegen_unbacked_symbol_declz+WrapperCodeGen.codegen_unbacked_symbol_decl  sC    6{4---K &&**40<<$&&rA   c                    t        |j                  j                  |      D ]3  \  }}| j                  | j                   | d| | j
                          5 y rE  )r   r9   r  r   rF  rH  )r   subgraphouter_inputsouter_outputsinner_inputouter_inputs         r?   codegen_subgraph_prefixz&WrapperCodeGen.codegen_subgraph_prefix  sP    (+HNN,G,G(V 	X$KNNdll^K=K=VW	XrA   c                    t        |j                  j                  |      D ]5  \  }}| j                  | d|j	                          | j
                          7 y rE  )r   r9   r  r   r  rH  )r   r\  r]  r^  inner_outputouter_outputs         r?   codegen_subgraph_suffixz&WrapperCodeGen.codegen_subgraph_suffix  sX    *-NN((-+
 	&L, NN.L$B$B$D#Edkk]S	rA   c                   	 | j                  |j                         | j                  | j                   d|j                          | j                  |||       t        j                  }t        j                  |j                        5  |j                  j                  |       d d d        | j                  |||       | j                          y # 1 sw Y   -xY w# | j                          w xY w)Nz subgraph: )parent_graph)r   r9   r   rK  rj   ra  r*   set_graph_handlercodegen_subgraphre  r   )r   r\  r]  r^  rg  s        r?   ri  zWrapperCodeGen.codegen_subgraph  s    	'%%hnn5NNdll^;x}}oFG((<O77L$$X^^4 //!- 0  ((<O$$&  $$&s$   BC 	C&C CC C0c                   |j                         }| j                  | dt        |j                                |j                  D cg c]  }|j                          }}t        t        |j                              D cg c]
  }| d| d }}|j                  j                         }t        |j                  t        j                        s| d}| j                  | dt        |j                                | j                  d| d       | j                  t        | |j                  j                               | j                  |j                  ||       | j                  t        |              | j                  d       | j                  t        | |j                   j                               | j                  |j                   ||       | j                  t        |              y c c}w c c}w )N = [None] * r6  r7  rk  r   r   zelse:)r  r   rQ   r  operandsr  r!  	predicateru   r    ShapeAsConstantBufferr   true_subgraphr9   ri  r   false_subgraph)r   conditionalrj   r  r]  r'  r^  rm  s           r?   codegen_conditionalz"WrapperCodeGen.codegen_conditional  s   ##%$|C0C0C,D+EFG;F;O;OPC--/PP16s;;N;N7O1PQAD61#QQQ));;=	+//1I1IJ$+W-I$|C0C0C,D+EFGYKq)*({/H/H/N/NOPk77}U'-.w({/I/I/O/OPQk88,V'-.! QQs   G1G6c                   |j                         }|j                  D cg c]  }|j                          }}|j                  D cg c]  }|j                          }}| j	                  | dt        |              t        |      D ]  \  }}| j	                  | d| d|          g t        t        |            D cg c]
  }| d| d c}|}| dg}	t        |      }
|
d t        |       }| j	                  d       | j	                  t        | |j                  j                               | j                  |j                  ||	       | j	                  d|	d    d	       | j	                  t        |              | j	                  t        | |j                  j                               | j                  |j                  |
|       | j	                  t        |              y c c}w c c}w c c}w )
Nrk  r6  z] = r7  _cond_resultzwhile True:zif not r   z.item(): break)r  carried_inputsr  additional_inputsr   rQ   r<  r!  r   r   cond_subgraphr9   ri  r   body_subgraph)r   
while_looprj   r  outer_carried_inputsouter_additional_inputsr'  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputss               r?   codegen_while_loopz!WrapperCodeGen.codegen_while_loop  s   ""$/9/H/H 
(+C!!# 
  
 0:/K/K#
(+C!!##
 #
 	$|C0D,E+FGH 45 	3FAsNNdV1QCtC512	3
&+C0D,E&FGas!nG
$
 "&l34 
 //J5I1JK}%(z/G/G/M/MNO$$&79K	
 	(+,N;	
 	'-.(z/G/G/M/MNO$$&79K	
 	'-.M 
#
 Hs   G+G0G5c                    	 t        | dd       ry t        | t              r| S t        j                  j
                  j                  |       }t        |      S # t        $ r Y y w xY w)Nfree_symbols)r  ru   r   r*   r9   
_shape_env_maybe_evaluate_static	Exception)r  r   s     r?   statically_known_int_or_nonez+WrapperCodeGen.statically_known_int_or_none  s\    
	q.$/ !S!''$$;;A>Cs8O 		s   A A 3A 	A"!A"c                l    g }| D ],  }t         j                  |      }| y |j                  |       . |S r6   )r   r  r   )lstr  r  nums       r?   %statically_known_list_of_ints_or_nonez4WrapperCodeGen.statically_known_list_of_ints_or_none  sA     	A ==a@C{MM#		
 rA   c                0    t         j                  |       d uS r6   )r   r  )r  s    r?    is_statically_known_list_of_intsz/WrapperCodeGen.is_statically_known_list_of_ints  s    CCCHPTTTrA   c                H    t         j                  | j                               S r6   )r   r  r  r  s    r?   r  z.WrapperCodeGen.static_shape_for_buffer_or_none  s    CCFOODUVVrA   c                0    t         j                  |       d uS r6   )r   r  r  s    r?   !can_prove_buffer_has_static_shapez0WrapperCodeGen.can_prove_buffer_has_static_shape  s    ==fETQQrA   )rj   r   ri  r   r   r   r   )r  TritonMetaParamsr   r   )r   	List[str]r6   )r   r   r   r   r  )r   r   r   r   )r  r  r   r   )r  r/   r   r   )rk   r   r  r   r  r   rl   r  )r  r   r  	List[Any]r   r  )r5  NNN)r  r   r  r   r  r   r  r  r  r   r  r   r  r   r  zOptional[torch._ops.OpOverload])r   r/   )r   r/   r  zDict[str, ir.TensorBox])rH  zsympy.Symbol)r  r   r;   r   r   r   )r  r   r   r   )rR  r   rj   r   rS  r   r   r   )r@  zTuple[Expr, ...]r   r   )NT)rj   r   rk   r   r  r   )r  r   r@  r   )TN)r  r   r   r  r  r   r  zOptional[Callable[..., Any]])NN)
NNTTNNr   NNr5  )r  r   )r5  )r5  r  )r0  r   r/  r   r9  r   )r=  r  r>  r  r.  r   )r  r  )rR  r  rS  r  )gr   r   r   __doc__r   r]  rY  r[  r%   rz  rx  r  r  r  r  r  rZ  rb  r  r   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r+  r-  r  rJ  r  rN  rP  rT  rW  r   r\  r`  re  rg  ro  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r   r  r  r  r%  r  r1  r+  r6  r:  r1  rB  rK  rO  rN  rV  rX  rZ  ra  re  ri  rr  r  staticmethodr  r  r  r  r  r   r   s   @r?   r   r     s   K
Z<'
R
 	0 	0  	! W W
S	(1:.10J
/+7&5
5A:77 #7/<7DM7

 
 	
4&< )+7;XX  X 	X
  X X X #&X 5X0A,F!>:
$
LW

/*"/*2I/*b CG +.&'6
 9=	..G4*NY`& LP33!$30=3P!d1*>>85W

< 6:  	
 4'$4Gl' jJ jJX !,<U
"_*Aup(
3::	

E'X'/.(/T     U U W W R RrA   r   )r>   r  r   r   )rS   torch.Argumentr   r   )r^   r  r   r   )rk   ztorch._ops.OpOverloadr   r   r6   )
rj   r   r   zList[triton.Config]r   zList[TritonGrid]r   zOptional[WrapperCodeGen]r   zTuple[str, str])q
__future__r   r   r   r  r  r  r  r  r   rO   r  	itertoolsr   typingr   r   r   r   r	   r
   r   r   r   r   rv   r   rn  
torch._opsr   r  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   torch.fx.noder    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r5  r   r   r    	codecacher!   r"   runtimer#   runtime.hintsr$   utilsr%   r&   r'   r(   r)   virtualizedr*   aoti_hipify_utilsr,   r  r-   r.   r/   r0   triton_utilsr1   r2   r3   r   r9   r4   doprintrQ  rt  r   r   r@   r[   r`   rr   r   r  r   r   r%  r   r#  r   r   r   r   r   r  r  r  r   r  r  
BufferNamer   r|   rA   r?   <module>r     s   "    
     	         & 6 C A ; V V - 9 : ( ( '   ' ,   8 H H P P % 	 u{{C/0B>$<$ S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 )-	F&
F& F& F& &	F&
 F&R    ' * **	 	 	 	 	 {   +KK +K +K\;  ; ; ;2 %  : E, E E, 
" 
 
(	! 	 
RW RrA   