
    sg                       U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZm Z  d dl!m"Z"m#Z#m$Z$ d	d
l%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZDmEZEmFZFmGZG ddlHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZU ddlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZb ej                  dk(  Zdd Zeej                  j                  ehd      Zih dZjddddddddd d d!
Zkh d"Zld#d$d%d&d'd(d)d*d+d,d-
Zmd.d/d0Znej                  ej                  gZqej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  g	Zyeej                     e{d1<   ej                  ej                  ej                  ej                  ej                  gZ|eej                     e{d2<   d3 Z}d4 Z~	 	 dfd5eej                     fd6Zd7 Zej                  d5ej                  d8ej                  fd9       Zej                  d5ej                  d8ej                  d:efd;       Zej                  	 dgd5ej                  d8ej                  d:ee   fd<       Z G d= d>e4      Z G d? d@      Z G dA dBeT      Zej                  dC        G dD dEe      Zej                  dF       ej                           G dG dHe      Z G dI dJeR      Z G dK dLe      Z G dM dNe      ZdOe/dPeeej                     etf   fdQZ G dR dS      Z G dT dUe      Z G dV dWe      Z G dX dYe      Z G dZ d[e2      Z G d\ d]      Z G d^ d_e      Z G d` da      Zej6                   G db dc             Zej6                   G dd de             Zy)h    N)copydeepcopy)Enum)castDictListOptionalSequenceSetTupleUnion)dependencies)is_float_dtypeis_integer_dtype)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )	codecacheconfigcpp_buildercpu_vec_isairmetrics)LoopBody)BaseSchedulerNodeBaseSchedulingForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCppWrapperKernelArgsCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPP
INDEX_TYPELocalBufferContextpromote_argsunify_mask_base_typevalue_to_cppwin32c                      t         rdS dS )Nz__declspec(dllexport) _IS_WINDOWS     N/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrX   R   s    &1"9r9rV   schedule>   *maxmin+^||r]   rZ   r^   r\   r[   argminargmaxr_   welford)
sumprodxor_sumr\   r[   r`   ra   anywelford_reducewelford_combine>
   rf   r[   r\   rc   rd   ra   r`   re   rg   rh   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrk   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)r   r	   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                 2   |t         v rt        j                  }| dv ry| dk(  ry| dv rNt        |   }t	        |      rd| dnd| d	}t	        |      rd| dnd| d
}| dv r|n|}| dv r|S d| d| dS t        |       rdt        |    dS t        |       )N)re   rc   rf   r   rd   r6   )r[   ra   r\   r`   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r[   ra   )r[   r\   IndexValue<z>{0, }Welford<>())DTYPE_LOWP_FPtorchfloat32rJ   r   r+   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rW   reduction_initr      s     22;;e$ e$ $F8=9'xx8 	 e$ #6(-8'xx8 	
 -0AA7w / 	
 vhfXJb9	

 N+,u-.c22

((rV   c                 \    t         t        |      }t        |       rd| dS | dv rd| dS |S )Nr   >>   ra   r`   r}   )rJ   r>   r+   )r   r   scalar_types      rW   reduction_acc_typer      sG    9%@AKN++a((--[M++rV   indexc           	         |t         j                  k(  }| dk(  r|rdnd}| d| d| S | dk(  r| d| S | dk(  r| d| S | d	k(  r| d
| S | dv r|  d| d| dS | dk(  r	d| d| dS | dk(  r6t        |t              r|\  }}}	nt	        | |      \  }}}	d| d| d| d|	 d	S | dv r||  d| d| d| dS |  d| d| dS t        |       )Nrc   |r]    rd    * re    ^ rf    || )r\   r[   z_propagate_nan(, )rg   welford_combine(rh   , {})r`   ra   z	_combine()r   rk   
isinstancetuplereduction_projectr   )
r   var
next_valuer   	src_dtypeis_boolconjunctionmeanm2weights
             rW   reduction_combiner      sl    5::%G$c#a}Aj\22c*&&"c*&&d:,''' !R
|1EE))!#bA66**j%()D"f0LD"f!#d4&2$bDD--$%Yse2j\E7!LL$%Yse2j\CC

((rV   c                 J    t        |       r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weight>   ra   r`   z.index)r+   )r   accs     rW   r   r      sC    N+e}SkcU'?::	/	/f~JrV   r   c                     | j                  |      st        j                  d      S ||dz   i}t        | |      }t        j                  || z
        S Nr   r6   )hassympyIntegerr1   simplify)r   r   replacement	new_indexs       rW   	stride_atr      sK    99S> }}Qa.K5+.I>>)e+,,rV   
vec_lengthc                   	 dd	fd}	fd}| }t        j                  dd      }| j                  t              r| j	                  t        |      |      } t        j                  dd      }| j                  t
              r| j	                  t        ||      |      } t        j                  |       } | |k7  rt        |       S | S )a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                     t        |       }t        j                  |       k(  rt        j                   d       }dz  |S )N_div_cr6   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rW   visit_indexing_divz7simplify_index_in_vec_range.<locals>.visit_indexing_div  sK    #w'99Wj)Z7\\SE/?"@AFaNrV   c                    t        | |      }t        j                  |       k(  r!t        j                   d       }dz  |S | dk(  r;t        j                  |      k(  r"t        j                   d       z   }dz  |S )N_mod_cr6   )r   r   r   r   )r   modulusr   mod_freevar_idr   r   s      rW   visit_modular_indexingz;simplify_index_in_vec_range.<locals>.visit_modular_indexing!  s     gw799Wj)Z7\\SE/?"@AFaN  \eii<
J5<<3%vn5E(FGGFaNrV   r   T)integerr   )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r   r   original_indexdivmodr   r   s
    ``     @@rW   r   r      s    0 NN	 N
**Y
-CyyhsC02DE
**Y
-Cyy!oc3<>TUNN5!E*5#zBBLrV   c                 8    |rt        | ||      } t        | |      S N)r   r   )r   r   r   s      rW   stride_at_vec_ranger   =  s"     +E3
CUC  rV   c                   d     e Zd Zededefd       Zdddeeee	f      f fdZ
d Zd	 Zd
 Z xZS )OuterLoopFusedSchedulerNodenode1node2c                    |j                   |j                   u sJ t        d ||fD              sJ t        d ||fD              rt | |j                   t        |      t        u rt        |j                               n|gt        |      t        u r t        |j                               z   |      S |gz   |      S  | |j                   ||g|      S )Nc              3   T   K   | ]   }t        |      t        t        t        fv  " y wr   )typer   r&   r$   .0nodes     rW   	<genexpr>z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>L  s1      
  J+"
   &(c              3   >   K   | ]  }t        |      t        u   y wr   r   r   r   s     rW   r   z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>U       TTtDz88T   )	schedulerallrf   r   r   listget_outer_nodes)clsr   r   outer_loop_fusion_depths       rW   fusez OuterLoopFusedSchedulerNode.fuseG  s     %//111 
 
 
 	
 
 TeU^TT E{&AA ..01  E{&AA ..01 (!   (! & u8OPPrV   r   r%   outer_fused_nodesc                     || _         || _        g }| j                   D ]B  }t        |t        t        f      sJ |j                  t        |j                                      D t        | %  ||       y r   )
r   r   r   r&   r$   extendr   	get_nodessuper__init__)selfr   r   r   flatten_snodes_node	__class__s         rW   r   z$OuterLoopFusedSchedulerNode.__init__k  su      	 (?$++ 	;Eem5G%HIII!!$u'8"9:	; 	N3rV   c                     | j                   S r   )r   r   s    rW   r   z+OuterLoopFusedSchedulerNode.get_outer_nodes{  s    %%%rV   c                 0   dt         dt         dt        dt        ffdt        t	        |      dz
        D ]]  }||   j
                  }||dz      j
                  }t        d ||fD              s% |j                  d   |j                  d   |      r] y	 y
)Nleft_loop_levelright_loop_levelloop_fusion_depthreturnc                     g d}t         fd|D              sy|dk\  sJ |dz
  x}dkD  rS j                  j                  J t        d  fD              s$  j                  d   j                  d   |      syy)N)r   sizeoffsetstepsc              3   P   K   | ]  }t        |      t        |      k(    y wr   )getattr)r   attr_comparer   r   s     rW   r   zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>  s2       % O\:/>?s   #&Fr6   r   c              3   L   K   | ]  }t        |j                        d k7    ywr6   N)leninner)r   
loop_levels     rW   r   zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>  s*       # 
(()Q.   "$T)r   kernelrf   r   )r   r   r   outer_loops_attr_compare_list_inners   ``  rW   r  zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner  s    -)   )F  $)))%6%::!a? $**27G7N7N7VV   (78H&I	 
  #))!,$**1-%
 !rV   r6   c              3   L   K   | ]  }t        |j                        d k7    ywr   )r   root)r   	loop_nests     rW   r   zQOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>.<genexpr>  s(        INN#q(r  r   FT)	LoopLevelrm   rk   ranger   r  rf   r  )r   cpp_kernel_proxy_listr   idxleft_loop_nestright_loop_nestr  s         @rW   "check_outer_fusion_loop_level_attrz>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr~  s    )	&)	')	  #)	 	)	V 23a78 
	C237AAN3C!G<FFO  #1/!B  ##A&(<(<Q(?AX 
	 rV   c                     |D cg c]  }|j                    }}|d   j                  dt        t        d      ffd |D cg c]  }|j                   c}| j                         |d   S c c}w c c}w )Nr   loop_level_nested_listr  c                 n   |dk\  sJ t        d | D              sJ |dz
  x}dk\  r&| D cg c]  }|d   j                   }} ||       y t              }| d   d   }t        t	        |             D ],  }|j                  j                  t        | |   d                . g |_        ||_        y c c}w )Nr6   c              3   8   K   | ]  }t        |      d k(    ywr   r   )r   loop_level_lists     rW   r   zrOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>._merge_outer_fusion_loop_levels.<locals>.<genexpr>  s      .=O$)s   r   )r   r   OuterLoopFusedKernelr	  r   appendr   r  )	r  r   r  next_loop_level_nested_listouter_loop_fused_kernelloop_level_of_first_kernel
kernel_idx_merge_outer_fusion_loop_levelskernel_groups	          rW   r  z_OuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>._merge_outer_fusion_loop_levels  s     +a/// AW    ,CQ+FF'1L ,B/' $A&,,/+ / 0/+
 +?|*L'-CA-Fq-I*"',B(C"D J+1188 !7
!CA!FG 46*04K*1!/s   B2)r  r  r   r  r   )r   r
  r  loop_nest_list
_loop_nestr  r  s        @@rW   merge_outer_fusion_kernelsz6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernels  s    
 ,A3
!'F3
 3
 -Q/<<	L$(k):$;	L> 	(/=>Z__>((	
 %Q''Q3
J ?s   A2A7)__name__
__module____qualname__classmethodr!   r   r   r   r$   r&   r   r   r  r  __classcell__r   s   @rW   r   r   F  sb    !Q%!Q.?!Q !QF44  &8-&G HI4 &@D,(rV   r   c                   2    e Zd ZddefdZd Zd Zd Zd Zy)	RecordOptimizationContext	func_namec                 .    || _         d | _        d | _        y r   )r(  current_nodeopt_ctx)r   r(  s     rW   r   z"RecordOptimizationContext.__init__  s    "596:rV   c                    t         j                  sJ t         j                  j                  sJ t         j                  j                  | _        | j                  J t        j                  | j                  j
                  v r-| j                  j
                  t        j                     | _        nt               | _        | j                  J | j                  | j                  _        | S r   )	r5   interpreterr*  rC   keymetar+  r(  ops_namer   s    rW   	__enter__z#RecordOptimizationContext.__enter__  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rV   c                     | j                   sJ | j                  sJ | j                  | j                   j                  t        j                  <   y r   )r*  r+  r/  rC   r.  r   exc_typeexc_valexc_tbs       rW   __exit__z"RecordOptimizationContext.__exit__  s>        |||:>,,2667rV   c                     | j                   S r   )r+  r   s    rW   get_opt_ctxz%RecordOptimizationContext.get_opt_ctx  s    ||rV   c                 6    | j                   sJ | j                   S r   )r*  r   s    rW   get_fx_nodez%RecordOptimizationContext.get_fx_node  s           rV   N)rR   )	r   r!  r"  ro   r   r1  r7  r9  r;  rU   rV   rW   r'  r'    s#    ;# ;
G
!rV   r'  c                      e Zd ZdZed        Zed        Zed        ZedLd       Zed        Z	ed        Z
ed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed         Z"ed!        Z#ed"        Z$ed#        Z%ed$        Z&ed%        Z'ed&        Z(ed'        Z)ed(        Z*ed)        Z+ed*        Z,ed+        Z-ed,        Z.ed-        Z/ed.        Z0ed/        Z1ed0        Z2ed1        Z3ed2        Z4ed3        Z5ed4        Z6ed5        Z7ed6        Z8ed7        Z9ed8        Z:ed9        Z;ed:        Z<ed;        Z=ed<        Z>ed=        Z?ed>        Z@ed?        ZAed@        ZBedA        ZCedB        ZDedC        ZEedD        ZFedEeGj                  dFeGj                  fdG       ZIedEeGj                  dFeGj                  fdH       ZJedEeGj                  dFeGj                  fdI       ZKedJ        ZLedK        ZMy)MCppOverrideszMap element-wise ops to C++c                     d|  d|  d| dS )N	decltype()( + r   rU   abs     rW   addzCppOverrides.add      1#Rs#aS**rV   c                     d|  d|  d| dS )Nr?  r@   - r   rU   rB  s     rW   subzCppOverrides.sub  rF  rV   c                     d|  d|  d| dS )Nr?  r@  r   r   rU   rB  s     rW   mulzCppOverrides.mul  rF  rV   Nc                    t        | t              sJ || j                  }t        j                  j                  | ||      }t        j                  j                  j                  t        j                  j                  |      }|j                  d| |fd|i       |t        j                  t        j                  fv r6|t        j                  k(  r#	 t        j                  j                  | |||       |S )Nto_dtyper   )r   rI   r   r5   r  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   bfloat16float16rn   cache_dtype_convert)xr   r   use_compute_typesexprcsevars         rW   rM  zCppOverrides.to_dtype  s    !^,,,Ixx))!UI>&&qxx'7'7>j1e*{I6NOU^^U]]33	U[[8P> HH((IvuErV   c                    |t         v sJ | dt         d       |t        j                  t        j                  fv r>dt         |    d|  d}dt         |    d| d}dt         t        j
                      d| dS dt         |    d|  dS )Nz missing from z.DTYPE_TO_CPPc10::convert<>(r   zc10::bit_cast<)rJ   r   r   rT  rS  r   )rV  r   r   cast_xs       rW   to_dtype_bitcastzCppOverrides.to_dtype_bitcastJ  s    $U~hZ}&UU$77 %\)%<$=Rs!DF%l5&9%:"VHAFF"<#>"?r&KK#L$7#81#Q??rV   c                     d|  dS )Nz	std::abs(r   rU   rV  s    rW   abszCppOverrides.absZ      1#QrV   c                     d|  dS )Nz	std::sin(r   rU   r`  s    rW   sinzCppOverrides.sin^  rb  rV   c                     d|  dS )Nz	std::cos(r   rU   r`  s    rW   coszCppOverrides.cosb  rb  rV   c                     d|  d|  dS )Nr?  z)(-r   rU   r`  s    rW   negzCppOverrides.negf      1#S1%%rV   c                     d|  dS )Nz	std::exp(r   rU   r`  s    rW   expzCppOverrides.expj  s     1#QrV   c                     d|  dS )Nz
std::exp2(r   rU   r`  s    rW   exp2zCppOverrides.exp2o      A3a  rV   c                     d|  dS )Nzstd::expm1(r   rU   r`  s    rW   expm1zCppOverrides.expm1s      QCq!!rV   c                     d|  dS )Nz	std::erf(r   rU   r`  s    rW   erfzCppOverrides.erfw  rb  rV   c                     d|  dS )Nz
std::erfc(r   rU   r`  s    rW   erfczCppOverrides.erfc{  rn  rV   c                     d|  dS )Nzcalc_erfinv(r   rU   r`  s    rW   erfinvzCppOverrides.erfinv      aS""rV   c                     d|  dS )Nz
std::sqrt(r   rU   r`  s    rW   sqrtzCppOverrides.sqrt  rn  rV   c                     d|  dS )Nz1 / std::sqrt(r   rU   r`  s    rW   rsqrtzCppOverrides.rsqrt  s    s!$$rV   c                 |    t         j                  j                  }|dk(  r|  d|  dS |d|  dS t        d|      )Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   rV  bugs     rW   log1pzCppOverrides.log1p  sW    jj66*SQCt,,[ 1%% J3'R rV   c                     d|  dS )Nz	std::tan(r   rU   r`  s    rW   tanzCppOverrides.tan  rb  rV   c                     d|  dS )Nz
std::tanh(r   rU   r`  s    rW   tanhzCppOverrides.tanh  rn  rV   c                 &    t         rd|  dS d|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>())zstd::signbit(r   rS   r`  s    rW   signbitzCppOverrides.signbit  s-      /qc4	
 !1%	
rV   c                     d|  d| dS )Nz	std::pow(r   r   rU   rB  s     rW   powzCppOverrides.pow  s    1#Rs!$$rV   c                     d|  dS )Nz	std::log(r   rU   r`  s    rW   logzCppOverrides.log  rb  rV   c                     d|  dS )Nzstd::nearbyint(r   rU   r`  s    rW   roundzCppOverrides.round  s     1%%rV   c                     d|  dS )Nzstd::floor(r   rU   r`  s    rW   floorzCppOverrides.floor  rq  rV   c                 H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rU   )rC  rD  quotrems       rW   floordivzCppOverrides.floordiv  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrV   c                     d|  dS )Nz
std::ceil(r   rU   r`  s    rW   ceilzCppOverrides.ceil  rn  rV   c                     d|  dS )Nzstd::trunc(r   rU   r`  s    rW   trunczCppOverrides.trunc  rq  rV   c                     |  d| S Nr  rU   rB  s     rW   truncdivzCppOverrides.truncdiv  s     Cs|rV   c                     d|  d| dS )Nz
std::fmod(r   r   rU   rB  s     rW   fmodzCppOverrides.fmod  s    A3b1%%rV   c                     d|  dS )Nzstd::isinf(r   rU   r`  s    rW   isinfzCppOverrides.isinf  rq  rV   c                     d|  dS )Nzstd::isnan(r   rU   r`  s    rW   isnanzCppOverrides.isnan  rq  rV   c                     d|  dS )Nzstd::lgamma(r   rU   r`  s    rW   lgammazCppOverrides.lgamma  rx  rV   c                     d|  dS )Nz
std::acos(r   rU   r`  s    rW   acoszCppOverrides.acos  rn  rV   c                     d|  dS )Nzstd::acosh(r   rU   r`  s    rW   acoshzCppOverrides.acosh  rq  rV   c                     d|  dS )Nz
std::cosh(r   rU   r`  s    rW   coshzCppOverrides.cosh  rn  rV   c                     d|  dS )Nz
std::sinh(r   rU   r`  s    rW   sinhzCppOverrides.sinh  rn  rV   c                     d|  dS )Nz
std::asin(r   rU   r`  s    rW   asinzCppOverrides.asin  rn  rV   c                     d|  dS )Nzstd::asinh(r   rU   r`  s    rW   asinhzCppOverrides.asinh  rq  rV   c                     d|  d| dS )Nzstd::atan2(r   r   rU   rV  ys     rW   atan2zCppOverrides.atan2      QCr!A&&rV   c                     d|  dS )Nz
std::atan(r   rU   r`  s    rW   atanzCppOverrides.atan  rn  rV   c                     d|  dS )Nzstd::atanh(r   rU   r`  s    rW   atanhzCppOverrides.atanh  rq  rV   c                     d|  d| dS )Nzstd::copysign(r   r   rU   r  s     rW   copysignzCppOverrides.copysign  s    s"QCq))rV   c           	      N   d|  dd|  df}t        d |D              rt        d |D              S t               }t        j                  j
                  j                         }t        j                  j
                  j                         }|j                  d| d       |j                  d| d	|  d
| d       t        j                  j                  j                  |       ||f}t        ||      D ],  \  }}|t        j                  j
                  j                  |<   . ||fS )Nfrexp()[0])[1]c              3   h   K   | ]*  }|t         j                  j                  j                  v  , y wr   r5   r  rO  cacher   	cache_keys     rW   r   z%CppOverrides.frexp.<locals>.<genexpr>
  #     K9yAHHLL...K   02c              3   j   K   | ]+  }t         j                  j                  j                  |    - y wr   r  r  s     rW   r   z%CppOverrides.frexp.<locals>.<genexpr>  #     S9++I6S   13zint32_t ;auto z = std::frexp(, &);)r   r   r8   r5   r  rO  newvar	writelinerQ  splicezipr  )rV  
cache_keyscodeexponentmantissacse_varsr  cse_vars           rW   frexpzCppOverrides.frexp  s   aS%s$'77
K
KKS
SSS~88<<&&(88<<&&((1-.xjqcXJbIJ	%h'"%j("; 	4Iw,3AHHLLy)	4!!rV   c                     d|  d| dS )Nzstd::hypot(r   r   rU   r  s     rW   hypotzCppOverrides.hypot  r  rV   c                     d|  dS )Nzstd::log10(r   rU   r`  s    rW   log10zCppOverrides.log10  rq  rV   c                     d|  dS )Nz
std::log2(r   rU   r`  s    rW   log2zCppOverrides.log2   rn  rV   c                     d|  d| dS )Nzstd::nextafter(r   r   rU   r  s     rW   	nextafterzCppOverrides.nextafter$  s     2aS**rV   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Ncompile_errorcompile error!runtime_error	; throw 1r~  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rW   reluzCppOverrides.relu(  s|    jj55/!#O#S	?"JSQCt,,[qcQCu55 I#Q rV   c                     d|  d| dS )Nzmin_propagate_nan(r   r   rU   rB  s     rW   minimumzCppOverrides.minimum8      #A3b1--rV   c                     d|  d| dS )Nzmax_propagate_nan(r   r   rU   rB  s     rW   maximumzCppOverrides.maximum<  r  rV   c                     |  d| d| S )N ?  : rU   )rC  rD  cs      rW   wherezCppOverrides.where@  s    Cs#aS!!rV   c                     d|  d| dS )Nzmod(r   r   rU   rB  s     rW   r   zCppOverrides.modD  s    aS1#QrV   c                 X    |t         v rt        j                  }t        | t        |         S r   )r   r   r   rO   rJ   )valr   s     rW   constantzCppOverrides.constantH  s'    M! MMECe!455rV   c                    t        t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |t        |             }t        j                  ||      S )Nbounds)
rF   r5   r  rename_indexingrO  rP  rQ  r(   r3   rM  )rX  r   idx_strr   s       rW   
index_exprzCppOverrides.index_exprP  sb    0067hhll##HHg.CD.I $ 
 ||C''rV   c                 "   t               }t        j                  j                  j	                         }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        |j                  d       t        j                  j                  j                  |       t        |d| d      }|  d| d| S # 1 sw Y   exY w# 1 sw Y   ixY w)	Nr   = [&]return r  r?  z())r  z() : )r8   r5   r  rO  r  r  swap_buffersindentrQ  r  rO   )maskbodyotherr  body_varr   
other_codes          rW   maskedzCppOverrides.maskedX  s    ~ 88<<&&(xj/0XX""4( 	0$++- 	0VFNNWVHA./	0 	0 	s	% "%9XJc)BC
s8*E*66	0 	0 	0 	0s$   'D8C9D9D	>DDc                     |  d| S )Nz && rU   rB  s     rW   logical_andzCppOverrides.logical_andi      D}rV   c                     d|  S )N!rU   rC  s    rW   logical_notzCppOverrides.logical_notm      1#wrV   c                     |  d| S )Nr   rU   rB  s     rW   
logical_orzCppOverrides.logical_orq  r  rV   c                     |  d| S )N != rU   rB  s     rW   logical_xorzCppOverrides.logical_xoru  r  rV   c                     d|  d|  d| dS )Nr?  r@   & r   rU   rB  s     rW   bitwise_andzCppOverrides.bitwise_andy  rF  rV   c                     d|  d|  dS )Nr?  z)(~r   rU   r  s    rW   bitwise_notzCppOverrides.bitwise_not}  ri  rV   c                     d|  d|  d| dS )Nr?  r@   | r   rU   rB  s     rW   
bitwise_orzCppOverrides.bitwise_or  rF  rV   c                     d|  d|  d| dS )Nr?  r@  r   r   rU   rB  s     rW   bitwise_xorzCppOverrides.bitwise_xor  rF  rV   c                     d|  d|  d| dS )Nr?  r@   << r   rU   rB  s     rW   bitwise_left_shiftzCppOverrides.bitwise_left_shift      1#Rs$qc++rV   c                     d|  d|  d| dS )Nr?  r@   >> r   rU   rB  s     rW   bitwise_right_shiftz CppOverrides.bitwise_right_shift  r-  rV   seedr   c                     d|  d| dS )Nznormalized_rand_cpu(r   r   rU   r1  r   s     rW   randzCppOverrides.rand  s    %dV2fXQ77rV   c                     d|  d| dS )Nz
randn_cpu(r   r   rU   r3  s     rW   randnzCppOverrides.randn  s    D6F81--rV   c           	           d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rU   )r1  r   lowhighs       rW   	randint64zCppOverrides.randint64  s#    vRxr#ba@@rV   c                     d|  d|  d|  dS )Nr?  z)(1) / (decltype(z)(1) + std::exp(-r  rU   r`  s    rW   sigmoidzCppOverrides.sigmoid  s    1#.qc1B1#RHHrV   c           
      N   t               }d|  d}d|  d}|j                  d       |j                         5  |j                  d|  d| d| d       |j                  d	|  d
| d| d       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr?  )(0)r  [&]()auto left = z > 0 ? r  r  auto right = z < 0 ? return left - right;()r8   r  r  )rV  r  scalar_zero
scalar_ones       rW   signzCppOverrides.sign  s    ~!!D) 4(
w[[] 	3NN\!GJ<s;-qQRNN]1#WZLK=PQRSNN12	3 	t	3 	3s   ABB$NT)Nr   r!  r"  __doc__staticmethodrE  rI  rK  rM  r^  ra  rd  rf  rh  rk  rm  rp  rs  ru  rw  rz  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r"  r$  r'  r)  r,  r0  r   Exprr4  r6  r:  r<  rG  rU   rV   rW   r=  r=    s   %+ + + + + + ( (T @ @             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! 	
 	
 % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * " "  ' ' " " ! ! + +   . . . . " "     6 6 ( ( 7 7          + + & & + + + + , , , , 85:: 8uzz 8 8 .EJJ .

 . . A

 AEJJ A A I I 
 
rV   r=  r  c                       e Zd ZdZ fdZed        Zed        Zed        Zed        Z	ed        Z
ed        Zed	        Zed
        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Z ed        Z!ed        Z"ed         Z#ed!        Z$ed"        Z%ed#        Z&ed$        Z'ed%        Z(ed&        Z)ed'        Z*ed(        Z+ed)        Z,ed*        Z-ed+        Z.ed,        Z/ed-        Z0ed.        Z1ed/        Z2ed0        Z3ed1        Z4ed2        Z5ed3        Z6ed4        Z7ed5        Z8ed6        Z9ed7        Z:ed8        Z;ed9        Z<ed:        Z=ed;        Z>ed<        Z?ed=        Z@ed>        ZAed?        ZBed@        ZCedA        ZDedB        ZEedC        ZFedD        ZGedE        ZHedF        ZIedG        ZJedH        ZKedI        ZLedJ        ZMedRdK       ZNedL        ZOedM        ZPedN        ZQedO        ZReSdP        ZTeSdQ        ZU xZVS )SCppVecOverridesz.Map element-wise ops to aten vectorization C++c                     t         |   |       fd}t        t              j	                         D ]<  \  }}t        |dd       t        k(  s|dvs t        | ||j                               > S )Nc                       fd}|S )Nc                  $   | D cg c]@  }t        |t        t        j                  f      st        |t              r|j
                  s|B }}| D cg c]   }t        |t              r|j
                  r|" }}t        |       }|r|rg }| D ]  }t        |t        t        j                  f      rt        |t        j                        r1|j                  s%t        j                  |t        j                        }n$t        j                  |t        j                        }t        |t              r|j                  n|}|j                  |        |r>t!        |      dk(  rt#        |      }n$
t$        j&                  k(  rt#        |dd        |dd  |r|rt        t(        j*                  t,              sJ |D cg c]p  }t        |t              r\|j
                  sP
t$        j.                  t$        j0                  t$        j2                  fvrt(        j*                  j5                  |      n|r }}|r 
|i |S t7        t$              }t9        |
j:                  |j=                  
j:                              }|J  || i |S c c}w c c}w c c}w )Nr   r6   )r   rm   r   rK  rI   is_vecr   	is_numberr3   r  r   int64r  r4   valuer  r   rM   rM  r  r5   r  CppVecKernelr4  r6  r:  	broadcastr   r   r   __getattr__)argskwargsargscalarsvectorsnew_argsnew_arg
scalar_opsscalar_funcr   funcr   s            rW   wrapperz6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper  sN     $!#UZZ'89"37

    $!#~63::  
  :w!H# -%cC+<=)#uzz:3==&)nnS%++&F&)ll3&D/9#x/H#))cC ,-  8})#/#9!6!66'3HQRL'A w%ahh=== (0  $ 'w?$+NN $ / 4 4 / 5 5 / 9 9$!	 **73 %% H    4V44 "'!=J")"DMM:3I3I$--3X#K '222&777@ s   AJ%J6A5JrU   )ra  rb  r   r   s   ` rW   wrapz%CppVecOverrides.__new__.<locals>.wrap  s    @8D NrV   r   )r  r  )	r   __new__varsrM  itemsr   rJ  setattr__func__)r   rX  kargsrc  namemethodr   r   s         @rW   rd  zCppVecOverrides.__new__  sx    ws#O	b !1779 	;LD&v{D1\Ad S G dD$9:	; rV   c                     |  d| S )NrA  rU   rB  s     rW   rE  zCppVecOverrides.add      Cs|rV   c                     |  d| S )NrH  rU   rB  s     rW   rI  zCppVecOverrides.sub  rm  rV   c                     |  d| S Nr   rU   rB  s     rW   rK  zCppVecOverrides.mul  rm  rV   c                     |  d| S r  rU   rB  s     rW   truedivzCppVecOverrides.truediv  rm  rV   c                     |  dS )Nz.abs()rU   r`  s    rW   ra  zCppVecOverrides.abs"      F|rV   c                     |  dS )Nz.sin()rU   r`  s    rW   rd  zCppVecOverrides.sin&  rt  rV   c                     |  dS )Nz.cos()rU   r`  s    rW   rf  zCppVecOverrides.cos*  rt  rV   c                     |  dS )Nz.exp()rU   r`  s    rW   rk  zCppVecOverrides.exp.  rt  rV   c                     |  dS )Nz.exp2()rU   r`  s    rW   rm  zCppVecOverrides.exp22      G}rV   c                     d|  d}|  d| S )Nr?  r  z	.exp() - rU   rV  vec_ones     rW   rp  zCppVecOverrides.expm16  s#     aS%IgY''rV   c                     |  dS )Nz.erf()rU   r`  s    rW   rs  zCppVecOverrides.erf<  rt  rV   c                     |  dS )Nz.erfc()rU   r`  s    rW   ru  zCppVecOverrides.erfc@  ry  rV   c                     |  dS )Nz	.erfinv()rU   r`  s    rW   rw  zCppVecOverrides.erfinvD      IrV   c                     |  dS )Nz.sqrt()rU   r`  s    rW   rz  zCppVecOverrides.sqrtH  ry  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )N( == r   r   r5   r  rU  rI   r   _get_mask_typer  s     rW   eqzCppVecOverrides.eqL  c    !((L111!^,,,ww"""(())!''231QCtA3a@@rV   c                    t        t        j                  t              sJ t        | t              sJ | j
                  t        j                  k(  rO|j
                  t        j                  k(  sJ t        t        j                  j                  | |f      \  }}| d| S | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  r  r   )r   r5   r  rU  rI   r   r   rk   rN   rQ  r  )rV  r  x_casty_casts       rW   nezCppVecOverrides.neS  s    !((L111!^,,,77ejj 77ejj(((1!((2B2BQFKNFFXT&**77&&&hh--agg67q4s!DDrV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   < r   r  r  s     rW   ltzCppVecOverrides.lt_  c    !((L111!^,,,ww"""(())!''231QCs1#Q??rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  z > r   r  r  s     rW   gtzCppVecOverrides.gtf  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr   <= r   r  r  s     rW   lezCppVecOverrides.lem  r  rV   c                     t        t        j                  t              sJ t        | t              sJ | j
                  J t        j                  j                  | j
                         d|  d| dS )Nr  z >= r   r  r  s     rW   gezCppVecOverrides.get  r  rV   c                     |  d| S Nr!  rU   r  s     rW   and_zCppVecOverrides.and_{  rm  rV   c                     |  dS )Nz.rsqrt()rU   r`  s    rW   r|  zCppVecOverrides.rsqrt      H~rV   c                     |  d| dS )Nz.pow(r   rU   rB  s     rW   r  zCppVecOverrides.pow  s    E!ArV   c                     |  dS )Nz.log()rU   r`  s    rW   r  zCppVecOverrides.log  rt  rV   c                     |  dS )Nz.round()rU   r`  s    rW   r  zCppVecOverrides.round  r  rV   c                     |  dS )Nz.floor()rU   r`  s    rW   r  zCppVecOverrides.floor  r  rV   c                     |  dS )Nz.ceil()rU   r`  s    rW   r  zCppVecOverrides.ceil  ry  rV   c                     |  dS )Nz.trunc()rU   r`  s    rW   r  zCppVecOverrides.trunc  r  rV   c                     |  d| dS )Nz.fmod(r   rU   rB  s     rW   r  zCppVecOverrides.fmod  s    F1#QrV   c                     |  dS )Nz	.lgamma()rU   r`  s    rW   r  zCppVecOverrides.lgamma  r  rV   c                     |  d| S r  rU   rB  s     rW   r  zCppVecOverrides.logical_and  rm  rV   c                     d|  S N~rU   r  s    rW   r  zCppVecOverrides.logical_not  r  rV   c                     |  d| S Nr&  rU   rB  s     rW   r  zCppVecOverrides.logical_or  rm  rV   c                     |  d| S Nr   rU   rB  s     rW   r  zCppVecOverrides.logical_xor  rm  rV   c                     |  d| S r  rU   rB  s     rW   r"  zCppVecOverrides.bitwise_and  rm  rV   c                     d|  S r  rU   r  s    rW   r$  zCppVecOverrides.bitwise_not  r  rV   c                     |  d| S r  rU   rB  s     rW   r'  zCppVecOverrides.bitwise_or  rm  rV   c                     |  d| S r  rU   rB  s     rW   r)  zCppVecOverrides.bitwise_xor  rm  rV   c                     |  d| S )Nr+  rU   rB  s     rW   r,  z"CppVecOverrides.bitwise_left_shift  r  rV   c                     |  d| S )Nr/  rU   rB  s     rW   r0  z#CppVecOverrides.bitwise_right_shift  r  rV   c                     t        t        j                  t              sJ t        j                  j	                  | |       S r   )r   r5   r  rU  load)rj  r   s     rW   	load_seedzCppVecOverrides.load_seed  s.    !((L111((--f-./rV   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r5   r  rU  r8   rH   r1  r   r  rand_functions       rW   r4  zCppVecOverrides.rand  s@    !((L111~7v=ST 	 FD-88rV   c                 |    t        t        j                  t              sJ t	               }d|  d}t        |||      S )Nzresult[offset_idx] = randn_cpu(r  r  r  s       rW   r6  zCppVecOverrides.randn  s;    !((L111~9$?UVFD-88rV   c                     t        t        j                  t              sJ t	               }d|  d| d| d}t        |||t        j                        S )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r  )r   r5   r  rU  r8   rH   r   rS  )r1  r   r8  r9  r  r  s         rW   r:  zCppVecOverrides.randint64  sS    !((L111~=dVCYZ]Y^^`ae`ffhiFD-EErV   c                 ~    | j                   |j                   k(  sJ d       |  dt        j                  | |       d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   rM  r  rB  s     rW   	remainderzCppVecOverrides.remainder  sK     GGqww	IH	ID11!Q78QC@@rV   c                     |  dS )Nz.tan()rU   r  s    rW   r  zCppVecOverrides.tan  rt  rV   c           	      F    d|  d}d|  d}d|  d}| d| d| d|  d| 	S )	Nr?  r  z)(2)z)(-2)z / ( + (r   z).exp()) - rU   )rC  r|  vec_twovec_minus_twos       rW   r  zCppVecOverrides.tanh  sO    aS%aS%#A3e,$witM?#aSG9UUrV   c                     |  dS )Nz.reciprocal()rU   r  s    rW   
reciprocalzCppVecOverrides.reciprocal  s    M""rV   c                     |  dS )Nz.atan()rU   r`  s    rW   r  zCppVecOverrides.atan  ry  rV   c                     |  dS )Nz.acos()rU   r`  s    rW   r  zCppVecOverrides.acos  ry  rV   c                     |  dS )Nz.asin()rU   r`  s    rW   r  zCppVecOverrides.asin  ry  rV   c                     |  dS )Nz.cosh()rU   r`  s    rW   r  zCppVecOverrides.cosh	  ry  rV   c                     |  dS )Nz.sinh()rU   r`  s    rW   r  zCppVecOverrides.sinh  ry  rV   c                     |  dS )Nz.log10()rU   r`  s    rW   r  zCppVecOverrides.log10  r  rV   c                     |  dS )Nz.log2()rU   r`  s    rW   r  zCppVecOverrides.log2  ry  rV   c                     |  d| dS )Nz.nextafter(r   rU   r  s     rW   r  zCppVecOverrides.nextafter  s    Ks!$$rV   c                     |  d| dS )Nz
.copysign(r   rU   rB  s     rW   r  zCppVecOverrides.copysign  s    Jqc##rV   c                     |  d| dS )Nz.atan2(r   rU   rB  s     rW   r  zCppVecOverrides.atan2!      GA3a  rV   c                     |  d| dS )Nz.hypot(r   rU   rB  s     rW   r  zCppVecOverrides.hypot%  r  rV   c           
      <    d|  d}d|  d}| d| d|  d| d|  d
S )	Nr?  r  z)(0.5)z * ((rA  z)/(rH  z)).log()rU   )rV  r|  vec_one_halfs      rW   r  zCppVecOverrides.atanh)  sE     aS%"1#V,uWIS3wis1#XNNrV   c           	      ,    d|  d}d|  d| d|  d|  d	S )Nr?  r  r  r  rA  rZ   z).sqrt()).log()rU   r{  s     rW   r  zCppVecOverrides.asinh0  s3     aS%1#T'#aS!O<<rV   c                     |  dS )Nz.acosh()rU   r`  s    rW   r  zCppVecOverrides.acosh6  r  rV   c                     t         j                  j                  }|dk(  ry|dk(  r|  dS |dk(  r|  d|  dS |	d|  d	|  d
S t        d|      )Nr  r  r  r  r~  r  r  zat::vec::clamp_min(r  r  r  r  r  s     rW   r  zCppVecOverrides.relu:  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rV   c                     d|  d|  d|  dS )Nr?  z)(1)/(decltype(z)(1) + z.neg().exp())rU   r`  s    rW   r<  zCppVecOverrides.sigmoidK  s    1#_QCwqcGGrV   c                     |  dS )Nz.neg()rU   r`  s    rW   rh  zCppVecOverrides.negO  rt  rV   c                    t        | j                        r)| j                  |j                  k(  sJ d       d|  d| dS t        d | |fD              sJ d|  d}t        j                  j                  |j                        dk  r,| ddt        j                  j                  z  dz
   d	| d
| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d
| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   F   K   | ]  }t        |j                          y wr   )r   r   )r   items     rW   r   z+CppVecOverrides.floordiv.<locals>.<genexpr>[  s     G'

3G   !r?  r6   ::blend<r\  (1), r  r  r  r  z(0))r  r  z	(0)) != (z(0)))z	::blendv(rH  r!  )r   r   r   r5   r  _get_raw_num_vectorstiling_factor)rC  rD  _tr  has_remis_negs         rW   r  zCppVecOverrides.floordivS  s1   !''"177"VUV",QCr!A66GAGGGGQCq!Bxx,,QWW59d(A)?)?$?1#D"ERt5QRPSSTUSA3<D!Cs$rd$/G!Ct9QCs2$e<FT4&4&B4uWISPQRRrV   c                     t         j                  j                  |j                        dk  r2d| d}| ddt         j                  j                  z  dz
   d| d| d}|  d| S )Nr6   r?  r   r  r\  r  r  )r5   r  r  r   r  )rC  rD  r  s      rW   r  zCppVecOverrides.truncdivf  sp     88((1A5QCq!B$hQXX%;%; ;q@AB4uQCqQACs|rV   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )Nr!  at::vec::minimum(r   r   r   r   rk   rN   r5   r  rQ  rC  rD  a_castb_casts       rW   r  zCppVecOverrides.minimumo  l    77ejj 77ejj(((1!((2B2BQFKNFFXS))&qcA3a00rV   c                     | j                   t        j                  k(  rO|j                   t        j                  k(  sJ t        t        j
                  j                  | |f      \  }}| d| S d|  d| dS )Nr&  at::vec::maximum(r   r   r  r  s       rW   r  zCppVecOverrides.maximumx  r  rV   c                     |  d|  S rp  rU   r  s    rW   squarezCppVecOverrides.square  rm  rV   c                    t        t        j                  t              sJ |j                  t
        j                  k(  rY|j                  t
        j                  k(  sJ t        t        j                  j                  | ||f      \  }}}d| d| d| d| d	S d| d| d| dt        j                  j                  | |j                         d	S )Nr?  
)::blendv(r   r   )
r   r5   r  rU  r   r   rk   rN   rQ  _get_mask_cast)rC  rD  r  blendv_ablendv_bblendv_cs         rW   r  zCppVecOverrides.where  s    !((L11177ejj 77ejj(((+?  1a),(Hh xj
8*Bxj8*TUVVqcA3b2ahh6M6MaQRQXQX6Y5ZZ[\\rV   c                 ~   t               }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|j                  d       |j                         5  |j                  d	| d
       |j                  d| d
       |j                  d       d d d        |j                  d       |S # 1 sw Y   xY w)Nr?  r>  r  r  r   r  r   r?  r@  r  rA  rB  rC  rD  )rV  r  vec_zeror|  blendv_lblendv_rs         rW   rG  zCppVecOverrides.sign  s    ~qc&aS%qcH:Ry8*CPQsRSTqcH:Ry1#S
RSTw[[] 	3NN\(156NN]8*A67NN12	3 	t	3 	3s   <B33B<c           
         |t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  f	v sJ t         d|        t        | t              sJ | j                  }t        j                  j!                  | ||      }t        j                  j"                  j%                  t        j                  j&                  |      }|j)                  d| |fd|i       |t         j                  t         j
                  fv r5|t         j                  k(  r"t        j                  j+                  | |||       |S )Nz does not support rM  r   )r   rk   float64rn   rS  rT  uint8int8int32rS  r   r   rI   r   r5   r  rN  rO  rP  rQ  rR  rU  )rV  r   r   use_compute_dtypesrX  rY  s         rW   rM  zCppVecOverrides.to_dtype  s   JJMMKKNNMMKKJJKKKK

 

 
	2 Z)%1
	2 

 !^,,,GG	xx))!UI>&&qxx'7'7>j1e*{I6NOU^^U]]33	U[[8PHH((IvuErV   c                 z    t         j                  j                  }|dk(  r|  d|  dS ||  dS t        d|      )Nr~  r  r  z.log1p()r  r  r  s     rW   r  zCppVecOverrides.log1p  sT    jj66*SQCt,,[S>! J3'R rV   c                 \
   t        t        j                  t              sJ t	               }t        j                  j
                  j                         }t        j                  j                  |       5 }|j                  d| d       t        j                  j                  |      5  |j                         5   |       }|j                  d| d       d d d        d d d        d d d        |j                  d       t        j                  j                  j                  |       j                  }| d}|j                  r|n$t        j                  j                  |       d| d}	t!        |t"        |         }
|t$        j&                  k(  r$t        j                  j)                          d|
 dn$t        j                  j                  |       d|
 d}t        t*              sJ |       |j                  rt	               }|j                  d	       t        j                  j                  |      5  |j                         5  |j                  d
| d       |j                         5  |j                  d| d       d d d        |j                  d       |j                         5  t        j                  j
                  j-                  t        j                  j                  |	      }t        j                  j
                  j-                  t        j                  j                  |      }t        |t*              sJ |       t        |t*              sJ |       ||_        ||_        |j                  dt        j                  j.                  j1                  |||       d       d d d        d d d        d d d        |j                  d       t        j                  j
                  j-                  t        j                  j                  |      }n|j                  rKt        j                  j
                  j-                  t        j                  j                  |  d|	 d|       }nJt        j                  j
                  j-                  t        j                  j                  |  d| d|
       }|j3                  d| |||fi        |S # 1 sw Y    xY w# 1 sw Y   xY w# 1 sw Y   
xY w# 1 sw Y   wxY w# 1 sw Y   \xY w# 1 sw Y   axY w# 1 sw Y   fxY w)Nr  r	  r
  r  rC  r  r   ::from([&]if (z.all_zero())elser  r  r  )r   r5   r  rU  r8   rO  r  r  r  r  r  rQ  r  r   rQ  _get_vec_typerO   rJ   r   rk   r  rI   rP  	overridesr  rR  )r  r  r  r  r   new_maskr   r   	body_codebody_code_vecr  other_code_vecbody_vec_varother_vec_varrY  s                  rW   r  zCppVecOverrides.masked  s%   !((L111~hhll!!#XX__T" 	4hNNU3%v./&&t, 4dkkm 4234 4	4
 	s	%e2J	 }} HH**512!I;a@ 	
 "%e)<=
 

" xx&&()A>HH**512!J<qA 	
 (N3=X=3??>DNN5!&&t, dkkm hZ|<=[[] @NNW^,<A#>?@v&[[] #$88<<#8#8((%$L %&HHLL$9$9((&%M &lNCQ\QC%m^DSmSD).L&*/M'NN!!(("4"4":":8\S`"a!bbcd , NN4 XX\\**  F ]]XX\\**  TF#m_C?O"PF XX\\**  TF#i[J<"HF
 	htUF(CRH}4 4 4 4	4 	48@ @    s   25S-'S 8SS S-8T!	&T/S:)T.C?T-T5T!SS  S*	%S--S7:T?TTTT	T!!T+c                 X   t        t        j                  t              sJ t        j                  j	                  |       }t        j                  j
                  t        j                  j                     }t        j                  j                  ||      }|dk(  rt        j                  | |      S |t        j                  j                  j                  t        j                  j                  t        |      t        |             }t        j                   ||      }t        |t"              r|j$                  }t        j                  j'                  ||      }n:t        j                  j)                  d ||t        j                  j                        }|j+                  d| |fi        |S )Nr   r  r  )r   r5   r  rU  r  itervars
tiling_idx_try_get_const_strider=  r  rO  rP  rQ  rF   r(   r3   rM  r4   rT  arange_load_or_store_non_contiguousrR  )rX  r   r   
tiling_varstrider  rT  rY  s           rW   r  zCppVecOverrides.index_expr	  s3   !((L111((.XX&&qxx':':;
//zBQ;**477((,,''  %,7LT7R ( C LLe,E%*XX__UF3FXX;;eUAHH$4$4F 	lT5M2>rV   c           
      :   d|  dd|  df}t        d |D              rt        d |D              S t        | j                     }t        j
                  j                  rt        j
                  j                  nt        j
                  j                  }t               }t        j
                  j                  j                         }t        j
                  j                  j                         }|j                  d| fi        |j                  d| fi        t        j
                  j                  | j                        }|dk(  rd	| d
nd| d| d
}|j                  |dk(  rd| dnd| d| d       |j                  | d| d       |j                  d       |j                         5  |j                  d| dt        j
                  j                   d       |j                  |  dt        |       d       |j                  dt        j
                  j                   d       |j                  d| dt        j
                  j                   d       |j                  dt        |       d       |j                         5  |j                  d       d d d        |j                  |dk(  r| dt        |       dn| d| dt        |       d       |j                  | d | d!t        |       d       d d d        |j                  d"       t        j
                  j                   j#                  |       ||f}	t%        ||	      D ],  \  }
}|t        j
                  j                  j&                  |
<   . ||fS # 1 sw Y   xY w# 1 sw Y   xY w)#Nr  r  r  c              3   h   K   | ]*  }|t         j                  j                  j                  v  , y wr   r  r  s     rW   r   z(CppVecOverrides.frexp.<locals>.<genexpr>#  r  r  c              3   j   K   | ]+  }t         j                  j                  j                  |    - y wr   r  r  s     rW   r   z(CppVecOverrides.frexp.<locals>.<genexpr>$  r  r  r  )rY  r6   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r  zat::vec::VectorizedN<int32_t, > r   r?  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r  z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;for (int i = 0; i < ; i++)z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(),  = z ::loadu(tmpbuf_mantissa.data(), z();)r   r   rJ   r   r5   r  	tail_sizer  r8   rO  r  rR  _get_num_vectorsr  r  rG   rQ  r  r  r  )rV  r  r   r   r  r  r  n_vec
mantissa_tr  r  r  s               rW   r  zCppVecOverrides.frexp   si   aS%s$'77
K
KKS
SSSagg&%&XX%7%7qxx!!QXX=S=S~88<<&&(88<<&&(!b9!b9))!''2 z #6(!,(5'; 	
 	z ,H:Q71%8*AF	

 	*Qxj23w[[] 	NN*6("QXX5K5K4LIV NNaS 6{47H6ILMNN3AHH4J4J3KK]^ NN*6("QXX5K5K4LL^_ NN1+d2C1DFKL V NNA: *[\ghl\m[nnpq z!B5'Ijkvw{k|j}}  A
 NN*C
|+KKX\L]K^^`a+	0 	u	%h'"%j("; 	4Iw,3AHHLLy)	4!!# 	 	s&   .CN
NA$NN	
NNc                     fd}|S )Nc                     |rJ t         j                  }t        |t              sJ t	               }|j                  d       | d   j                  }|j                  |      }|j                  r|j                  n|j                  }g }t        |   }j                  dv }	|	rdn|}
j                  dk(  rt        | d      n|
}
|j                         5  t        |       D ]  \  }}t        |t              r}|j                  sJ |j                  |k(  sJ |j                  d| d|j                   d	| d
       |j                  | d| dt!        |       d       |j#                  d| d       |j#                  |        |j                  d|
 d|j                   d        | }|j                  dt!        |       d       |j                         5  |j                  d| d
       d d d        |	r|j                  rJ d}d| d| d}n#dt!        |       }|dk(  rd|
 d}n	d|
 d| d}|j                  d| d| d       d d d        |j                  d       |S # 1 sw Y   {xY w# 1 sw Y   (xY w) Nr?  r   )r  r  r  rk   r^  r  r   z> tmpbufr  z.store(tmpbufz	.data(), r  tmpbufz[i]z> tmpbuf_out;r  r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r6   r  z>::loaduz at::vec::VectorizedN<r
  r  rC  )r5   r  r   rU  r8   r  r   r#  r"  r  rJ   r   r  	enumeraterI   rQ  rG   r  )rX  rY  r  r  	vec_dtyper$  r   scalar_argsr   output_maskoctypeargidxrZ  res	load_argsload_fnr`  s                   rW   r   z(CppVecOverrides.scalarize.<locals>.inner[  s   :XXFfl333>DNN7#QI++I6E'-'7'76##V=Q=QDK!),F%.. 3 K
  +VF  ((,>> T"X& 
  B#,T? 0KFC!#~6"zz)z"yyI5556vhbAUAU@VV^_e^ffgh "e=	+dBSATTVW $**VF83+?@#**3/0 .vhb9M9M8Nm\ ";/!5k$6G5HOP[[] >NN%5cU!#<=>%//// 3I 1&5'IG"5k$6G5H IIz$8"I$:6("UG8"T	9+R@A?B@ NN4 K> >'B Bs&   DI6I*0AI6*I3	/I66I?rU   )r   r`  r   s    ` rW   	scalarizezCppVecOverrides.scalarizeY  s    7	r rV   c                    t        t              j                         D ]d  \  }}t        |dd       t        k(  s|t        t
              vs-| j                  |j                        }||_        t        | |t	        |             f y )Nr   )
re  r=  rf  r   rJ  rM  r5  rh  r   rg  )r   rj  rk  ra  s       rW   _initialize_scalarizez%CppVecOverrides._initialize_scalarize  st     .446 	7LD&v{D1\AdRVS G }}V__5 $T<#56	7rV   rH  )Wr   r!  r"  rI  rd  rJ  rE  rI  rK  rr  ra  rd  rf  rk  rm  rp  rs  ru  rw  rz  r  r  r  r  r  r  r  r|  r  r  r  r  r  r  r  r  r  r  r  r  r"  r$  r'  r)  r,  r0  r  r4  r6  r:  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r<  rh  r  r  r  r  r  r  rG  rM  r  r  r  r  r#  r5  r7  r$  r%  s   @rW   rM  rM    sX   8[z                   ( (
         A A 	E 	E @ @ @ @ A A A A                                           0 0 9 9 9 9 F F A A   V V # #               % % $ $ ! ! ! ! O O = =
      H H   S S$   1 1 1 1   	] 	]    * 	 	 D DL  , 6" 6"p : :x 7 7rV   rM  cppvecc                       e Zd Zed        Zy)CppTile2DOverridesc                     t        t        j                  t              sJ t        j                  j	                  |       } t
        j                  | |      S r   )r   r5   r  CppTile2DKerneltransform_indexingrM  r  )rX  r   s     rW   r  zCppTile2DOverrides.index_expr  s=    !((O444xx**40))$66rV   N)r   r!  r"  rJ  r  rU   rV   rW   r:  r:    s    7 7rV   r:  c                       e Zd ZeZeZdZdZ fdZ	e
edfdZdefdZd Zej"                  d	        Z	 d&d
ej(                  fdZd
ej(                  defdZd
ej(                  dej.                  fdZd
ej(                  dej.                  fdZd Zdej(                  dej(                  dedefdZded
ej(                  fdZd'dZd Zd Z d Z!d Z"d Z#d Z$e%defd        Z&d! Z'ej"                  d"        Z(d# Z)d$ Z*d% Z+ xZ,S )(	CppKernelr  r  c                 0   t         |   |       d | _        g | _        g | _        d | _        t               | _        t               | _        t               | _	        t               | _
        t               | _        t               | _        d| _        t               | _        t        | j                   | j"                  d      | _        t        | j                   | j"                  d      | _        t               | _        t               | _        || _        i | _        y )NFtmp_acc)name_prefixwrecps)r   r   call_rangesrangesr  reduction_depthr?   reduction_prefixreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixr:   newvar_prefixsuffixreduction_cseweight_recps_csepreloads
poststoresnum_threadsreduction_omp_dec)r   rX  rU  r   s      rW   r   zCppKernel.__init__  s    =A(*,.# . 0 . 0)7)9&)7)9&$2$4!&4&6#!-;-=* !3!3T[[iX #!
 '((*&=?rV   Nc                 $   t         j                  j                  r'| j                  s| j                  j	                  d       | d}t         j                  j                  rdn	t               }	| d}
|
 d|	 d}	 d|
 d| d	|	 d
}|
 d|	 d}|j                  d|	 dd      }| j                  j	                  | d| d |||       d       | j                  j	                  t        j                         r| dn| d| d       | j                  j                  d|	 ddd| d |||       ddg       | j                  j                  | d| dg       | j                  j                  d|	 ddd| d |||||       ddg       y )Nz(int max_threads = omp_get_max_threads();_localmax_threads_arr[]r  z = std::make_unique<z[]>(r   z[tid]r   r!  r  zfor (int tid = 0; tid < z; tid++){z    r~   r   )r   r  dynamic_threadsrI  r  r,   r   rK  r   
is_msvc_cl
writelinesrL  rJ  )r   r   acc_typer   r   reduction_combine_fnreduction_init_fn welford_weight_reciprocal_vec_fn	acc_localrU  acc_per_thread_var_nameacc_per_threadacc_per_thread_unique_ptr_declacc_per_thread_vla_declacc_local_in_arrays                  rW   _gen_parallel_reduction_buffersz)CppKernel._gen_parallel_reduction_buffers  s    ::%%d.L.L**44: e6N	#ZZ77M=Q=S 	 &)E,34Ak]!D	 ,11H0II]^f]ggklwkxxy)z&%<$=Q{m1"M+33a}A4FP!!++j)C(9.%(P'QQRS	
 	&&00%%' ..a0:Q67q9	

 	&&11*;-x@)*#.?PU.V-WWXY		
 	##..%&c)A6	

 	&&11*;-x@se33NCI[glmnnop		
rV   linec                 .    t        j                  d|      S )Nztmp_acc[0-9]+researchr   rm  s     rW   get_reduction_var_patternz#CppKernel.get_reduction_var_pattern  s    yy$//rV   c                    t        | j                  j                        D ]g  \  }}t        |t              s| j                  |      }|s+|j                  d      }|j                  || d      | j                  j                  |<   i y )Nr   rX  )r,  stores_linesr   ro   rs  groupr   )r   irm  mvar_names        rW   %update_stores_with_parallel_reductionz/CppKernel.update_stores_with_parallel_reduction	  sv     !3!34 	XGAt$$2248 wwqzH,0LLhZvCV,WDKK&&q)	XrV   c              #     K   | j                   }|rYt        j                  ||      }t        |t              r3|j
                  }t        |t              sJ t        j                  |_	        || _         	 | || _         y# || _         w xY ww)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr3   r  r   r4   rT  rI   r   rk   r   )r   r  priors      rW   r  zCppKernel.masked  ss      88D%(D$)zz!$777 #ZZ
	$J#DOeDOs   A/B
2A> 6B
>	BB
r   c                 P    | j                   |   }|||z  |z   i}t        ||      }|S r   )r  r1   )r   r   scaleitervar_idxr   r   r   r   s           rW   scale_index_with_offsetz!CppKernel.scale_index_with_offset$  s7     mmK(C%K&01uk2	rV   r   c                 6    t        | j                  |            S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rF   r  r   r   s     rW   index_to_strzCppKernel.index_to_str,  s    
 T))%011rV   itervarc                 D     t         fd|j                  D              S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c              3   (  K   | ]  }|j                   j                  j                  v ret        j                  j                  |j                      t              r4j                  j                  |j                      j                          y wr   )rj  rO  varname_mapr   rI   
depends_on)r   sr  r   s     rW   r   z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>7  sj      
vv---488//7H HH  (33G<
s   BB)rf   free_symbolsr   r   r  s   ` `rW   index_indirect_depends_onz#CppKernel.index_indirect_depends_on3  s%      
''
 
 	
rV   c                 F    ||j                   v xs | j                  ||      S r   )r  r  r  s      rW   index_depends_onzCppKernel.index_depends_on>  s,    %,,, 
0N0N71
 	
rV   c                 T    t        t        | j                  | j                              S r   )dictr  r  rE  r   s    rW   
var_rangeszCppKernel.var_rangesC  s    Ct{{344rV   rX  r   lowerupperc                    |s|sy t        |t        j                        }|rIt        j                  |t
        j                        j                  }t        j                  j                  }nt        j                  j                  }	 | j                  t        j                  _
        t        j                  |t
        j                        j                  }|t        j                  _
        | j                  }|r.t        j                  j                  | j                  |            nd }	| j                  ||rdnd |	| j                        }
| j                   j#                  ||
d       y # |t        j                  _
        w xY w)N0F)
assignment)r   r   TMPr3   r  r   rS  rT  r5   r  rQ  loadssexprr  indirect_assertr}  rO  rP  )r   rX  r   r  r  indirectrY  bufferprior_computesize_strrm  s              rW   check_boundszCppKernel.check_boundsF  s
    &tTXX6^^D%++6<<FXX%%F HH,,M1#':: ekk:@@#0 ZZFAF188>>$"6"6t"<=D##5CdHdoo
 	&$59 $1 s   AE' 'E>rj  c                     | j                   j                  |      }| j                  |      }| dt        |       d}| j                  j                  | j                  |      }|j                  d| ||fi        |S )Nr[  r\  r  )rX  inputr  rG   rO  rP  r  rR  )r   rj  r   r   rm  rY  s         rW   r  zCppKernel.loadf  st    iiood#$$U+aE*+1-""4::t4ftT5&92>rV   c                    d|v sJ | j                   j                  |      }| j                  |      }|| dt        |       d| d}n|dk(  rt        j
                  j                  s$| j                  dk(  r| dt        |       d| d}nSt        j                  j                  |      }dt        |    d	| d
}d| dt        |       d| d}nt        d|       | j                  j                  t        ||             y )Nbufr[  ] = r  
atomic_addr6   z] += zstatic_cast<r\  r   zatomic_add(&z], r  store mode=)rX  outputr  rG   r   r  r_  rU  r5   graph	get_dtyperJ   NotImplementedErrorru  r  r=   )r   rj  r   rT  moder   rm  r   s           rW   storezCppKernel.storen  s   }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA))$/&|E':&;2eWAF%cU!K,>+?s5'L%D6&:;;l467rV   c           
         |dv }|||f}|| j                   j                  v r| j                   j                  |   S | j                   j                  | j                  d| d      }d| _        |r|n|}t        ||      }	| j                  j                  |	 d| dt        ||       d       | j                  J | j                  | j                     }
t        | j                  d	z   t        | j                              D ]$  }|
| j                  |   z  | j                  |   z   }
& | j                  j                  | dt        ||||
       d       | j!                  ||	||       t#        ||      }|| j                   j                  |<   |S )
N>   ra   r`   
reduction FwriteTr   r!  r  r6   )rQ  reduction_cacherP  r  rM  r   rG  r  r   rF  r  r	  r   rE  ru  r   rl  r   )r   r   r   r   rT  argmax_or_argminreduction_keyr   
init_dtyperb  r   rx  r   s                rW   	reductionzCppKernel.reduction  s   )-AA!>58D..>>>%%55mDD  ))JJ*]O4E * 
 !"2Y
%njA''j#c."L!MQO	
 ##///d223t++a/T]]1CD 	>ADKKN*T]]1-==E	>e3(eUKLAN	
 	,,S(NJW">37<B**=9rV   c                     | j                  |      }| j                  j                  |      }| j                  j	                  t        || dt        |       d| d             y )Nr[  r  r  )r  rX  r  rH  r  r=   rG   )r   rj  r   rT  r   s        rW   store_reductionzCppKernel.store_reduction  s]    $$U+iit$''#aE(:';4waHI	
rV   c                    | j                   ri| j                   t        |      t        |      z   k(  s+J | j                    dt        |       dt        |              | j                  t        |      k(  sJ t        |      t        |      z   | _         | j                   D cg c]  }| j	                  |       c}| _        t        t        | j
                              D cg c]  }t        t        j                  |       c}| _
        t        |      | _        | j                  d | j                   | j                  | j                  d  fS c c}w c c}w )Nr  rA  )rD  r   rF  r   r  rE  r	  r/   r   XBLOCKr  )r   lengthsreduction_lengthsrV  ns        rW   
set_rangeszCppKernel.set_ranges  s<   ##uW~!9 (  V""#4g'7s5AR;S:TUV  ''3w<777$W~6G0HHD<@<L<LMq4//2MDK s4;;/0 /t{{A>DM $'w<D MM0D001MM$..01
 	
 Ns    E !Ec                 ~    t         j                  j                  j                  t	        | j
                        d      S )N    fallback)r5   r  sizevars	size_hintr0   rD  r   s    rW   r  zCppKernel.size_hint  s4    ww))$**+d * 
 	
rV   c                    t               | j                  J j                         }t        d |D              }|rIt	        |      dk(  sJ t        |d   t              sJ |d   j                  j                               }n | j                  j                               }t        j                         5 }|rDj                         rj                          nj                         j                  |       n4dkD  r/j                         r|j!                  j#                                dt$        ffdfdddddt&        t$           ffd	dt$        ffd
|j!                  j#                                j(                  r6|r t        t*        j,                  t.              rt*        j,                  j0                  rt*        j,                  j0                  }|j3                         D ]  }	t5        |	j7                         j8                  D 
cg c]  }
| j;                  |
       c}
      }t<        |	j7                         j>                     }d| dtA        |       d}|	jC                         }jE                  d| d| d| d       jE                  | d| d| d         j(                         n jF                         d d d        y c c}
w # 1 sw Y   y xY w)Nc              3   <   K   | ]  }t        |t                y wr   )r   r  )r   r  s     rW   r   z/CppKernel.codegen_loops_impl.<locals>.<genexpr>  s      $
9?Jv34$
   r6   r   loopc                     d }| j                         }t        |      dk(  sJ t        |d   t              s ||       r|d   j	                           |d          y )Nc                 V    | j                         }|j                  xr |j                  S r   )get_rootrM  parallel)r  r  s     rW   is_parallel_reductionzTCppKernel.codegen_loops_impl.<locals>.gen_loop_kernel.<locals>.is_parallel_reduction  s"    ==?D,,>>rV   r6   r   )get_kernelsr   r   r  r{  )r  r  kernels
gen_kernels      rW   gen_loop_kernelz5CppKernel.codegen_loops_impl.<locals>.gen_loop_kernel  s`    ? **,7|q(((!AJ 4+D1AJDDF71:&rV   c                 :   t        | t              r| j                  D ]p  }|j                  r |j                  |j                         -t	        j
                         5 }|j                  j                                 |       d d d        r y t	        j
                         5 }| sJ t        | d      rKj                  | j                         | j                         |j                  j                                j                  | j                         j                  | j                         j                  | j                         d d d        t        | d      rj                  | j                         y y # 1 sw Y   pxY w# 1 sw Y   ?xY w)Ncodegen_inner_loops)r   r  r   rM  
contextlib	ExitStackenter_contextr  hasattrr  rS  r  r  rQ  ru  rT  )r  r  stackr  r  	gen_loopss      rW   r  z0CppKernel.codegen_loops_impl.<locals>.gen_kernel  s?   f&:; & 	6::%djj$2C2CD!+!5!5!7 65 !& 3 3DKKM B / 56 6		6 $--/ 35%v"6+@A KK8"66t<!//>FLL1FNN3FMM23 v'<=F$5$56 >!6 63 3s   (F&B-FF	Fc                 f   |dv sJ | D ]  }|j                         D ]  }|dk(  r|j                  |j                  fc c S |dk(  r-|j                  }|j                  r|j
                  |z   }|c c S |j                  }|j                  r||j                  z   }n||j                  z   }|c c S   y )N)prefixrP  localr  rP  )	r  rK  rL  rH  r  rJ  rG  rI  rN  )loopsr  r  r  rP  r  s         rW   get_reduction_code_bufferz?CppKernel.codegen_loops_impl.<locals>.get_reduction_code_buffer  s    !>>>>! *D"&"2"2"4 *!W, & ; ; & = =$  $x/%+%<%<F#}})/)I)IF)R#)M%+%<%<F#}})/&2R2R)R)/&2V2V)V#)M#**rV   r  c                    t        j                         5 }d x}}| r| d   }|j                  r<|s: 	|       }|r|j                  j	                                j                  |       
j                         r@|j                  r4 	| d      \  }}j                         |r|sJ j                  |       | D ]
  } |        | rj| d   }
j                         r/|j                  r#|rj                  |       j                          |j                  r|sj                   	| d             d d d        y # 1 sw Y   y xY w)Nr   r  rP  )	r  r  rM  r  r  r  is_reduction_onlyr  close)r  in_reductionstack_outerrK  rL  r  rG  r  gen_loopr  r  threadsworksharings          rW   r  z/CppKernel.codegen_loops_impl.<locals>.gen_loops  s=   ))+ T{DHH(+A$Qx,,\/H/O,/ + 9 9$++- H KK(89$668T]] !:% I 4 6'0093'= ='= $,@ A % ' ' $Qx$668T]]5 $,B C'--/,,\ KK(A%(RS;T T Ts   DD>>Ec                 T   t        j                         5 }| j                         }|
	 d d d        y j                  |       |j	                  j                                | j                  r | j                  | j                         n |        d d d        y # 1 sw Y   y xY wr   )r  r  linesra  r  r  r   rM  )r  r  
loop_linesr  r  r  s      rW   r  z.CppKernel.codegen_loops_impl.<locals>.gen_loop4  s    ))+ 
.u!%J!)
. 
. OOJ/''6zz!$**d.?.?@'-
. 
. 
.s   BA"BB'zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r!  r  z* z = buf_z.get();r  F)$r,   rD  r  rf   r   r   r  decide_parallel_depthmax_parallel_depthr  r  r  r  r  mark_parallelsingler  r  r  r   r  r5   local_buffer_contextrL   local_buffersvaluesr0   
get_layoutr   r  rJ   r   rF   get_namer  r  )r   r  r  r  r  has_outer_loop_kernel	par_depthr  r  local_buffersize_vallocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namer  r  r  r  r  r  s    ```           @@@@@@rW   codegen_loops_implzCppKernel.codegen_loops_impl  s   &(+++'') # $
CJ$
 !
 !w<1$$$gaj*>???
88,,.I 22,,.I !!# Q	-u..0%%'((1''	21%%'''6'i '72*,Ti T T@.y . .~~)"1#9#9;MN..<< %&$:$:$H$HM(5(<(<(> )6 1=0G0G0I0N0N$, !% 4 4X >* +7|7N7N7P7V7V*W%66GuUSaMbLccd#e,8,A,A,C)..?yIZH[[^_g^hhij ./r2C1DGL]K^^ef" )..)9++,cQ	- Q	-BCQ	- Q	-s    E/K>2K9
B&K>9K>>Lc                 T    t         j                  |       }| j                  |||       y r   )LoopNestWithSplitbuildr  )r   r  r  r  s       rW   codegen_loopszCppKernel.codegen_loops_  s$    %++D1		4=rV   c                 :    t         j                  j                  ryy)NAOTI_TORCH_CHECKTORCH_CHECK)r5   r  aot_moder   s    rW   assert_functionzCppKernel.assert_functionc  s    77 & rV   c                    | j                   J | j                   d | }| j                         }d}d}|D ]m  }t        j                  j                  j                  |d      }|d|z  k\  s||k(  r n3||z  t
        j                  j                  k  r n|dz  }||z  }||z  }o t
        j                  j                  r|dk(  rt        |      dkD  rd}|S )Nr6   r   r  r  r   )
rD  r  r5   r  r  r   r  min_chunk_sizer_  r   )	r   r  r  rE  seqpardepthrX  hints	            rW   r  zCppKernel.decide_parallel_depthl  s    +++!!"5#56nn 		D77##--dT-BDa'k!SG^g~

 9 99QJE4KC4KC		 ::%%%1*VqErV   c              #     K   | j                   | j                  | j                  | j                  f}t	               | _         t	               | _        t	               | _        | j                  j                         | _        d  | j                  j                  | j                          | j                  j                  | j                         | j                  j                  | j                         |\  | _         | _        | _        | _        y wr   )r  rQ  ru  rO  r?   clonerH  r  )r   r~  s     rW   write_to_suffixzCppKernel.write_to_suffix  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     t        |i |S r   )rI   )r   rX  rY  s      rW   create_cse_varzCppKernel.create_cse_var  s    t.v..rV   c                 "    dt         |    d| dS )Nr[  r\  r   )rJ   )r   srcr   r   s       rW   rN  zCppKernel.get_to_dtype_expr  s    |E232cU!<<rV   c                 \    | j                  |||      }|| j                  j                  |<   y r   )rN  rO  r  )r   dst	dst_dtyper  r   rX  s         rW   rU  zCppKernel.cache_dtype_convert  s(    %%c9i@"trV   )r6   r   r   )-r   r!  r"  r=  r  rF   r  rO  rP  r   r   r   rl  ro   rs  r{  r  contextmanagerr  r   rK  r  r  r   r  r  r  rk   r  r  r  r  r  r  r  r  r  propertyr  r  r  r  rN  rU  r$  r%  s   @rW   r?  r?    s   IEMF@: /()-:
x0c 0X $ $& BCZZ2%** 2 2	
uzz 	
ELL 	

ejj 
5<< 

5:jj: jj: 	:
 :@ UZZ 8$6

&

c-J> ! ! !. 
B 
B/=#rV   r?  c                   V    e Zd ZeZ	 d+ fd	Zdej                  dej                  fdZ	de
j                  defdZde
j                  defd	Zde
j                  defd
Ze
j                  fde
j                  defdZdede
j                  defdZdefdZ	 d+dedej                  de
j                  dee   fdZ	 	 	 d,dee   dej                  de
j                  dee   deeeef      dedee   fdZdedej                  f fdZ	 d-deeef   dedej                  de
j                  def
dZd+dZd Zd Z dedefd Z!ded!ej                  defd"Z"d# Z#d$ Z$d+d%Z%ddde
jL                  fdeej                     d&ee   d'ee
j                     fd(Z'd+ fd)	Z( fd*Z) xZ*S ).rU  Nc                     t         |   ||       t        j                         | _        | j                  sJ |dkD  sJ d       || _        || _        || _        |r|| _        y || _        y )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r   r   r   pick_vec_isavec_isar  r  r"  	num_elems)r   rX  rU  r  r  r"  r   s         rW   r   zCppVecKernel.__init__  si     	{+"//1|||q T"TT *$"&/]rV   r   r  c                       j                  ||      ry  fd|j                  D        D ]"  }t        |t              sJ |j                  s" y  t        || j                        }|j                  r|S d S )Nc              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr   r   r   r  rO  r  rj  r   r  r   s     rW   r   z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>  s:      
a* HH  (
   AA
)r  r  r   rI   rQ  r   r  rR  )r   r   r  indirect_varr  s   `    rW   r  z"CppVecKernel._try_get_const_stride  s|    ))%9
''
 	L
 lN;;;""	 %UGT5G5GH))v3t3rV   r   r   c                     t        j                  | j                  |j                  z  dz  | j                  j                         z        }|dk\  sJ |S )N   r6   )mathr  r  itemsizer  	bit_widthr   r   num_vectorss      rW   r#  zCppVecKernel._get_num_vectors  sO    ii/!3dll6L6L6NN
 arV   c                 p    | j                   |j                  z  dz  | j                  j                         z  S )Nr'  )r  r)  r  r*  )r   r   s     rW   r  z!CppVecKernel._get_raw_num_vectors  s0     !!ENN2Q69O9O9QQQrV   c                 h    | j                  |      }|dk(  rdt        |    dS dt        |    d| dS )Nr6   r  r   r  r+  )r#  rJ   r+  s      rW   r  zCppVecKernel._get_vec_type  sJ    ++E2!),u*=)>a@@*<+>*?qQOOrV   c                 l    |t         j                  k(  ry| j                  |      }dt        |    d| dS )NrR   r*  r+  r   )r   rk   r#  rJ   r+  s      rW   r  zCppVecKernel._get_mask_type  s<    EJJ++E2"<#6"7qQGGrV   r  c                     |j                   t        j                  k(  sJ t        |             | j	                  |      }| dt
        |    d| dS )Nz.template cast<r+  r   )r   r   rk   reprr#  rJ   )r   r  r   r,  s       rW   r  zCppVecKernel._get_mask_cast  sP    zzUZZ'3d3'++E2|E':&;1[MMMrV   rm  c                 .    t        j                  d|      S )Nztmp_acc[0-9]+_vecro  rr  s     rW   rs  z&CppVecKernel.get_reduction_var_pattern  s    yy,d33rV   r   	load_maskc                    t         |   }| j                  |      }d}|rS|j                  s&| j                  t        j
                         d| d}n!| j                  |t        j
                         }|dk7  r| dt        |       n|}|t        j                  k(  r| j                          d| d}	|	S |r| d| d| d| dn,| j                  |       d	| d
t        | j                         d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        Nr  r   r   rA  z.template loadu<r+  r\  z::loadu(r   )rJ   r#  rQ  r  r   rn   r  rG   rk   r  r  )
r   r   r   r   r3  cpp_typer,  load_mask_strloadbufrm  s
             rW   _get_vec_load_linezCppVecKernel._get_vec_load_line  s      &++E2###'#6#6u{{#C"DGI;VW X#'#6#6y%++#N"O5:aZSE[/01SEJJ))+,GG9A>D  ! !/!1(1[MG9TUV**512(7)2kRVR`R`FaEbbcd 
 rV   Fr  store_value
accu_storec                 Z    |r	|J d       |r|sJ  j                   dt        j                  dt        f fddt        j                  dt        f fddt        dt        f fd}t               }|j                  d	       |j                         5   |      }	 |      }
d
t        |    d|
 d}|j                  |       |r |j                  | dt        |	       d       t         j                   j                      d      }i } fd|j                  D        D ]4  }t        |t              sJ |j                  s" ||      }| d| d||<   6  j!                  | j                  |      }d} j"                  l|rJ d       t         j"                  t              sJ  j"                          j"                  j                  r j"                   d| d}n j"                   d}t%        j&                         r|j                  d j(                          n|j                  d j(                          |j                  d| d| dt         j*                         dz   | dz          |j                         5  t-        j.                         5 }t        |      }|D ]#  }t1        j2                  d| z   dz   ||   |      }% || d| dn| }|r4|j                  d | d       |j5                  |j                                |r!|rd!nd"}|j                  | d#| d$| d%       n|j                  d&| d'| d(       ddd       ddd       |s( j7                  d)d*|      }|j                  d+| d(       ddd       |j                  d,       |r#|j                  d(       j9                  |       y j:                  j=                  |      }t        |t              sJ d-|_        |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w).a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr   r   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S N   )r)  r  r   r   s    rW   get_result_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size"	  s1    ~~!~~enn)<==~~%rV   c                 r    | j                   dk  rj                  d| j                   z  z  S j                  S r=  )r)  r  r?  s    rW   get_tiling_sizezCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size(	  s5    ~~!))Q%..-@AA)))rV   vec_varc                 L   | j                   sJ t               }|j                  d       |j                         5  | j                  }|J |t
        j                  k(  rt
        j                  } |      } 	|      }|j                  dt        |    d| d       |  dt        |       d}|j                  |       |j                  d       d d d        |j                  d       
j                  j                  |      }t        |t              sJ |S # 1 sw Y   JxY w)	Nr  r  r   r  r  r  zreturn tmpbuf;rC  )rQ  r8   r  r  r   r   rk   rn   rJ   rG   rO  rP  r   rI   )rC  r  r-  result_sizetiling_sizerm  rY  r  r@  rB  r   s          rW   vec_to_arrayz@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array.	  s   >>!>>DNN5! 1#MM	 ,,,

* %I-i8-i8.|I/F.Gr+V_` ""8[9Q8RRTUt$/01 NN4 XX&&vt4Ffn555M!1 1s   BDD#r  r  r   r  r  r  r  c              3      K   | ]A  }t        |t        j                        r%j                  j                  |j
                      C y wr   r"  r#  s     rW   r   z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>U	  s:      !!!TXX. $$QVV,!r$  r[  r\  r  r   zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r  ; ++)z\br  +==r   z tmpbuf[];ztmpbuf[r  r  ztmpbuf.data()r   r
  rC  T)r  r   r   rm   rI   r8   r  r  rJ   rG   r.   r  r  r  r   rQ  r  r}  r   is_gccr  r  r  r  rp  rI  r  r8  r  rO  rP  )r   r   r   r   r  r9  r:  rG  r  rE  rF  result_declareitervar_innerreplacementsr%  	array_varr3  r  index_crhsr   	load_linerY  r@  rB  s   `   `                  @@rW   r  z*CppVecKernel._load_or_store_non_contiguous	  s[   2 #/O3OO1;>ZZF	&5;; 	&3 	&	*5;; 	*3 	*	. 	^ 	 	, ~u[[] ?	7)%0K)%0K*<+>*?r+iX  NN>*"m#9+k:R9SSUV /==12&9M L!++! Q
 ",???&& ,\ :I4=;aa1PL.Q 004??= 1 E I*&I(II!$//>BSDOOSB??))#'??"3;}oQ OI#'??"35 9I!!#!4T5G5G4HIJ1C1C0DEFNN]O62"O3{4>>'B&C2FG"O3'(
  H
 4 4 6 H%%e,$0 L ff<.1E9$\2G .1_Qwiq)WINNT)A#67''6*4$#KNNcU!K=r#RSNNW]O4uA#FG!H H"  33OQN	156?	7@ 	tNN3MM$XX&&vt4Ffn555 FMM;H H H HY?	7 ?	7sE   B1P!EP!P(B2P	P"2P!	PPP	P!!P*rj  c                 6   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| j                  | j                     }| j                  ||      }|dk(  rt        	| )  ||      S |dk(  rE| j                  |||| j                        }| j                  j                  | j                  |      }n| j!                  |||      }t#        |t$              sJ |j'                  d| ||fi        d|_        |S )Nr   r6   r  T)rX  r  r  r5   r  r  r  r  r  r   r  r8  r}  rO  rP  r  r  r   rI   rR  rQ  )
r   rj  r   r   r   r  r  rm  rY  r   s
            rW   r  zCppVecKernel.load	  s    iiood#$$U+!!$']]4??3
++E:>Q;7<e,,q[**3udooNDXX&&tzz48F77UEJF&.111ftT5&92>rV   rT  c           	         t        |t              s#t        |t              r|j                  sJ |       | j                  | j
                     }| dt        |       }| j                  ||      }t               }	|dk(  rg|t        j                  k(  r%| j                  |	j                  | d| d       |	S |	j                  | d| dt        | j                         d       |	S | j                  ||||	||       |	S )a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        rA  r6   .store(r  r   )r  r9  r:  )r   ro   rI   rQ  r  r  rG   r  r?   r   rn   r"  r  r  r  )
r   rT  r   r   r   r:  r  var_exprr  r  s
             rW   _get_store_linezCppVecKernel._get_store_line	  s   " %%un-%,,		 
 ]]4??3
U#k%012++E:>Q;#(>%z<=  gWXJbT^^1L0MRP  ..UE$Ej /  rV   c                    dv sJ t        |t              sJ |       |j                  s| j                  |      }| j                  j                        }| j                  |      }t        j                  j                        }|B| j                  ||||      }| j                  j                  |j                  fd             y |dk(  rt        j                  j                   sT| j"                  dk(  rE| j                  | |||d      }| j                  j                  |j                  fd             y | j%                  |      }| j%                  t&        j(                        }	t*        |   }
t-        j.                  |t&        j(                        j0                  }|j                  sJ d|
 d	|	 d	| d
| d	| d	| d}| j                  j3                  t5        |             y t7        d|       )Nr  c                     t        |       S r   r=   rV  rj  s    rW   <lambda>z$CppVecKernel.store.<locals>.<lambda>	  s    ,tQ2G rV   r  r6   T)r:  c                     t        |       S r   r`  ra  s    rW   rb  z$CppVecKernel.store.<locals>.<lambda>	  s    l46K rV   zatomic_add_vec<r   r\  r  r  )r   rI   rQ  rV  rX  r  r  r5   r  r  r]  ru  r  mapr   r  r_  rU  r#  r   rS  rJ   r3   r  rT  r  r=   r  )r   rj  r   rT  r  r   r   r  n_srcn_idxr   rm  s    `          rW   r  zCppVecKernel.store	  s   }}%07%70||NN5)Eiit$$$U+!!$'<''sE5ADKKtxx(GHI\!::--$2B2Ba2G++g# ,  ""488,K#LM--e4--ekk:%e,uekk:@@||#|(5'E7"SEE7RTUZT[[]^%%l4&>?%D6&:;;rV   c           
         |t         v sJ |dv }| j                  | j                  k\  }|r|n|}t        |t              sJ |       |j
                  s| j                  |      }|||f}|| j                  j                  v r| j                  j                  |   S d}	|	 dt        |    d}
t        ||      }| j                  ||      }| j                  j                  | j                  d| d      }| d}d	| _        | j                  j!                  | d
| dt#        ||       d       | j                  j!                  | d
| d| j%                  ||       d       |dk(  r| j                  J | j                  j!                  | d| d| j%                  ||       d       t'        j(                  d | j*                  | j                  d        }| j                  | j                  k\  r| j,                  nd}t/        ||      | _        | j0                  | j2                  j                  vr| j2                  j                  | j4                  d| j0                   d      | _        | j6                  | j2                  j                  | j0                  <   | j8                  j!                  | j;                  |             t<        j>                  j@                  rdn	tC               }| jD                  j!                  | j;                  ||             n(| j2                  j                  | j0                     | _        | jF                  rd| n|}| jH                  j!                  | d| jK                  |||d	       d       n| j                  J | jL                  | j                     }tO        | j                  dz   tQ        | jL                              D ]$  }|| j*                  |   z  | jL                  |   z   }& | jK                  ||||||      }| jH                  j!                  | d| d       | jS                  ||||       | jS                  ||||| jJ                  | j$                         |dk(  r.| jS                  d| |||| jJ                  | j$                         |tT        jV                  k(  }|r1tY        |      rS| j[                  |      dv sJ d       d| d}d| d}| j\                  j!                  | dt_        |||       d       n|r	| d| d}n|r|dv rd| d}n|dk(  sJ | d}n}d | jK                  |d!d"      z   d#z   }|tT        jV                  k(  }|rtT        j`                  n|}d$t        |    d}
d%t        |    d&| j[                  |       d}| d'|
 d(|
 d)| d&| d
}| j\                  j!                  | dt_        ||||*       d       |}n>|}tY        |      r1d| }| j\                  j!                  | dt_        |||       d       tc        ||      }|| j                  j                  |<   |S )+N>   ra   r`   zat::vecz::Vectorized<r   r  Fr  _vecTr   r!  r  rg   z masked_c                     | |z  S r   rU   r  s     rW   rb  z(CppVecKernel.reduction.<locals>.<lambda>
  s
    QU rV   r6   rY  masked_)r   horizontal_reductionr   )rc  rd  )r6   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   zwelford_vec_reduce_all(masked_z_vec_reduce_all()rf   rc   r[   r  z.all_zero()r\   z.all_masked()z	{ return rV  r  z; }r  zat::vec::vec_reduce_all<r   z([](z& x, z& y) r^  )2VECTORIZABLE_RTYPESr  rF  r   rI   rQ  rV  rQ  r  rJ   r   reduction_acc_type_vecrP  r  rM  rG  r  r   reduction_init_vec	functoolsreducerE  r  r   weight_recp_vec_rangerR  rQ  weight_recps_valrN  welford_weight_reciprocal_vecr   r  r_  r,   rK  r"  ru  reduction_combine_vecr  r	  r   rl  r   rk   r+   r#  rH  r   rn   r   )r   r   r   r   rT  r  rk  r  r  vec_nsvecrb  acc_type_vecr   acc_vecreduction_sizereduction_factorrU  acc_vec_r   rx  combiner   r   masked_next_valuereduce_all_bodyr-  vec_reduce_all_functmpvarmasked_tmpvarr   s                                  rW   r  zCppVecKernel.reduction	  s)   !4444)-AA#$2F2FF"2Y
%07%70||NN5)E!>58D..>>>%%55mDDl5&9%:!<%njA22>:N  ))JJ*]O4E * 
 E, ''j#c."L!MQO	
 	''nAgYc$*A*A.R\*])^^_`	
 --''333!!++.	T5L5L^]b5c4ddef '--"DKK0D0D0F$GN '+oo9M9M&M""ST  *2.BR)SD&))1F1F1V1VV(,(=(=(F(FLLJt/I/I.J"KSX )G )%
 )) %%55.. 22<<66u= zz11 "-/ 
 ))3366ukJ )-(=(=(M(M..)% /3nn	*'HKK!!*C : :>8UZ\` abbcd ''333MM$"6"67E4//!3S5GH BA.q1AAB00%9# 1 G KK!!WIS	";<,,		
 	,,!%!;!;"55 	- 	
 --00'#%)%?%?"&"9"9 1  5::%#N3,,U3 8  J JJ   7wiqA
&DWIQ$O!%%//e30FWXYYZ[ " .//?yJ
! & 
 $%WI[!9J)U222$+9M!:J  00cJK  
  5::-+2EKK	,\)-D,EQG(@iAX@YY[\`\q\qr{\|[}}~&# 34DU3%u_L]]_`g_hhij
!!++%s,^S*Xabccde FF#N3")& 2%%//hc"3NFM"Z![[\] #>6:<B**=9rV   c                 &   | j                  |      }| j                  j                        }t        j                  j                        }|j                  r%|t        j                  k(  r|nt        j                  nt        j                  }t        j                  j                  |      }t        j                  j                  |      }t               }	| j                  | j                  k\  r.|	j!                  | dt#        |       dt$        |    d| d       n||k7  rt$        |    d| }
|t        j&                  k(  r&| d| j                  t        j&                         d}n?||cxk(  rdk(  rn nd	t$        |    d| d
}n d	t$        |    d| dt$        |    d| d| d
}|	j!                  d|
 d| d       |
}|	j)                  | j+                  ||||             | j,                  j)                  |	j/                  fd             y )Nr[  z] = static_cast<r\  r  _z.template cast<bool,r   r6   at::vec::convert<r   r+  r  r!  r  c                     t        |       S r   r`  ra  s    rW   rb  z.CppVecKernel.store_reduction.<locals>.<lambda>
  s    T18M rV   )r  rX  r  r5   r  r  is_floating_pointr   rj   rn   rS  r  r#  r?   r  rF  r  rG   rJ   rk   r  r]  rH  rd  )r   rj  r   rT  r   	out_dtyper   out_num_vectorssrc_num_vectorsr  converted_valueconverts    `          rW   r  zCppVecKernel.store_reduction
  s   $$U+iit$GG%%d+	 ** $u||3Y 	
 ((33I>((33E:??d222NN%qU+,,<\)=T<UUWX]W^^`a
 E!%1)%<$=Qug"F

*!&';D<Q<QRWR\R\<];^^abG&/>Q>/Y0G/H5'QRS  
 0Y0G/H./qe1D0EQFWWYZ_Y``ac   &7s7)1EF'KK,,UC	JK$$TXX.M%NOrV   
scalar_varc                    |j                   rJ |j                  t        j                  k(  rE| j                  j                  | j                  | j                          d|j                   d      }n]|j                  J | j                  j                  | j                  | j                  |j                         d|j                   d      }t        |t              sJ |j                  |_        |j                  |_        d|_         |S )Nr  r   r  T)rQ  r   r   rk   rO  rP  rQ  r  rj  r  r   rI   dependent_itervars)r   r  rC  s      rW   rV  zCppVecKernel.broadcast
  s    $$$$uzz)hh''!4!4!6 7wz>OqQG ##///hh''%%j&6&678*//9J!LG '>222"((%/%B%B"rV   r  c           	      "   |j                   rJ |j                  J | j                  j                  | j                  | j                  |j                         d| d| d      }t        |t              sJ |j                  |_        d|_         |S )Nz	::arange(r   r   T)rQ  r   rO  rP  rQ  r  r   rI   )r   r   r  rY  s       rW   r  zCppVecKernel.arange
  s    <<{{&&&""LL!!%++./yr&K
 &.111{{rV   c                    t         |   }| j                  |      }t        |      rd| dS |dv rWt        |   }| j	                  ||      }|dk(  rt        |      rd| dnd| d}nt        |      rd| dnd| d	}| d
| dS |dk(  r| j                          dS t        ||      }| d
| d}	|t        j                  k(  r|dv sJ | j                          d| dS |	S )Nr   r   >   ra   r`   r`   rz   ry   r|   rx   r{   r  r   rf   z	::from(0))r\   r[   rc   r  )
r>   r  r+   rJ   rm  r   r  r   r   rk   )
r   r   r   r   vec_typer   rb  r   scalar_initvec_inits
             rW   rn  zCppVecKernel.reduction_init_vec
  sA   07%%k2/hZs++11!+.F22>5IH) &e, +6(-@/xx@  &e, ,F8=A/xx@ 
 ZqQ''U"))+,I66$^U;ZqQ/EJJ!%::::))+,GK=BBrV   c                 >   t         |   }| j                  |      }t        |      rd| dS |dv rC| j                  |      }| j                  t        j
                        }dt        |    d| d| dS |t        j                  k(  r|dv sJ | j                          S |S )Nr   r   >   ra   r`   zIndexValueVec<r   )r\   r[   rf   rc   )	r>   r  r+   r#  r   rS  rJ   rk   r  )r   r   r   r   r  re  rf  s          rW   rm  z#CppVecKernel.reduction_acc_type_vec
  s    07%%k2/hZq))11))+6E))%++6E#L$=#>br%PQRREJJ!%AAAA))+,-rV   c                     |rt        | j                  |      n| j                  }t        |      }d| j                  |       d| j                   d| dS )Nzstatic WeightRecp<r  r  r  )r   rq  rG   r  rr  )r   r   rU  vec_num_range_threadvec_num_range_thread_exprs        rW   rs  z*CppVecKernel.welford_weight_reciprocal_vec  sk      D..<++ 	
 %00D$E! !3!3E!: ;2d>S>S=T()	
rV   rk  r   c                    |t         j                  k(  }|dk(  r=| j                  rd| d| dt        | j                         dS |r| d| S d| d| dS |dk(  r=| j                  rd| d| dt        | j                         dS |r| d	| S d
| d| dS |dk(  r;| j                  rd| d| dt        | j                         dS |rdnd}	| d|	 d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  r2| j                  rd| d| dt        | j                         dS | d| S |dk(  r|rN| j                  r,d| d| dt        | j                         d| j                   d	S d| d| d| j                   dS | j                  rd| d| dt        | j                         dS d| d| dS |dk(  rgt        |t              r|\  }
}}nt        ||      \  }
}}| j                  r%d| d|
 d| d| dt        | j                         dS d| d|
 d| d| d	S |dv r|J t        |   }| j                  |      }| j                  t         j                        }d}d}|%|J dt        |      j                          }d| }| j                  r.| d| d| d| | d | d| | dt        | j                         dS | d| d| d| | d | d| | dS |d!k(  r| d| S t        )"Nr[   zmax_masked_reduce(r   r   r&  r  r\   zmin_masked_reduce(r!  r  rc   zsum_masked_reduce(r   r]   r   rd   zprod_masked_reduce(r   re   zxor_sum_masked_reduce(r   rg   r   r  rh   r   z}, r   r   rR   z_combine_vec<r\  rf   )r   rk   r"  rG   rr  r   r   r   rJ   r#  rS  ro   r  r  )r   r   r   r   use_weight_recpsr   rk  r   r   r   r   r   r   r   re  rf  t_extra	arg_extras                     rW   rt  z"CppVecKernel.reduction_combine_vec  sD    uzz)U"~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__%,c#a}Aj\::v%~~,SEJ<r+dnnB]A^^_``c*..y(~~/uBzl"[QUQ_Q_E`Daabccc*..//>>-cU"ZL;t~~C^B__bcgcxcxbyyz{{-cU"ZLDDYDYCZZ[\\>>-cU"ZL;t~~C^B__`aa-cU"ZLBB00*e,#- b& $5^Z#P b&~~)#d4&2$b[Y]YgYgMhLiijkk)#d4&2$bLL33(((!),F)))4E))%++6EGI +777s#78>>@AB L	~~%&mF82eWBuggY WuBzl9+RDNN8S7TTUW
 ))vhbr%QXPYY[\_[``bcmbnoxnyyz{{u$U#j\**%%rV   c           	         t        |t              sJ |j                  J |j                  s4t        |t              r|j                  rd| d}t        	|   ||||      S |}|}|r!| j                  |j                         d| d}|r!| j                  |j                         d| d}|r|rd| d| d| d| d	}| d| d| }n#|r| d| }| d| }n|sJ | d| }| d| }| j                  |j                         d| d}|r6|j                  s!| j                  |j                         d| d}d| d| d}| j                  rS| j                  |j                         d| j                  |j                         d	| d
t        | j                         d}d| d}| j                   d| d| dS )Nr  z).all_masked()r   r  z) & (r  z) | ~(z::set(z::from(1), (z), z, "index out of bounds: z"))r   rI   r   rQ  r   r  r  r  r"  rG   r  )
r   r   r  r  r  lower_scalarupper_scalarcond
cond_printr   s
            rW   r  zCppVecKernel.indirect_assertl  s$   #~...yy$$$zz$/DKK4&/7*3udCC))#))45QugQ?E))#))45QugQ?EUugT#eC5E7!<D(>cU#l^DJWD&D(>cU3JL5U#eW%D5L>2J%%cii014&:;;--cii894&BtfF4&*D>>&&syy12&9L9LSYY9W8X YV3{4>>:;1>  4&'&&'q.FzlRTUUrV   c                 *   t        |t              sJ |j                  st        	|   |||      S t
        |   }| j                  |      }t
        |   }| j                  |      }d| d}|t        j                  k7  r2|t        j                  k(  r| j                  |       d| d| d| d}|S |t        j                  k(  r |t        j                  k7  r| d| d| d}|S ||k7  r+||cxk(  rdk(  rn nd	| d| d}|S d	| d| d| d| d| d}|S )
Nr  r   z::from<r+  r\  z.to<r   r6   r  )
r   rI   rQ  r   rN  rJ   r#  r   rk   r  )
r   r  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsrX  r   s
            rW   rN  zCppVecKernel.get_to_dtype_expr  s\   #~...zz7,S%CC#I.//	:#E*//63%qz

"u

':)))45W\N!OK\\^_b^ccdeD  %**$%**)<U$|nAo->cBD  %/6Q6*<.3%qA  +<./9J!L>YZ[jZkkmnqmrrstrV   r   )NNFr  )+r   r!  r"  rM  r  r   r   rK  r   r  r   r   rm   r#  rn   r  ro   r  r  rI   r  rs  r	   r8  r?   r   rk   r  r  r]  r  r  r  rV  r  rn  rm  rs  r   rt  r  rN  r$  r%  s   @rW   rU  rU    s   I C"45:: 4 4ekk c R%++ R% R
P5;; P3 P 38++ HEKK H# HN> N%++ N# N
4c 4 /3## zz# {{	#
 N+#T ,0<@ Lc]L zzL {{	L
 (L eC$789L L 
.	!L\ UZZ 4 !#S.()# # zz	#
 {{# #J<BcJ$PLN ~ $
N 
ELL 
^ 
 D
& (,/3+0==X& %X& 'tnX& EKK(X&t#VJ rV   rU  c                        e Zd ZdZeZ	 	 d fd	Zd Zd Zd Z	de
dej                  f fdZd fd		Zd
 Z fdZdej                  dej                  fdZ xZS )r<  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    c                     t         |   ||||d   |       || _        || _        || _        |r|n|| _        |r|n|| _        d| _        y )Nr6   T)r   r   tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r   rX  rU  r  r  r  r  r   s          rW   r   zCppTile2DKernel.__init__  s`     	1	
 -..2A}2A}#' rV   c                 L    t        | j                  | j                      d      S )Nr  )r.   r  	outer_idxr   s    rW   inner_itervarzCppTile2DKernel.inner_itervar  s"    !T]]4>>%B$C6"JKKrV   c                 b   | j                   | j                     }| j                   | j                     }t        ||| j                        }t        ||| j                        }| j
                  d u xr@ |dk(  xr9 |j                  |      xr& |j                  |       xr |j                  |       S Nr6   )r  r  r  r   r  r}  r   )r   r   	outer_var	inner_varouter_strideinner_strides         rW   need_vec_transposez"CppTile2DKernel.need_vec_transpose  s    MM$..1	MM$//2	*5)T=O=OP*5)T=O=OPOOt# 0!0		)$0 !$$Y//0 !$$Y//	
rV   c                 .   t         j                  j                  |      }| j                  }| dt	        |       }d}t	        t        || j                  | j                     | j                               }	t	        | j                         }
|r||}}|
|	}
}	d}| j                  |z  r| j                  | j                  }}n| j                  | j                  }}t        |t        j                        r|j                  r&t        |t        j                        r>|j                  s2dt         |    d| d|	 d| d|
 dt	        |       dt	        |       d}n1dt         |    dt	        |       dt	        |       d| d|	 d| d|
 d}|r| j"                  j%                         }n\|| j"                  j&                  vr)| j"                  j)                  | j*                  |d	
      }nd	}| j"                  j&                  |   }|r4d| dt         |    d| d| d| d}| j*                  j-                  |       |j/                  dt1        |            }|r'| j2                  j-                  t5        ||             |S | j*                  j-                  |       |S )NrA  __place_holder__Tzat::vec::transpose_mxn<r\  r   r  r+  Fr  zalignas() r   r[  rZ   rP  )r5   r  r  r  rG   r   r  r  r  r  r  r  r   r   rK  rR  rJ   rO  r  r  rP  rS  r  r   ro   rT  r=   )r   rj  r   r   is_storer   factorr  r  ld_srcld_dstneed_defineMNload_or_storetile_vardefine_lines                    rW   gen_transposed_tile_load_storez.CppTile2DKernel.gen_transposed_tile_load_store  s   !!$'##SU+,-  3E4==;Y[_[m[m nop/0CC#VFF##h.'')=)=qA $$$$ A q%**%akkq%**%akk *,u*=)> ?56("SEF82k!n5ERTUGWWY[  *,u*=)>aA?OqQ\]^Q_P` a56("SEF827  xx(H$((..0xx((U(SHKxx~~m4H$VHB|E/B.C1XJaPVxWXY_X``bcKMM##K0%--.@#h-POO%%l4&GH  MM##M2rV   rj  r   c                 V   | j                   j                  |      }| j                  |      }| j                         }| j	                  |      r| j                  |||d      }| dt        || j                  z         }t        j                  j                  |      }| j                  |d|      }| j                  j                  | j                  |      }	|	j                  d| ||fi        t!        |	t"              sJ d|	_        |	S | j'                  |      }
t(        | U  ||
      S )NFr  rA  r   r  T)rX  r  r  r  r  r  rG   r  r5   r  r  r8  rO  rP  r  rR  r   rI   rQ  r=  r   r  )r   rj  r   r   r   r  r7  r   rm  rY  r   r   s              rW   r  zCppTile2DKernel.load"  s   iiood#$$U+""$""5)::c55 ; H "
#k%$..2H&I%JKGGG%%d+E**7Au=DXX&&tzz48F!!&4u*=rBfn555 FMM//6I7<i00rV   c                    d|v sJ | j                   j                  |      }| j                         }| j                  |      }|J | j	                  |      r| j                  |||d      }| dt        || j                  z         }| j                  sFt        j                  j                  |      t        t        j                  t        j                  gz   v r| d| dt        | j                         d}	n| d| d}	| j                   j#                  t%        ||	             y | j'                  |      }
t(        | U  ||
||       y )Nr  Tr  rA  r[  r   r  )rX  r  r  r  r  r  rG   r  r"  r5   r  r  r   r   r  r  ru  r  r=   r=  r   r  )r   rj  r   rT  r  r   r   r  storebufrm  r   r   s              rW   r  zCppTile2DKernel.store8  sB   }}iit$""$$$U+||""5)::c54 ; H #3{54>>3I'J&KLH~~!2!24!8M

M = "  zK4O3PPRSz4KK!!,tT":;//6IGM$	5$7rV   c                    | j                         }| j                  r2|j                  d| d| dt        | j                         d| d	       y |j                  d| d| dt        | j
                         d| d	       y )NrJ  rK  r  rL  rM  )r  r  r  rG   r  r  )r   r  r   s      rW   r  z#CppTile2DKernel.codegen_inner_loopsQ  s    ""$##NNUG6%K@T@T4U3VVXY^X__bc NNUG6%K@T@T4U3VVXY^X__bcrV   c                    t         |   ||      }| j                  d   | j                  k  r| j                  nt	        | j                        \  | _        | _        | j                  | j                  d   k(  r+| j                  | _        | j                  | _
        d| _        |S | j                  | _        | j                  | _
        d| _        |S )Nr6   r   FT)r   r  r  rF  reversedr  r  r  r"  r  r  r  r  r  )r   rw  reduction_groupre  r   s       rW   r  zCppTile2DKernel.set_ranges\  s    w!%9 ""1%(<(<< $--. 	(
 ??d11!44!11DN!11DN',D$
  "11DN!11DN'+D$rV   r   c                 Z    | j                  || j                  | j                               S )NrI  )r  r  r  r  s     rW   r=  z"CppTile2DKernel.transform_indexingn  s0    ++%%' , 
 	
rV   NNr   )r   r!  r"  rI  r:  r  r   r  r  r  ro   r   rK  r  r  r  r  r=  r$  r%  s   @rW   r<  r<    sn    < #I (.L
2h1 1UZZ 1,82	$


 
uzz 
rV   r<  _bodyr   c                 j   | j                   gt        | j                  j                               z   }d}d}|D ]  }|j                  j
                  D ]  }|j                  dk(  s|j                  dv r!|j                  dvrd}t        |d      r|j                  rt        j                  |j                  v sJ |j                  t        j                     }|j                  r|j                  t        vrd}|&||j                  k7  st        j                  d       |j                  }d}  ||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexr  )r  r  ra  rh  r  Tr/  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr   	subblocksr  r  nodesoptargetr  r/  rC   r.  r   r   warningswarn)r  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr   r+  s          rW   get_loop_body_lowp_fpr  v  s$    ""#d5??+A+A+C&DDJ+/MI !	__** 	!Exx=(ELL = -  || $  !	uf%%***..%**<<</4zz:M:Q:Q/R}}](J $I".$5 &VW$+MMM 	9	!!> )##rV   c                   J     e Zd ZdZ fdZdeee   ee   f   fdZd Z	 xZ
S )TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    c                 "    t         |           y r   )r   r   r   r   s    rW   r   zTilingSelect.__init__  s    rV   r   c           	      |  " t        |      }t        |      }|sJ t        d |D              rg g fS t        j                  }t        |d         d   ""rt        "fd|dd  D              r"}t        j                         j                  |      }| j                  |||      }|rt        |d       \  }}	t        |      t        |	      z   }
t        j                  j                  rJd }d	 }d
 }t!        t#        |
            D cg c]  }t%        t&        j(                  |       }}t#        |      }|d | ||d  }}i }i }|D ]n  }|j*                  gt-        |j.                  j1                               z   }|D ]4  }|j2                  j4                  D ]  }|j6                  dv r|j6                  dk(  rdnd}|j8                  j;                  ||f      |j<                  |   j<                  d      } |||      r4 |||||      }|j6                  dk(  r|n|dvr ||j6                  |       t?        |j6                  t@              s|j6                  jC                  d      r|j6                  dv r|j6                  |vrd||j6                  <   ||j6                  xx   dz  cc<    7 q tE        |j1                               }tE        |j1                               }d}|dkD  r||z  |k\  rg g fS |	s4|r2t#        |      dk(  r$tG        ||d      g      s||d      |dz  k  rg g fS |tH        v rt        j                         j                  |      }|D ]  } | dk  r| t#        |
      z   } | dk  s| t#        |
      k\  r*tG        |
      retJ        j2                  jL                  jO                  |
|    d      }!|!|k  sitJ        j2                  jL                  jQ                  |!|       |dz  } n|
|    |k  s|dz  } n t#        |      dk(  r|g|fS t#        |      dk(  r||g|fS g g fS c c}w )Nc              3   ,   K   | ]  }|t         v  y wr   )ru   r   r   s     rW   r   z-TilingSelect.select_tiling.<locals>.<genexpr>  s     HEu//H   r   c              3   @   K   | ]  }t        |      d    k(    yw)r   N)r  )r   	loop_body_lowp_fp_dtypes     rW   r   z-TilingSelect.select_tiling.<locals>.<genexpr>  s(      "
 #9-a0NB"
s   r6   )r   c                     t        | d         S r  r  sizess    rW   rb  z,TilingSelect.select_tiling.<locals>.<lambda>  s    #eAh- rV   r.  c                 L    ||d      }t        | ||      }|j                  r|S d S Nr   )r   rR  )r   r  r  r  r  r  s         rW   _try_get_stridez3TilingSelect.select_tiling.<locals>._try_get_stride  s4     '~a'89G0OF%+%5%56?4?rV   c                 2    | |vrd|| <   y || xx   dz  cc<   y r  rU   )	node_namenon_contig_indexing_op_counters     rW   _update_negative_op_countz=TilingSelect.select_tiling.<locals>._update_negative_op_count  s(     !(FFDE6yA6yAQFArV   c                     t        |      dk(  xr: t        |       dkD  xr* |d   dk\  r|d   n|d   t        |       z   t        |       k  S Nr6   r   r  )r  r  s     rW   _is_valid_indicesz5TilingSelect.select_tiling.<locals>._is_valid_indices  sb    
 N+q0 (MA-(  .a0A5 +1-!/!2S]!Bh-(	rV   )r  r  r  r  r   )r   r6   masked_subblock)r3   r  r  r  g{Gz?r  ))rE   rD   rf   r   rn   r  r   r   r  	nelements_select_tiling_indicesr[   r   r   r  enable_tiling_heuristicsr	  r   r/   r   r  r  r   r  r  r  r  r  r  indexing_from_argsrX  r   ro   
startswithrc   r*   r   r5   r  r  guard_lt)#r   fn_listvar_sizes_listloop_bodies
all_dtypesr   r  r  rw  r  rD  r  r  r  r  r  rF  re  reduction_vars
op_counterr  r  r  r  r   arg_idxr   r  op_numnon_contig_indexing_op_num	thresholdfactor_lowptiling_indice
call_ranger  s#                                     @rW   select_tilingzTilingSelect.select_tiling  s    %W-/<
zHZHHr6M.{1~>qAc "
(_"
 
 #E#002<<5<I44^]
 %($?&"E?  ,)??Kzz22@G" #3{#34 34;;B  #&e*-o._-. % .0
 BD.( BE"'"2"2!3d5??;Q;Q;S6T!TJ%/ B	%.__%:%: BE$||/NN/4|||/K!QR(1(I(I%)>$:)""'**W"5":":1"=)? $5X~#N-<(-x.&F
 ,1<<<+G )/-36-A(A,1LL:X)*  *%,,< % 7 78I J#(<<#M$N $)<<z#A?@Ju||$<$.u||$<$A$<7BBB@ Z..01-0299;.* !	A:"<v"E"R r6M (N+q0,!."34
 nQ/0=13DD r6M% *668BBBO%3 M$q((5K8H(H$q(MS=M,M '4%&WW%5%5%?%?'6 &@ &
 &3GG,,55j+N,71,<M!$]3kA(3q(8" >"a'%66>"a'%}5~EE2vKs   
!P9c           	         g }t        ||      D ]`  \  }}t        j                  |g| }|t        j                  |j
                  |j                        D cg c]  }|j                   c}z  }b t               }	g }
t               }t               }|D ]  }|j                  D ]  }t        j                  d|j                        s$t        |||      }|dk(  r7|dk(  rO|	j                  t        |j                  dd               |
j!                  t        |j                  dd               t#        d |j                  D              r(|j                  t        |j                  dd               |j                  t        |j                  dd                
 |	|z
  |z
  }t%        |d       \  }}t'        |      t'        |      z   }t'        |	      dk(  r|dz
  gS |rt)        |      dd  S |	|z  |z
  }t)        |	      }t'        |      dk(  r|d   |v r|d   |dz
  k(  r|S t)        ||
j*                        dd  S c c}w )	Nz^d\d+$r   r6   c              3   P   K   | ]  }t        |t        j                           y wr   )r   r   SIZEr   r  s     rW   r   z6TilingSelect._select_tiling_indices.<locals>.<genexpr>k  s     S!4995S   $&c                     t        | d         S r  r  r  s    rW   rb  z5TilingSelect._select_tiling_indices.<locals>.<lambda>p      s5QR8} rV   r  r  r   )r  r   extract_read_writes	itertoolschainreadswritesr   setr  rp  rq  rj  r   rE  rm   r  r   r[   r   sortedcount)r   r  r   r  	all_indexfn	var_sizesrwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   r  contig_onlyrw  r  num_itervarscontig_and_const_stridecontig_vars_sorteds                         rW   r  z#TilingSelect._select_tiling_indicesS  sG    	 .9 	UMB	11"AyAByrxx/ST#))TTI	U e"%%"%% 	CE)) CyyCHH5,UCGQ;q[OOC$56$++C,=>Sv?R?RSS+//CHHQRL0AB+//CHHQRL0ABC	C "$;;>UU!$^9T!U5zC$88{q  1$%%+&rs++11##$ $K0"#q("2&*AA"2&,*::%%(.>.D.DEbcJJK Us   I)r   r!  r"  rI  r   r   r   rm   r  r  r$  r%  s   @rW   r  r    s7    
f 
tCy$s)#	$	fP.KrV   r  c                   f     e Zd Z fdZd ZdefdZdefdZd Z	d Z
d	 Zd
ee   fdZd Z xZS )CppKernelProxyc                     t         |   |j                  |j                  j                         || _        d | _        d | _        t        j                         | _
        y r   )r   r   rX  wsrU  r  r  rD  r   r  picked_vec_isar   r  r   s     rW   r   zCppKernelProxy.__init__  sJ    **LOO,G,GH(2=2J2J2LrV   c                 `    |D ])  }t        |t              sJ t        j                  |       + y r   )r   r&   r<   propagate_scheduler_node)r   r  r   s      rW   data_type_propagationz$CppKernelProxy.data_type_propagation  s1     	@Ee]33388?	@rV   scheduler_nodec                     t        |j                  t              syt        j                  |       t        |j                        d   d uxr t        |j                        d    S )NTr   r6   )r   r  r    r<   r0  r  )r   r2  s     rW   is_lowp_fp_schedulerz#CppKernelProxy.is_lowp_fp_scheduler  s\    ...944^D!."6"67:$F C).*>*>?BB	
rV   r  c                     dt         j                  j                  fd}|j                  gt	        |j
                  j                               z   }|D ]  } ||j                          y )N	sub_graphc                 &   dt         j                  j                  fd}dt         j                  j                  fd}t        | j                        }g |D ]u  } ||      rt        d |j                  D              r)|j                  d   }| j                  |      5  | j                  d||t         j                  f      }|j                  }|j                  |       ||_        t        xj                  dz  c_        d d d         ||      r|j                  \  }}}	}
}	|
j                  d	k(  rt        d
 |
j                  D              rt        j                   j#                  |      }| j%                  |      5  | j                  d||
|f      }|j'                  |
|       t        xj                  dz  c_        d d d        v|j                  dk(  r|j                  \  }}}}}|t(        v s|t         j                  t         j*                  t         j,                  t         j.                  fv sJ ||t(        v rt         j                  n|t         j                  ||f|_        |j                  dk(  rP|j                  d   t(        v r;|j                  \  }}}	j1                  |       ||t         j                  f|_        vx dt         j                  j2                  ffd} ||        y # 1 sw Y   xY w# 1 sw Y   xY w)Nr   c                     | j                   dvryt        | j                        dk(  sJ t        j                  j                  | j                  d         }|t        v S )N)r  Fr   r6   )r  r   rX  r5   r  r  r   )r   
load_dtypes     rW   is_lowp_fp_loadz]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_load  sN    ;;h. 499~***WW..tyy|<
!]22rV   c                     | j                   dk7  ry| j                  \  }}}}}t        j                  j	                  |      }|t
        v S )Nr  F)r  rX  r5   r  r  r   )r   r  	store_varstore_dtypes       rW   is_lowp_fp_storez^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_store  sD    ;;') (,		%9aAgg//	:"m33rV   c              3   :   K   | ]  }|j                   d k(    ywr  Nr  r   users     rW   r   zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  s     Jd4;;'1J   r   rM  rX  r6   r  c              3   :   K   | ]  }|j                   d k(    ywr@  rA  rB  s     rW   r   zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>  s      :37w.:rD  r  r  r6  c                 V    dt         j                  j                  ffd} ||        y )Nr6  c                 B   dt         j                  j                  fd}| j                  D cg c]  }|j                  dk(  s| }}|D cg c]  } ||      s||j
                  i }}|D ]  }|j                         D ]q  \  }| j                  v st        fd|D              sv s.t        d |D              sAj                  d   }j                  |       | j                         s  | j                  | j                          y y c c}w c c}w )Nto_nodec                 :    t        d | j                  D              S )Nc              3   :   K   | ]  }|j                   d k(    yw)rM  NrA  r   usrs     rW   r   zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>  s     "U3::#;"UrD  )r   users)rI  s    rW   _used_by_tozCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to   s    ""Uw}}"UUUrV   rM  c              3   \   K   | ]#  }|j                   d    j                   d    k(   % ywr  NrE  )r   rM  r   s     rW   r   zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s&     #ScCHHRLDIIbM$A#Ss   ),c              3   F   K   | ]  }|j                   d    t        v   ywrQ  )rX  r   rL  s     rW   r   zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s"      ,&JM(E,&r  r  )r   fxNoder  r  rN  rf  r   all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	r6  rO  r   all_to_nodesall_to_nodes_and_users
node_usersrN  val_nodeto_lowp_fp_legalized_nodess	     `     rW   _eliminate_duplicate_to_nodezCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node  s-   VUXX]] V *3$!%DKK:<U$L $ 8D./3{SWGXtzz*.* . '= ;
+5+;+;+= ;KD%#y6 ##SU#S S$(,F$F(+ ,&QV,& )&
 ,0+?+?+C $ : :8 D ) 4 4T :;;, !..6!( 79$.s   DDDD)r   rS  Graph)r6  r_  r^  s     rW   eliminate_to_dtypez`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype  s"    ')EHHNN ')R -Y7rV   )r   rS  rT  r   r  r   rN  rX  inserting_aftercall_methodrn   rV  r   cpp_to_dtype_countr  r5   r  r  inserting_beforereplace_input_withr   rS  rT  rS  r  r`  )r6  r:  r>  sub_graph_nodesr   r3   to_type_nodeto_type_node_argsrj  r  	value_varr   r   r   rT  rV  ra  r^  s                    @rW   add_to_dtypezDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype  s   3ehhmm 34uxx}} 4 #9??3O)+&( H"5)JekkJJ **Q-C"2259 8'0'<'<&c5%++-F (= ( -9,=,=)33LA,=)22a728 8 &e,16.Cq)Q ''61c :;D??: 7 !GG--d3E"33E: 8'0'<'<&c9e-D (= ( 00LI22a728 8 \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 \\Z/EJJrNm4S"'**KS!Q /55e<"%q%++!6EJQHT*8ehhnn *8X y)a8 8 8 8s    A"K9AL9L	L	)r   rS  r`  r  r   r  r  r  )r   r  rk  r  r  s        rW   legalize_lowp_fp_dtype_loopbodyz.CppKernelProxy.legalize_lowp_fp_dtype_loopbody  s]    G	*EHHNN G	*R  **+d93F3F3M3M3O.PP
# 	*I)	*rV   c                     t         fd|D              r|D ]  }|j                  j                  gt        |j                  j                  j                               z   }|D ]  }|j                  j                  D ]n  }|j                  dv s|j                  sJ t        j                  |j                  v sJ |j                  t        j                     }|j                  t        v rnJ    y |D ]^  }t        |t              sJ t        |j                  t               sJ |j                  }|j#                         rN j%                  |       ` y )Nc              3   d   K   | ]'  }t        |t              xr j                  |       ) y wr   )r   r&   r4  )r   r   r   s     rW   r   z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>+  s3      
 um,Q1J1J51QQ
s   -0)r  r  )r   r  r  r   r  r  r  r  r  r/  rC   r.  r   r   r   r&   r    is_memory_copyrl  )r   r  r   r  r  fx_noder+  r  s   `       rW   legalize_lowp_fp_dtypez%CppKernelProxy.legalize_lowp_fp_dtype*  sD    

 

  B#kk445KK))0029 
 ", BI#,??#8#8 B">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#ABB	B  	;Ee]333ekk8444"[[D&&(44T:	;rV   c           	         t              t              k(  sJ | j                  t        d       \  | j                         fd}fd |t              }t
        j                  xj                  |j                  z  c_        t
        j                  xj                  |j                  z  c_        t        j                  |      | _        | j                  sy | j                  sy t        j                  j                   j#                  d      5  t%               }|j'                        \  }}t        |      t        |      k(  sJ d}t)        t+                    }	t-        d |	D              rd}t        |      d	k(  r |t.        |d
   |d
         }
t0        xj2                  d	z  c_        | j                  j5                  |d
   |d
         \  }}|j7                  |
       d|_        t         j:                  j<                  rX|rV|j>                  |j@                  z
  |_!         |t.        |d
   |d
   |jB                        }|j7                  |       d|_        n|j7                  |       d|_"        |d
   dz  |_#        nt        |      dk(  r|d	   t        | j                        d	z
  k(  r|d
   |d	   k(  sJ t0        xj2                  dz  c_        | j                  j5                  |d
   |d
         \  }}|j5                  |d	   |d
   z
  |d
         \  }} |tH        |d
   |      }|j7                  |       t         j:                  j<                  r|r|j5                  |d	   |d
   z
  |d
         \  }}|||fD ]   }|j>                  |j@                  z
  |_!        " ||jB                  d f|d |jB                  f||jB                  |jB                  ffD ]*  \  }}} |tH        |d
   |||      }|j7                  |       , n6 |t.        |d
   |d
         }
|j7                  |
       |j7                  |       d d d        y # 1 sw Y   y xY w)Nc                     t        | d         S r  r  r  s    rW   rb  z2CppKernelProxy.codegen_functions.<locals>.<lambda>K  r  rV   r  c                      j                   | g| 5 }t        xj                  dz  c_         |       |cd d d        S # 1 sw Y   y xY wr  )
new_kernelr   generated_kernel_count)r   rX  r  r  runs      rW   codegen_kernelz8CppKernelProxy.codegen_functions.<locals>.codegen_kernelO  sL    (((4t4  ..!3.F  s   #AAc           	      T   | j                        \  }}d}t        	      D ]u  \  }}|ft        t        j                              dffv r|rJ  |||       ;d}|dfk(  sJ d| d d        | j                         5   ||d       d d d        w y # 1 sw Y   xY w)NFrU   Tzunexpected group: r  r   )r  r  r   r  r  r  )
r  re  r  	in_suffixr  r  r  rw  r  r   s
         rW   rw  z-CppKernelProxy.codegen_functions.<locals>.runX  s    #)#4#4UO#L D.I!$Wn!= %IO,9??5/BCRH!   )(=t^, $I$)  V ,I;d5'OCTUV 
  //1 %4% %%% %s   	
BB'	Finplace_buffersTc              3   ,   K   | ]  }|t         v  y wr   )rv   r  s     rW   r   z3CppKernelProxy.codegen_functions.<locals>.<genexpr>  s     Su5 ::Sr  r6   r   )r  r   )%r   r  r[   r  r?  r5   r  removed_buffersinplaced_to_remover   r  r  r-  r  r   	_inductorr   patchr  r  rD   rE   rf   rU  r   generated_cpp_vec_kernel_countsplit_with_tiling
set_kernelsimd_vecr  enable_loop_tail_vecr   r   r   simd_ompsimd_nelementsr<  )r   r  r   rx  scalar_kerneltiling_selecttiling_factorsr  could_masked_vecr  
vec_kernel	main_loop	tail_loopmasked_vec_kernelouter_main_loopouter_tail_loopinner_main_loopinner_tail_looptile2d_kernel"inner_main_loop_of_outer_tail_loop"inner_tail_loop_of_outer_tail_loopr  r  masked_tile2d_kernelrw  r  r  rw  s    ``                     @@@@rW   codegen_functionsz CppKernelProxy.codegen_functionsH  s   7|s>2222((!$^9T!U/		%( 'y1	=#@#@@	""m&F&FF"*00?""}} __##))%)@ i	>(NM-:-H-H.*NN ~&#n*====#3N74KLJS
SS#( >"a'+ ."3^A5F
 66!;6'+~~'G'G"1%nQ.? (H ($	9 $$Z0%)	"::227G&/nny7G7G&GIO(6$&q)&q)!	)% (():;)-I&((7)-I& ,:!+<+A	(^$)"1%T]]);a)??&q)^A->>? 66!;637>>3S3S"1%nQ.? 4T 40 $55"1%q(99.QRBS 6 ## !/#^A%6!  **=9::227G (99&q)N1,==nUVFW : :: (':& L	
 +4..9;K;K*K	L )/*?*?F> +11 ?>DD+11H CC	?O 0>+*1-*++0, ",,-AB)C, "0$nQ&79J"J $..z:#..}=Si	> i	> i	>s   !L3QQ&c                     |D ](  }| j                  |       t        j                  |       * | j                  ||       y r   )rl  r<   propagate_loopbodyr  )r   r  r   r  s       rW   codegen_loop_bodiesz"CppKernelProxy.codegen_loop_bodies  s?     	9D0062248	9 	{N;rV   r  c                    | j                  |       | j                  |       t        |      dk\  sJ d }|D cg c]  }t        j                  ||       }}t        t        j                  t              r2t        j                  j                  rd }|D cg c]
  } ||       }}|D cg c]  }|j                  d    }}| j                  ||       y c c}w c c}w c c}w )Nr6   c                     | j                          | j                          t        t        j                  t
              r | j                  | S | j                  |      S r   )decide_inplace_updatemark_runr   r5   r  r2   r  codegen)r   
index_varss     rW   r  z(CppKernelProxy.codegen_nodes.<locals>.fn  sF    &&(MMO!(($56!tzz:..||J//rV   c                 R    t         j                  j                  |       }| |_        |S r   )r5   r  localize_functionoriginal_fn)r  
wrapped_fns     rW   wrap_fnz-CppKernelProxy.codegen_nodes.<locals>.wrap_fn  s+    33EE
 *,
&!!rV   )rq  r1  r   ro  partialr   r5   r  rL   r  rw  r  )r   r  r  r   r  r  r   s          rW   codegen_nodeszCppKernelProxy.codegen_nodes  s    ##E*""5)5zQ	0 <AA49$$R.AA q--/AB&&44" .55rwr{5G549:D$**Q-::w7# B 6:s   CC-C"c                 >    | j                  | j                  ||       y r   )r  r  )r   r  r  s      rW   r  zCppKernelProxy.codegen_loops  s    kBrV   )r   r!  r"  r   r1  r&   r4  r    rl  rq  r  r  r   r  r  r$  r%  s   @rW   r*  r*    sS    M@
= 
L* L*\;<]>~<84#6 8BCrV   r*  c                   *     e Zd Z fdZdefdZ xZS )r  c                 p    t         |   |j                  |j                  j                         g | _        y r   )r   r   rX  r,  rU  r   r.  s     rW   r   zOuterLoopFusedKernel.__init__  s)    **LOO,G,GH&(
rV   r   c                 D   g }| j                   D cg c]  }|j                          }}|D ]X  }|d   j                  J t        fd|D              sJ |j	                  |d   j                  t              |             Z t        |t        |            S c c}w )Nr   c              3   <   K   | ]  }|j                   k(    y wr   )rD  )r   r  rD  s     rW   r   z=OuterLoopFusedKernel.decide_parallel_depth.<locals>.<genexpr>!  s     OVv))[8Os   )	r   r  rD  r   r  r  r   r\   r[   )r   r  r  kernels_parallel_depthr  nested_kernelsr  rD  s          @rW   r  z*OuterLoopFusedKernel.decide_parallel_depth  s    !#+/::1
#'D1
 1
 & 	G "!*00K***OwOOOO"))
00[1A7K	 &'
 	
1
s   B)r   r!  r"  r   rm   r  r$  r%  s   @rW   r  r    s    )
C 
rV   r  c                       e Zd ZdZdZdZy)ReasonFusedNodessame_vars_reducecompatible_reductioncompatible_ranges_no_reductionN)r   r!  r"  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrU   rV   rW   r  r  +  s    )1%E"rV   r  c                   p    e Zd ZdZej                  ej                  ej                  g      Z	e
dej                  fd       Z fdZdefdZd Zd Zd	 Zd
ee   fdZd Zd Zd Zd Zd Zd Zd Zdee   fdZ de!fdZ"de#e!e$ef   fdZ%de&d
efdZ'de&de(e&   fdZ)d Z*d Z+d Z,d!dZ-d  Z. xZ/S )"CppSchedulingi  devicec                     | j                   S r   )backend_features)r   r  s     rW   get_backend_featuresz"CppScheduling.get_backend_features=  s    ###rV   c                 b    t         |           || _        |r| j                          d| _        y NF)r   r   r   reset_kernel_group_ready_to_flush)r   r   r   s     rW   r   zCppScheduling.__init__A  s,    "##%$rV   statusc                     || _         y r   r  )r   r  s     rW   _set_flush_statuszCppScheduling._set_flush_statusH  s
    %rV   c                 &    t        d |D              S )Nc              3      K   | ];  }t        t        t        j                  j                  j
                  |             = y wr   )r   rd  r5   r  r  r   r  s     rW   r   z)CppScheduling.group_fn.<locals>.<genexpr>L  s,     M!U3qww//88!<=Ms   AA)r   )r   r  s     rW   group_fnzCppScheduling.group_fnK  s    MuMMMrV   c                     ddl m} |  t        t        j                  j
                  |      rt               | _        y t               | _        y )Nr6   )CppWrapperCpu)	cpp_wrapper_cpur  r   r5   r  wrapper_codeCppWrapperKernelGroupr  KernelGroup)r   r  s     rW   r  z CppScheduling.reset_kernel_groupN  s1    2agg**M: 5 7D +DrV   c                    |j                         s|j                         rt        j                  ||      S |j                         r(|j                         rJ t	        j                  ||      S | j                  ||      t        j                  k(  rt        |t        t        f      sJ t        |t        t        f      sJ |j                  \  }\  }}|j                  \  }\  }}|dk(  r|dk(  s	J ||f       fdt        |      t        |      k  r|n|}t        |t              sJ t        |      t        |      k  r|n|}	 |	      }
|j                  |
       |j                  \  }\  }}|j                  \  }\  }}||k(  s	J ||f       t	        j                  ||      S | j                  ||      r't        j                  ||| j                  ||            S t	        j                  ||      S )NrU   c                 6   t        | t              rt        | j                        dkD  sJ | j                         d }t	               }| j                  D ];  } 	|      \  }}||}||k(  sJ ||| j                  f       |j                  |       = |t        |      fS t        | t              sJ | j                  }t        |t        j                        sJ |j                         \  }}}|j                  t        |j                  j                               fS r  )r   r$   r   snodesr  updater   r&   r   r   ComputedBufferget_default_sizes_bodyr  indexing_exprsr  )
r   r  r  snodevexprscomp_bufferr  r  get_indexing_ranges_exprss
            rW   r  z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprsi  s   !$(:;"4;;/!3@T[[@3%)
),%)[[ 9E'@'GHAu)1-.
#-?PZDKK4PP?*11%89  *4+???)$>>>&*ii)+r7H7HIII%0%G%G%I
4#T5H5H5O5O5Q0RRRrV   )extra_indexing_constraints)
is_foreachr#   r   is_templater$   _why_fuse_nodesr  r  r   r&   rw  r   recompute_size_and_bodycan_fuse_vertical_outer_loopr   _get_outer_loop_fusion_depth)r   r   r   r  vars1reduce1vars2reduce2node_to_recompref_noder  r  s              @rW   r   zCppScheduling.fuseW  s   !1!1!3-225%@@ ((***%**5%88 $$UE2#BBC "%-9K)LMMM!%-9K)LMMM&+kk##E7&+kk##E7"}BJ'8JJ6S& +.e*s5z*Au!.-@@@$'JU$;5-Fx-P*66/I 7  !&:E1 %:E1~5u~5~)..ue<<225%@2775$"C"CE5"Q  *..ue<<rV   r   c                     |j                   \  }\  }}|j                   \  }\  }}||k(  r||k(  rt        j                  S |dk(  r|||z   k(  rt        j                  S | j	                  ||      rt        j
                  S y )NrU   )rw  r  r  r  &_can_fuse_nodes_with_compatible_rangesr  )r   r   r   r  r  r  r  r  s           rW   r  zCppScheduling._why_fuse_nodes  s    #kkE7#kkE7E>g0#444b=Uego5#88866ueD#BBBrV   c                    |j                   \  }\  }}|j                   \  }\  }}|dk(  xr |dk(  }t        j                  |      t        j                  |      k(  }	t        |      dk(  xs t        |      dk(  }
|r|	r|
syt        |      t        |      k  r|n|}t        |      t        |      k  r|n|}t	        |t
              ryt	        |t              sJ t	        |j                  t        j                        ryt	        |j                  t        j                        sJ |j                  j                  j                         }d }t	        |t
              rt               }|j                  D ]  }t	        |j                  t        j                        r ndt	        |j                  t        j                        sJ |j                  t!        |j                  j                  j                                       t        |      dk7  ryt#        t%        t'        |                  }n\t	        |t              sJ t	        |j                  t        j                        sJ |j                  j                  j                         }||k7  ryy)NrU   r6   FT)rw  r(  rd   r   r   r$   r&   r   r   TemplateBufferr  dataget_sizer  r  rE  r   r   nextiter)r   r   r   r  r  r  r  r  c1c2c3r  r  ranges2ranges1
ranges_setr  s                    rW   r  z4CppScheduling._can_fuse_nodes_with_compatible_ranges  s    $kkE7#kkE7],w"}YYu5!11Z1_/E
arb"%e*s5z"9uJU35 n&89 .-888n))2+<+<=.--r/@/@AAA !%%**335h 23J! Bejj"*;*;<!%**b.?.?@@@uUZZ__%=%=%?@A	B :!#4Z 012Gh666hmmR->->???mm((113GgrV   c                     t        |t        t        f      sJ t        |t        t        f      sJ t        d ||fD              ry| j	                  ||      d uS )Nc              3   <   K   | ]  }t        |t                y wr   )r   r   r   s     rW   r   z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>  s      
>BJt89
r  F)r   r$   r&   rf   r  r   r   r   s      rW   _can_fuse_horizontal_implz'CppScheduling._can_fuse_horizontal_impl  sd    %"4m!DEEE%"4m!DEEE 
GLen
 
 ##E51==rV   c                    |j                         s|j                         ryt        |j                               t        |j                               z   t        j                  j
                  kD  ry| j                  ||      S r  )r  r   r   r   r  max_horizontal_fusion_sizer  r  s      rW   can_fuse_horizontalz!CppScheduling.can_fuse_horizontal  sf    %"3"3"5!"S):%;;jj334 --eU;;rV   c                    d}t        d ||fD              s|S t        |t              r|j                         d   n|}t        |t        t
        f      sJ t        |t              r|j                         d   n|}t        |t        t
        f      sJ |j                  \  }\  }}|j                  \  }\  }	}
|dk(  r|	dk(  r|dk7  r|
dk7  r|S t        d ||fD              r'|j                  |j                  k(  r|j                  S |S t        t        |      t        |	            }|dk\  rI|d | |	d | k(  r>t        d ||fD              r(t        |      t        u r|n|}|j                  |k(  r|S |S |S |S )Nr   c              3   T   K   | ]   }t        |      t        t        t        fv  " y wr   )r   r   r$   r&   r   s     rW   r   z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s/      
  J+-?OP
r   r  rU   c              3   >   K   | ]  }t        |      t        u   y wr   r   r   s     rW   r   z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  r   r   r6   c              3   >   K   | ]  }t        |      t        u   y wr   r   r   s     rW   r   z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>  s      >BT
99r   )r   r   r   r   r$   r&   rw  r   r\   r   rf   r   )r   r   r   DISABLE_OUTER_LOOP_FUSION_node1_node2r  r  r  r  r  r   _compare_nodes                rW   r  z*CppScheduling._get_outer_loop_fusion_depth  s   $%! 
 
 

 -, %!<= !!#B' 	
 &#5}"EFFF %!<= !!#A& 	
 &#5}"EFFF$llE7$llE7B;5B;7b=W],,TeU^TT 00E4Q4QQ -- /
 #&c%j#e*"=#q(../59Q:Q3RR GLen  "%[,GGEU  !88<SS2244 /.((rV   c                    |j                          xro |j                          xr\ |j                         |j                  z  xr= | j                  ||      xr |j	                           xr | j                  ||      dk\  S r  )r  get_operation_names	ancestorsr  rM  r  r  s      rW   r  z*CppScheduling.can_fuse_vertical_outer_loop!  s    !!## E%%''E))+eoo=E ..ue< -**,,E 11%?1D		
rV   c                 *    | j                  ||      ryyr  )r  r  s      rW   get_fusion_pair_priorityz&CppScheduling.get_fusion_pair_priority-  s    ,,UE:rV   c                     |j                         ry|j                         r|j                          S | j                  ||      xr |j                          xs | j                  ||      S r  )r  rM  r  r  r  s      rW   can_fuse_verticalzCppScheduling.can_fuse_vertical4  sj    ))+++**5%8UASASAU=U=..ue<	=rV   r  c                    t        d |D              r|S dddd}d}d}|D ]  }t        |j                  t        j                        sJ |j                  j                         \  }}}|j                  j                         D ](  \  }}	||	j                  t              z  }|dkD  r|c c S |	j                  t              dk(  sC|	j                  t              j                         }
|
j                  d   |
j                  d   |t        t        j                  j                  j                         st        t        j                  j"                  j$                        s|j&                  v st)        fd|j                  j                         D              s%d}|}+  |s|S dfd}|D ]  }||k(  s	|j+                  |	        |D ]  }||k7  s	|j+                  |
        |S )a  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. Only one node's one indexing_exprs contains a division, according to this indexing_exprs,
               we can get the dimension that needs to be split, and the split dimension is contiguous
               in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c              3      K   | ]X  }t        |j                  d    d          dk7  xs4 t        d |j                  j                  j                         D               Z yw)r6   r   c              3   F   K   | ]  }|j                  t                y wr   )r   r   )r   rX  s     rW   r   z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>R  s      .2)r  N)r   rw  rf   r  r  r  r   s     rW   r   z/CppScheduling.try_loop_split.<locals>.<genexpr>P  sc      

 	 

1a !Q&  6:jj6O6O6V6V6X 
s   AA Nr   Fr6   c              3   L   K   | ]  \  }}|k7  rt        |      d k(    ywr   )r   )r   rj  rX  divide_index_name	split_vars      rW   r   z/CppScheduling.try_loop_split.<locals>.<genexpr>q  s2        *d#'88 0i@AE s   !$Tc                    | \  }}|\  }}|j                        }|j                         }||   z  ||<   |j                  |dz          t        j                  ||d      \  \  }	}
}|	j                         }|j                  |dz         }||   z  |z   ||<   t        j                  |||g||	|      }s/|j                  t        |j                  j                               f||f||	|ffS )Nr6   r  r  )r   r   insertr   index_vars_no_squeezepopr   r    r  r   r  r  )r  r  re  
index_sizereduce_sizer  reduce_vars	split_idxnew_index_sizenew_index_varsr  r  	iter_varsdivisor_varr  split_numberr  s                 rW   
loop_splitz0CppScheduling.try_loop_split.<locals>.loop_split  s   &+#J&*#J"((3I'__.N(29(=(MN9%!!)a->.:.P.PC/+^Q '++-I#--	A6K#/)I2F#F#TIi ;;y+.
NKD .OO,,3356.*
  -- rV   )recompute_sizes_body_func)r  r!  )rf   r   r   r   r  r  r  rf  r  r   findr  rX  r   corenumbersr   symbolr   r  r   r  )r   r  num_div	match_divmatched_noder   r  original_bodyrj  rX  div_exprr   r  r  r  r  s               @@@@rW   try_loop_splitzCppScheduling.try_loop_split>  s   $  

 
 
 L	 	 	,Ddii):):;;;"&))"B"B"DA}a+::@@B ,
d4::h//Q; L::h'1,#yy2668H (a 0I#+==#3L(,%"<1C1C1K1KL&y%**2C2C2J2JK%)@)@@-9  .;.J.J.P.P.R   %)	'++,	,6 L%)"	:  	SD|#,,z,R	S  	D|#,,/I.8 - 	 rV   r   c                 H  	 | j                   t        j                  }g g 	t        |t              sJ dt        f	fd} ||      s|t        _        j                          	j                          t        j                  j                  j                  d      5  |j                         D ]X  }t        |t        t        f      sJ |j                         }t              }|j                  |       j!                  ||       Z 	 ddd       yy# 1 sw Y   yxY w)a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r   c           
          t         t              sJ j                          j                          dt        fdg }i t	         fd j                         D              rt                j                         D ]  t        t              sJ j                  j                                j                         st        j                               dk7  rbj                         d   t	         fdj                  D              sj                  }t        |t         j"                        sJ |j%                         } j&                  t                     z
  }fd}|j)                         r |       s	t!        j*                  |j,                  |j.                  |j0                  |d |j2                  |d       }fd	}d
} |||      }	|	sDt!        j4                  | dt        |       |      }	|j7                  |	       g |	j8                  <   |	j8                     j7                  |        t;        j<                        5 }
t        |      dkD  r4|D ]/  }|j8                  J |
j?                  ||j8                            1  j                         D ]t  }t        |t@        t        f      sJ tC              }|jE                  |j                                j7                  |       j7                  |j                                v  jG                   j&                        s
	 ddd       ytH        jJ                  j7                  tI        jL                  t              t        |
jN                                      jQ                        }jS                  |D cg c]  }|D ]  }|  c}}       ddd       yc c}}w # 1 sw Y   yxY w)zN
            Codegen code with fused outer loop and local Buffer.
            r   c                     t        | t        t        f      sJ | j                         }t	        |d       j
                  \  }\  }}t        |      t        |      z   }|S )Nc                 4    t        | j                               S r   )rm   rM  r`  s    rW   rb  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>  s    Q^^-=)> rV   r  )r   r&   r$   r   r[   rw  r   )r   r  r  rw  r  rD  s         rW   get_call_rangeszlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges  s`    !$8J(KLLL-1^^-=.1>/% ,+E? $ElU?-CC""rV   c              3   `   K   | ]%  }t         |            j                  d z   k(   ' ywr   )r   r   )r   r   r0  r   s     rW   r   zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s3       OE*+t/K/Ka/OOs   +.r6   r   c              3   V   K   | ]   }|j                   j                         v  " y wr   )r   r   )r   rC  r   s     rW   r   zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>  s&      :>		T^^%55   &)c                  B   dd} t        j                  j                  j                               D ]  \  }}| |z  z  | |z  }  j                  j	                  j                               }fd |      xr t        fdj                  D              S )Nr   r6   c                     | k(  S r   rU   )rV  contiguous_index_exprs    rW   is_contiguous_indexzCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_index  s    '(,A'A ArV   c              3      K   | ]Y  }t        |j                  t              xr9  |j                  j                  j	                  j                                      [ y wr   )r   r   r&   r  get_read_exprr  )r   rC  r7  scheduler_buffers     rW   r   zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>  s\      Q %) !+499m D !"$7$(IIOO$A$A(8(A(A(C%&%"!"Qs   AA")r  r  r  rf  get_write_exprr  r   rN  )r  r   r	  write_index_exprr6  r7  r:  r2  s       @@rW   is_all_write_read_contiguouszyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous  s    451%&F.6 . 4 4 ? ? E E G/ 0
U !6# E 5 &%	0
 0>/C/C/R/R 0 9 9 ;0,B $77G#H $S Q -=,B,BQ N rV   Nc                 ~    |D ]7  }| |j                   k(  st        fd|j                     D              s5|c S  y )Nc              3      K   | ]]  }|j                   Ot        fdt        j                  j                  j
                  |j                      j                  D               _ y w)Nc              3   V   K   | ]   }|j                   j                         v  " y wr   )r   r  )r   rC  visited_scheduler_nodess     rW   r   zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>  s,      (&,0 )-		(:(:(<@W(W(&r3  )rj  r   r5   r  r   name_to_bufrN  )r   global_bufferrA  s     rW   r   zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>  s`      S" )6 (5'9'9'E %( (&45GG4E4E4Q4Q,9,>,>5**/%	(& %&S"s   A#A&)layoutr   rj  )local_buffer_layoutr  	local_buflocal_to_global_buffersrA  s      rW   try_share_local_bufferzsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer  sS    -: 5	#6):J:J#Js S" :Q(1:&S" P" ,5$45 $(rV   local_buffer_datar  F)local_buffer_numberT)*r   r   clearr!   r   r   r  r   r&   rE  r  rM  r   get_outputsrN  r   r   r  r  r   is_contiguousFixedLayoutr  r   r   r  Bufferr  rj  rL   rX  add_local_bufferr$   r*  r  r  r   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountr  r  finalize_kernel)r   r  rC  global_buffer_layoutsize_offsetr=  rE  rH  local_buf_prefixlocal_buffer_usedscoper  r   cpp_kernel_proxyouter_fusion_cpp_kernel_proxy_nodesr0  rG  r:  r2  rA  r
  r  
nodes_lists   `               @@@@@rW   $try_outer_loop_fusion_with_local_bufzSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf  s    d$?@@@!'')#&7 # .0MBD# !113  58E'&*nn&6 [N%nmDDD+//0G0G0IJ&335~99;<A '5'A'A'CA'F$ BRBXBX  )9(=(=)-9J9JKKK/</G/G/I,&*&B&BS+N;F '4 1>>@ < >$.0nn077066055klC077E	/+(" ,?(,B/-)  102		#3"4Ac-6H5I J 31- *001BCNP34E4J4JK/0A0F0FGNN)s[z $L$5$56 %}%)(5 +00<<<..(*A,BSBS*T
 "113 9E%e.@--PQQQ'5l'C$$225??3DE)001AB%%eoo&789 >>)4+G+G !# $ 99@@2212,/0C0C,D 150O0O)1- ,,1)3HvHUHUH7@  I;@ s&   "C'O+A2O+O%O+%O++O4Fr{  N)r  r   r  r   r   rK  r   r  r   r  r   r$   r&   r   r*  r  rS  )
r   r   r  r]  r   r[  rY  r
  r  r\  s
          @@@rW   codegen_outer_loop_nodez%CppScheduling.codegen_outer_loop_node  s    (()0)O)O&6802
$ ;<<<[	7R [	z 4D95SG2!'') ''--e-D K!113 KE%e.@--PQQQ27//2CF'5l'C$$226: 001A6JKK K :K Ks   !A,DD!c                 h   | j                   }t        |t              r| j                  |       nO|j	                         }| j                  |      }t        |      }|j                  |       |j                  ||       | j                         }|t        j                  kD  r| j                  d       yy)zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)r  r   r   r^  r   r+  r*  r  rS  _get_scheduled_num_argsr  MAX_FUSED_KERNEL_ARGS_NUMr  )r   r   r  r  rY  args_nums         rW   codegen_nodezCppScheduling.codegen_nodef  s     ((d78((.)-)9E''.E-l;**51(()95A//1m===""4( >rV   c                 n    t        |t              xr$ t        |j                  t        j                        S r   )r   r&   r   r   CppTemplateBuffer)r   r   s     rW   is_cpp_templatezCppScheduling.is_cpp_template|  s,    $. 
:IIr++4
 	
rV   template_nodeepilogue_nodesc                    t         d   dxx   t        |      z  cc<   | j                  |      sJ d       t        t        |      }|j
                  \  }\  }}|dk(  sJ t        t        j                  |j                        }|D cg c]  }|j                   }}t        d |D              sJ d       d } |||j                  |      }	|j                  ||	|      \  }
}|
5  |g|D ]  }|j                            |       }d	d	d	       t        j                  |
      5  |g|}| j                  ||
j                         }d	d	d	       |
j#                  |       t        j$                  xj&                  |
j&                  z  c_        | j(                  j+                          y	c c}w # 1 sw Y   xY w# 1 sw Y   txY w)
zG
        Codegen a CPP template, possibly with fused epilogues
        inductorcpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrU   c              3   P   K   | ]  }t        |t        j                           y wr   )r   r   r  )r   r  s     rW   r   z1CppScheduling.codegen_template.<locals>.<genexpr>  s"      
12Jq"++,
r  z9Epilogue nodes must all be instances of ir.ComputedBufferc                     | j                         |v sJ || j                            j                  }t        fd|D               S )Nc              3      K   | ]8  }t        |j                  t              xr |j                  j                  v  : y wr   )r   r   r!   )r   rC  rh  s     rW   r   zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>  sA        499&78 5IINNn45s   >A)r  rN  r   )template_bufferoutputs_by_namerh  rN  s     ` rW   template_buffer_has_other_userszGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users  sV     #++-@@@#O$<$<$>?EEE  "   rV   )$flag_template_buffer_has_other_usersrh  N)r   r   rf  r   r&   rw  r   re  r   r   rp  make_kernel_renderr  r5   set_kernel_handlerdefine_kernelrX  call_kernelr  r~  r   free_buffers)r   rg  rh  r  rnumelctbr  epilogue_ir_nodesrq  rr  r  renderr   src_codenode_schedulekernel_names                   rW   codegen_templatezCppScheduling.codegen_template  s    	:;s>?RR;##
 	zy	z 
 ]M:&,,;Av||$()=)=}?Q?Q$R*;
AFF;
 ;
  
6G
 
 	GF	G 
		 0O..0A0
, //1U, 0 

  	 &88   xH	 
 !!&) 	S*<^<M,,X}fkkRK	S 	;,	6#9#99##%I;
4	  	 
	S 	Ss   F)%"F.$#F:.F7:Gc                 6    | j                   j                         S r   )r  get_num_argsr   s    rW   r`  z%CppScheduling._get_scheduled_num_args  s      --//rV   c                     | j                   S r   r  r   s    rW   ready_to_flushzCppScheduling.ready_to_flush  s    ###rV   c                      y r   rU   r   s    rW   codegen_synczCppScheduling.codegen_sync  s    rV   c                    t         j                  j                  }t        j                  j
                  r$t        |t        j                  j
                        nd}dj                  d||j                         g      }t         j                  j                  r|nd}|j                  t        t        j                        |      }|j                  t        t        j                        |      }|j                  dd      }t               }|| j                   j"                  n|}	|	j%                         \  }
}
}t         j                  j                  s|j'                  d|d       |j)                  |d	
       t         j                  j                  s|j'                  d       |j+                  ||j-                         d       |S )NrR   r  r  r  z#pragma CMTz//zasync_compile.cpp_pybinding(z, '''T)stripz''')F)cuda)r5   r  r  r   r  descriptive_namesr)   joinnext_kernel_suffixcpp_wrapperr   ro   r-   KERNEL_NAMEDESCRIPTIVE_NAMEr?   r  rX  cpp_argdefsr  r  ru  getvalue)r   r|  r  kernel_argsrb  
fused_namer~  kernel_decl_namecompile_wrapperrX  r  	arg_typess               rW   ru  zCppScheduling.define_kernel  sm   ''&& zz++ "%)E)EF 	
 hhz73M3M3OPQ*+''*=*=;8##C(?(?$@BRS##C(D(D$E{S ##M48(*)4)<t  %%+**,1iww""%%(DYMQV&WXxt4ww""%%f-k?+C+C+EERrV   c                 2   | j                   j                         }|rZ| j                  || j                   j                        }| j                   j	                  t
        j                  j                  |       | j                          | j                  d       y r  )
r  codegen_groupru  scheduled_nodesrv  r5   r  r  r  r  )r   r|  r~  s      rW   flushzCppScheduling.flush  sv    $$224,,$++;;K ))!''*>*>L!u%rV   r   )0r   r!  r"  ra  r  fromkeysr7   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTr  r#  r   r  r  r   rk   r  r  r  r   r	   r  r  r  r  r  r  r  r
  r  r   r&   r+  r   r^  r   r$   rc  r!   rf  r
   r  r`  r  r  ru  r  r$  r%  s   @rW   r  r  1  s5    !$}}**33	
 $%,, $ $%& &N.9=vx8H/I 6p>	<4)l

=iD$7 iV{K){Kz)/1C]RS),
$5 
$ 

4&(4& !!234&l0$4&rV   r  c                   D     e Zd Z fdZd Zd Zd ZddefdZd Z	 xZ
S )	r  c                    t         |           t               | _        t	               | _        t        | j
                        | _        t        j                         | _
        | j                  j                  | j                         g | _        y r   )r   r   rA   rX  r8   
loops_codeWorkSharingr,  r  r  r  r  r  r  s    rW   r   zKernelGroup.__init__  s^    L	&.doo.))+


  )!rV   c                 :     || j                   t               g| S r   )rX  r,   )r   r   rX  s      rW   ru  zKernelGroup.new_kernel  s    49924<t<<rV   c                     | xj                   |z  c_         | j                  }| j                  }|j                  ||       y r   )r  r  r,  r  )r   ru  r  r  r,  s        rW   rS  zKernelGroup.finalize_kernel  s5    %WW  r*rV   c                 X    | j                   j                         \  }}}t        |      }|S r   )rX  r  r   )r   arg_defs	call_argsr  rb  s        rW   r  zKernelGroup.get_num_args  s)    )-)>)>)@&)Yx=rV   r   c           	      .   | j                   j                          | j                  syt               }t        j
                  j                  xr t        j                  dv }|r|j                  dg       |j                  t        j                                |t        t        j                        n|}|t        t        j                         n|}| j"                  j%                         \  }}}dj'                  d      j)                  |      }t+               }|j                  d| d| d| d	       |j-                         5  |rHt.        j0                  j2                  }	|	d
t        |	      z   dz   nd}
|j                  d|
|z    dg       | j"                  j5                         D ]  \  }}|j                  d| d| d        |j7                  | j8                         d d d        |j;                         S # 1 sw Y   |j;                         S xY w)NrR   )linuxrP   z!#include <ATen/record_function.h>z,
   zextern "C" z void r  r   graph_r  zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r  r!  r  )r  r  r  r8   r   r  enable_kernel_profilesysplatformra  r  r   
cpp_prefixro   r-   r  r  rX  r  ljustr  rX   r  r5   r  graph_idaliasesr  r  r  )r   rj  r  r  r  r~  r  r  func_export_declr  r  oldnews                rW   r  zKernelGroup.codegen_group  s   

##~ !'

 @ @ !
S\\ V
 F
 !OO@ABy++-. <@<3{667T;?<c+667T..0!Q;;r?''113*+62B1C1XJaP	

 [[] 	)$77++;C;OCM1C7UW+F[,@+AAfg
 !II--/ 7Sse3se1567KK(	) }}	) }}s   B G<<Hc                 j    | j                   j                         \  }}}|j                  ||d|       y )NF)r  r  )rX  r  generate_kernel_call)r   rb  r~  r  r  r  s         rW   rv  zKernelGroup.call_kernel%  s7    "&))"7"7"99i$$) 	% 	
rV   r   )r   r!  r"  r   ru  rS  r  ro   r  rv  r$  r%  s   @rW   r  r    s)    "=+
&# &P
rV   r  c                        e Zd Z fdZ xZS )r  c                 @    t         |           t               | _        y r   )r   r   r9   rX  r  s    rW   r   zCppWrapperKernelGroup.__init__-  s    (*	rV   )r   r!  r"  r   r$  r%  s   @rW   r  r  ,  s    + +rV   r  c                   0    e Zd Zd Zd Zd Zd Zd Zd Zy)r  c                 `    || _         d| _        d | _        t        j                         | _        y r  )r  in_parallelrU  r  r  r  )r   r  s     rW   r   zWorkSharing.__init__3  s)    	 ))+
rV   c                    | j                   r|| j                  k7  r| j                          | j                   s|| _        d| _         t        j                  j
                  r| j                  j                  d       n| j                  j                  d| d       | j                  j                  | j                  j                                | j                  j                  d       y y )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)r  rU  r  r   r  r_  r  r  r  r  r  )r   r  s     rW   r  zWorkSharing.parallel9  s    4+;+; ;JJL&D#Dzz))		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78II1  rV   c                 h    | j                   r| j                  j                  d       | j                   S )Nz#pragma omp single)r  r  r  r   s    rW   r  zWorkSharing.singleI  s*    II 45rV   c                 F    | j                   j                          d| _        y r  )r  r  r  r   s    rW   r  zWorkSharing.closeN  s    

 rV   c                 :    | j                   j                          | S r   )r  r1  r   s    rW   r1  zWorkSharing.__enter__R  s    

rV   c                 >    | j                   j                  |||       y r   )r  r7  r3  s       rW   r7  zWorkSharing.__exit__V  s    

Hgv6rV   N)	r   r!  r"  r   r  r  r  r1  r7  rU   rV   rW   r  r  2  s     ,  
!7rV   r  c                      e Zd ZU dZeej                     ed<   dZeej                     ed<    ej                  d      Z
ej                  ed<    ej                  d      Zej                  ed<   dZeed<   d	Zeed
<   d	Zeed<   d	Zeed<   d	Zeed<   dZed    ed<    ej*                  e      Zed    ed<   dZee   ed<   d Zdee   fdZd ZdefdZded    fdZd Z d Z!d Z"y)r  Nr   r   r   r   r6   r   r  Fr  r  	collapsedrM  parent)default_factoryr   r  c                 j    t        j                         }|r|j                         | _        y d| _        y r  )r   r  r  r  )r   r-  s     rW   __post_init__zLoopLevel.__post_init__l  s-     .9-E-E-GAO>#;#;#=UVrV   r   c                     | j                   r| j                   gS g }| j                  D ]  }||j                         z  } |S z,Get all kernel objects under this loop level)r  r   r  r   r  r  s      rW   r  zLoopLevel.get_kernelsx  sE    ;;KK= JJ 	*Dt''))G	*rV   c                 T    | }|j                   r|j                   }|j                   r|S r  r  )r   r  s     rW   r  zLoopLevel.get_root  s%    kk;;D kkrV   c                     | j                   s|| _        | }|J yt        | j                         dk(  sJ | j                   d   j                  |       y)zj
        Set the kernel under this loop level. No split is allowed under
        this loop level.
        Nr6   r   )r   r  r   r  )r   r  r  s      rW   r  zLoopLevel.set_kernel  sR    
 zz DK(,D###4::!###

1  (rV   c                 j    |dk(  r| gS g }| j                   D ]  }||j                  |dz
        z  } |S r   )r   get_loops_atr   r  r  r  s       rW   r  zLoopLevel.get_loops_at  sF    A:6ME

 6**519556LrV   c                      fd fd}|dk(  r3 |       \  }} j                   }|r||g|_        ||_         ||_         ||fS t         j                        dk(  sJ  j                  d   j                  |dz
        S )Nc                      g } j                   r0j                   D ]!  }| j                  |j                                # | S r   )r   r  r  )r   r  r   s     rW   clone_innerz0LoopLevel.split_with_tiling.<locals>.clone_inner  s9    Ezz JJ /DLL./LrV   c                  T   t        j                        } t        j                  |       | z  }t	        j
                  |      }| |_        j                  |_        d|_        j                  |_	                |_
        |j                  r|j                  D ]	  }||_         t	        j
                  j                        }||_        j                  |_        d|_        j                  |_	                |_
        |j                  r|j                  D ]	  }||_         ||fS r  )r   r   r   r   r  r   r   r  r  rM  r   r  r   )sympy_factorr   r  r  r  r  r  r   s        rW   do_split_with_tilingz9LoopLevel.split_with_tiling.<locals>.do_split_with_tiling  s     ==0Ldii6EF!$((F3I*IO!%I"'I%)%6%6I")mIO%OO ,D"+DK, "$((DII6I%I!%I"'I%)%6%6I")mIO%OO ,D"+DK, i''rV   r   r6   )r  r   r   r  )r   r  r  r  r  r  r  r  s   ` `    @rW   r  zLoopLevel.split_with_tiling  s    		(4 A:#7#9 Iy[[F )95#)	 #)	 i''tzz?a'''::a=22519fEErV   c                     t        |       }g |_        | j                  rC| j                  D ]4  }|j                         }||_        |j                  j	                  |       6 t        | j                        |_        |S r   )r   r   r  r  r  r   r  )r   r  
inner_loopinner_loop_clones       rW   r  zLoopLevel.clone  sn    Dz
::"jj 4
#-#3#3#5 *. '

!!"234 t{{+rV   c                    t        | j                        }t        | j                        }t        j                  j
                  r||k(  ry | j                  r| j                  dkD  rd| j                   dnd}| j                  rFd}| j                  dkD  r|d| j                   dz  }| j                  r\|j                  dd|       }nF| j                  rd}n7| j                  rd	| }n%| j                  st        j                         rd
}nd}t         d| j                   d| }| j                   d| }| j                   j"                  r%| j                   dt        | j                          }n;| j                   dt        | j                          dt        | j                          d}d| d| d| d}| j$                  s|s|gS ||gS )Nr6   zsimd simdlen(r  rR   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   rO  <rN  z+=(z == 0 ? 1 : zfor(rL  )rG   r   r   r   r  no_redundant_loopsr  r  r  r   r  rM  r   rQ  rK   r   r   rR  r  )	r   offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rW   r  zLoopLevel.lines  s   !$++.		*	::(([I-E }}!4!4q!8 D//03 	
 ==%E}}q :dmm_A66}}gtf~>]]E]]"4&)E""{'9'9';'EE"|1TXXJa}=
hhZq,::88*B{4::'>&?@I
 88*CDJJ 78 9"4::./q2  zl"XJb1=>>7Nu~rV   )#r   r!  r"  r   r	   r   rK  __annotations__r   r   r   r   r  rm   r  rk   r  r  rM  r  dataclassesfieldr   r   r   r  r?  r  r  r  r  r  r  r  r  rU   rV   rW   r  r  Z  s    $C%**	$!%D(5::
%&q)FEJJ)%a(E5::(HcHdHdItL$$(FH[!(  1{00FE4F"&FHY&
WT)_ ) )T+%6 ,F\	'rV   r  c                       e Zd ZU dZdZeee      ed<   dZ	ee
   ed<   ede
fd       Zd Zdee   fdZed	        Zd
 Zd Zd Zdee
   fdZy)r   a  
    A loop-nest like structure but with some loop level split along
    the loop range into the main tiling loop and the tail. It is built
    with the `build` method as a loop nest and then split with
    `split_with_tiling` at some depth.

    A typical case is for vectorization where we typically split at the inner-most
    loop level. A more complicated case is 2D tiling where we split at
    both inner-most and outer levels.
    Nr  r  c                 ^   | j                   }| j                  }| j                  }|J g }|}d}t        t	        ||            D ]I  \  }\  }}	t        ||	|      }||k\  r| j                  |_        |j                  |       |j                  }K t        |      }
|r	| |_
        |
S | |
_
        |
S )z4Build a LoopNest with the given `kernel` as the leafNr  )r  rE  rF  r,  r  r  rM  r  r   r   r  )r  r  rE  rF  r  levelsr  loop_idxr   r   r  s              rW   r  zLoopNestWithSplit.build  s     ?? 00*** ""&$(%.s8V/D%E 	 !HksDS$t4D?*$*$7$7!MM$ZZF	  &d+	 DK   &IrV   c                 ,    t        | j                        S r   )rk   r  r   s    rW   __bool__zLoopNestWithSplit.__bool__)  s    DIIrV   r   c                 p    g }| j                   J | j                   D ]  }||j                  |      z  } |S )zJGet all the loop levels at the given `depth` (most outer loop has depth 0))r  r  r  s       rW   r  zLoopNestWithSplit.get_loops_at,  sD    !#yy$$$II 	.DT&&u--E	.rV   c                 ,   d}| j                   J | j                   }t        |      dkD  ry|r|d   j                  nd}t        |      dk(  rG|d   j                  |k(  r5|dz  }|d   j                  }t        |      dk(  r|d   j                  |k(  r5|S )z
        Maximal allowed depth for parallelism:
        1) Levels without splitting and
        2) All reduction or non-reduction levels
        When the loop is split at the top level, the max depth is 1.
        r   r6   F)r  r   rM  r   )r   	max_depthr  rM  s       rW   r  z$LoopNestWithSplit.max_parallel_depth4  s     	yy$$$		u:>05uQx,,5%jAo%("7"7<"GNI!HNNE %jAo%("7"7<"G rV   c                     | j                   duxr3 t        | j                         dkD  xr | j                   d   j                  S )zr
        Whether all the loops are for reduction. Reduction loops
        are always the inner most ones.
        Nr   )r  r   rM  r   s    rW   r  z#LoopNestWithSplit.is_reduction_onlyG  s:     IIT!Vc$))nq&8VTYYq\=V=V	
rV   c                     || j                         k  sJ d       | j                  J | j                  }|D ]	  }||_         t        d|      D ]  }|d   j                  }d|d   _         y )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr6   r   T)r  r  r  r	  r   r  )r   r  r  r  rx  s        rW   r  zLoopNestWithSplit.mark_parallelP  s    0022	ML	M2yy$$$		 	&D%DM	&q)$ 	&A!HNNE!%E!H	&rV   c                     | j                  |      }t        |      dk(  sJ |d   j                  d|      }|dk(  r|| _        |S )a  
        Split the loop into main and tail loops at given `depth` so that the range
        of the main loop has range `floor_div(range, factor) * factor` and
        the tail loop handles the remainder. The main loop is tiled
        according to the `factor`.
        r6   r   )r  r   r  r  )r   r  r  r  split_loopss        rW   r  z#LoopNestWithSplit.split_with_tiling\  sN     !!%(5zQAh00F;A:#DIrV   c                     | j                   r| j                   gS g }| j                  J | j                  D ]  }||j                         z  } |S )z+Get all kernel objects under this loop nest)r  r  r  r  s      rW   r  zLoopNestWithSplit.get_kernelsj  sU    ;;KK= #%yy$$$II 	*Dt''))G	*rV   )r   r!  r"  rI  r  r	   r   r  r  r  r?  rJ  r  r  r  r'   r  r  r  r  r  rU   rV   rW   r   r     s    	 '+D(4	?
#*"&FHY&i  .T)_   $

&T)_ rV   r   r  r   )r  r  ro  r  r(  rp  r  r  r   r   enumr   typingr   r   r   r	   r
   r   r   r   r   r   torch.fxtorch._inductorr   torch._prims_commonr   r   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rR   r   r   r   r   r   r   r  r    r   r!   r"   r#   r$   r%   r&   utilsr'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   virtualizedr2   r3   r4   r5   commonr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   	cpp_utilsrD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   r  rT   rX   _logginggetArtifactLoggerr   schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPrl  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPrS  rT  r   r  rn   rk   r  r  r  rS  ru   r   r  rv   r   r   r   r   r   	lru_cacherK  r   rm   r   r   r   r'  r=  _initialize_pointwise_overridesrM  r7  r:  r?  rU  r<  r  r  r*  r  r  r  r  r  r  	dataclassr  r   rU   rV   rW   <module>r	     s7        	 
    J J J    ( @ K K O O % G G       > =        llg%: ~~//*E7    #&   
NN	MM 
MM	KK	NN	MM	JJ	KK	JJ	KK	KK
* T%++& 
 
KK	NN	MM	KK	JJ1 D- )@ %) ) ELL!	 )F -UZZ -ell - - ;uzz ;

 ;PS ; ;| FJ!::!!LL!6>sm! !f("4 f(R! !B\; \~  , ,U 3l7l l7^  / / 9  % % '7 7k# k#\H9 HVM
l M
`)$ )$eHU[[4I44O.P )$X_K _KDKCY KC\
9 
2Ft Fq
&N q
&hD
 D
N+K +%7 %7P c c cL p p prV   