
    sgQ              	         d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZ d dlZddlmZmZ ddlmZmZmZmZ ddlmZmZmZ ddlmZmZmZmZmZm Z  dd	l!m"Z" d
dl#m$Z$ e
rddlm%Z%  ejL                  e'      Z(ejR                  jT                  Z*d Z+d Z,dddddddddddddddddddddgZ- e.d e-D              Z/ej`                  jb                  r e.d e/D              Z/ ejd                  e$e/      Z3dZ4	  ede+de4z   dz   e4z   dz          Z5d!Z6 ed"e,d#e6z   d$z   e6z   d%z          Z7 eejp                  d&d'e*jp                  jr                  (      Z:d) Z; ee;d      Z< G d* d+e      Z=	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2d,Z>d- Z?d. Z@ ee*jp                        	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d3d/       Z8 ee*j                        d0        ZAd1 ZB ee*jp                  eB       y)4    )annotationsN)castListOptionalSequenceTupleTYPE_CHECKING	TypedDict   )configir)add_layout_constraintconstrain_to_fx_strides	loweringsregister_lowering)autotune_select_algorithmExternKernelChoiceTritonTemplate)ceildivis_onesis_zerospad_listlikesympy_productuse_triton_template)V   )filtered_configs)	TensorBoxc                R    t        | |z  |z  |d         t        ||d         |d   fS NBLOCK_MBLOCK_NGROUPSr   )nchwmetas        N/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/kernel/conv.pyconv2d_gridr+   ,   s7    A	4	?+4	?#X     c                X    t        | |z  |z  |z  |d         t        ||d         |d   fS r    r$   )r%   r&   dr'   r(   r)   s         r*   conv3d_gridr/   4   s;    A	AtI/4	?#X r,   )@         r      T)r   cond)r1   r0   r2   r   r3   )i   r2   r2   r      )   r6       r   r5   )r0   r0   r7   r   r3   )r0   r1   r7   r   r5   )r1   r0   r7   r   r5   c           	   #     K   | ]8  }|d    r1t        t        t        t        t        t        t        f   |d          : yw)r4   r   N)r   r   int.0r   s     r*   	<genexpr>r<   J   s;      f~ 	sCc3&	')9:s   >A c              #  B   K   | ]  }|d    |d   |d   d|d   f  yw)r   r   r   r3   N r:   s     r*   r<   r<   R   s0      <BF1Ivay!VAY7s   )configsa  
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolution2dag  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_H = {{size("X", 2)}}
    IN_W = {{size("X", 3)}}
    OUT_C = {{size(None, 1)}}
    OUT_H = {{size(None, 2)}}
    OUT_W = {{size(None, 3)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xh = {{stride("X", 2)}}
    stride_xw = {{stride("X", 3)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wh = {{stride("W", 2)}}
    stride_ww = {{stride("W", 3)}}

    nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = nhw % OUT_W
    nh = nhw // OUT_W
    idx_y_h = nh % OUT_H
    idx_n = nh // OUT_H
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        a  
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (ijk % BLOCK_K_COUNT) * BLOCK_K
        ij = ijk // BLOCK_K_COUNT
        i = ij // KERNEL_W
        j = ij % KERNEL_W
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}}
)namegridsourcea  
        idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_d * stride_xd)[:, None]
            + (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_d >= 0)[:, None]
            & (idx_x_d < IN_D)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] +
            (d * stride_wd) + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
convolution3daH  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_D = {{size("X", 2)}}
    IN_H = {{size("X", 3)}}
    IN_W = {{size("X", 4)}}
    OUT_C = {{size(None, 1)}}
    OUT_D = {{size(None, 2)}}
    OUT_H = {{size(None, 3)}}
    OUT_W = {{size(None, 4)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xd = {{stride("X", 2)}}
    stride_xh = {{stride("X", 3)}}
    stride_xw = {{stride("X", 4)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wd = {{stride("W", 2)}}
    stride_wh = {{stride("W", 3)}}
    stride_ww = {{stride("W", 4)}}

    ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = ndhw % OUT_W
    ndh = ndhw // OUT_W
    idx_y_h = ndh % OUT_H
    nd = ndh // OUT_H
    idx_y_d = nd % OUT_D
    idx_n = nd // OUT_D
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for d in range(KERNEL_D) %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    d = {{d}}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        aF  
{% endfor %}
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for d in range(KERNEL_D):
    #   for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (dijk % BLOCK_K_COUNT) * BLOCK_K
        dij = dijk // BLOCK_K_COUNT
        j = dij % KERNEL_W
        di = dij // KERNEL_W
        i = di % KERNEL_H
        d = di // KERNEL_H
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_d < OUT_D)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_d = idx_y_d[:, None]
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}}
zat::convolutionF)has_out_variantop_overloadc          
         t        j                  t        j                  |d      d      }t        j                  | j                  dddd      |j                  dd      |j                  dddd            S )Nr   r      r   )out)torchsqueezematmulpermute)xr(   rJ   s      r*   conv1x1_via_mmrP   i  s]    emmAr*B/A<<			!Q1qyyACKK1a4K r,   c                  J    e Zd ZU ded<   ded<   ded<   ded<   ded<   ded	<   y
)ConvLayoutParamstuple[int, ...]stridepaddingdilationbool
transposedoutput_paddingr9   groupsN)__name__
__module____qualname____annotations__r>   r,   r*   rR   rR   s  s%    ##Kr,   rR   c	                l   t         j                  j                  5  t        j                  j
                  j                  t        j                  | d      t        j                  |d      t        j                  |d      t         j                  j                  j                  |      t         j                  j                  j                  |      t         j                  j                  j                  |      |t         j                  j                  j                  |      |	      }	t        j                  |	j                               }
t        j                  |	j                               }ddd       t        j                  | j                         | j!                         
|      S # 1 sw Y   =xY w)z)Determine output layout for a convolutionT)guard_shapeN)r   graph	fake_moderK   opsatenconvolutionr   ir_node_to_tensorsizevars
size_hintsconvert_shape_to_inductorsizerT   FixedLayout
get_device	get_dtype)rO   weightbiasrT   rU   rV   rX   rY   rZ   outputsizess              r*   conv_layoutrr   |  s-    
		 ?++  5  T:  48GG''/GG''0GG''1GG''7

 ,,V[[];--fmmo>? >>			 ? ?s   EF**F3c                    t        t        t        |                   }|j                  d|j	                  d             |S )Nr   rH   )listreversedrangeinsertpop)rankorders     r*   channels_last_orderr{     s0    %+&'E	LLEIIbM"Lr,   c                   t        |j                               }t        |dz
        D ]   }t        t        j
                     |d      }" t        t        j                     |ddg      }t        j                  j                  | t        |            } t        t        |            }|j                  |j                  d             t        t        j                     | |      } | j                         ^ }}t        t        j                     | t        |      |g      } |t        t        j                      | |      }nt        t        j"                     || |      }t        t        j                     |g |d      }t        t        |            }	|	j%                  d|	j                  d             t        t        j                     ||	      S )Nr   rH   dimr   r   )lenget_sizerv   Lrd   rL   rN   r   ExternKernelrequire_stride_orderr{   rt   appendrx   reshaper   mmaddmmrw   )
rO   rn   ro   ry   _	x_permuterq   in_chanresultresult_permutes
             r*   convert_1x1_conv_to_mmr     sa   v !D4!8_ 14<<R01t||_VaV,F
,,Q0CD0IJAU4[!IY]]1%&	$,,9%AjjlOUG	$,,M%0':;A|477Av&4::tQ/t||_V\u\b\2F%+&N!^//34T\\?6>22r,   c	                0    t        |      }t        |      }t        |      }t        |      }t        |t              s)t        j                  j
                  j                  |      }t        |t              sJ t        t        j                  j
                  j                  |            }t        t        j                  j
                  j                  |            }||||||dt         j                               t        j                               dz
  k(  rVt        t        j                     t        t        t        j                      dg j                               |fi d      S t        j                  j
                  j                  j                               ^}	}
}t        |      t        |      }t        |      }t        |      }t        |      } fd}t         j"                  xs t         j$                  }t         j&                  s	|r |       rt)        |      rt)        |      rvt+        |      rkt)        |      r`|s^t+        |      rS|dk(  rNt        j                  j
                  j-                  t/         j                               d      rt1         |      S |wt3        j4                         dk7  r_t         d fi }t        t        j6                     |t        t        j8                     ||j                         d   gdgz  z               S  j;                          j;                          t        j                  j<                  rudk(  rpt        j                  xj>                  dz  c_        t2        j@                  jC                          t2        j@                  jC                        tE         d fi }ntE         d fi }t3        jF                  t        j                  j
                  jI                  |jJ                              }t2        j@                  jM                   |       t2        j@                  jM                  |      g d}| g}d d<   |jO                  dd       n\ |g}|j;                          |jQ                          t        j                  j
                  j                  |j                                g }tR        jT                  jV                  jY                  d	      rt[        j\                  |||fi g}tR        jT                  jV                  jY                  d
      r)t_        |      rt)        |      r|st+        |      rt        j                  j
                  ja                  |
 j                         d         rt)        |      r@t)        |      r5t+        |      r*|dk(  r%|jc                  td        j]                  ||             tg        t/         j                         d   g j                         dd        |	|
      D ]<  }dk(  rti        jj                  |f f||d   |d   |d   |d   |d   |d   |t)        |      tR        jl                  jn                  jp                  |jr                  |jt                  d|jv                   dk(  sty        jj                  |fi d fd|d|d   d|d   d|d   d|d   d|d   d|d   d|d   d|d   d|d   d|dt)        |      dtR        jl                  jn                  jp                  d|jr                  d|jt                  |jv                   ? t{        d|||      S )N)rT   rU   rV   rX   rY   rZ   r   r   r}   c                    t         j                  j                  rdk(  ryt        d fi } t	        j
                  t         j                  j                  j                  | j                              }|t        j                  k(  S )Nr   T)
r   ra   
layout_optrr   r   get_stride_orderrg   rh   rT   NHWC_STRIDE_ORDER)layoutreq_stride_orderkwargsndimrn   rO   s     r*   channels_last_convz'convolution.<locals>.channels_last_conv  sl    77$!)Q77..GG''6
  2#7#777r,   cpur   ro   ATENTRITON)input_nodesr   KERNEL_HKERNEL_WSTRIDE_HSTRIDE_W	PADDING_H	PADDING_Wr#   UNROLL
ALLOW_TF32
num_stages	num_warpsrI   r   r   KERNEL_Dr   r   STRIDE_Dr   r   	PADDING_Dr   r   r#   r   r   r   r   re   )>tuple
isinstancer9   r   ra   rg   evaluate_static_shapeevaluate_static_shapesr   r   r   rd   rL   re   expandr   r   max_autotunemax_autotune_gemmconv_1x1_as_mmr   r   statically_known_gtr   r   r   get_device_typeaddviewrealizer   num_channels_last_convr   require_channels_lastrr   r   rh   rT   r   rw   freeze_layoutrK   	_inductorutils_use_conv_autotune_backendaten_convolutionbindr   statically_known_equalsr   aten_conv1x1_via_mmconv_configsconv2d_templatemaybe_append_choicebackendscudnn
allow_tf32r   r   r   conv3d_templater   )rO   rn   ro   rT   rU   rV   rX   rY   rZ   out_chanr   kernel_shaper   autotuning_gemmr   r   r   ordered_kwargs_for_cpp_kernelargschoicescfgr   r   s   ``                   @@r*   re   re     s    6]FGnGXH>*Nfc"!!77?fc""" 177##::6BCFAGG$$;;GDEG  ( F 1::<C 12Q66$++q1*<qzz|*<=vtVvV
 	

 ()ww'7'7'N'N($Hg |D&$'F7D)GHd+H!.$7N8 ))EV-E-EO 
		?7I7KL!FOWH^$aKGG00qzz|1LaP%a66B..q1U:Q77{AdiiL(9!(<'=s
'JK
 	
 IIK
NN
 	wwdai	&&!+&OO11!4 66v>Q77Q77..GG''6
 OO004DE55f>NO%! |6{v%,,Q764 	//@G77?!!- 	
 	88B'H^$GG44Wajjl1oN L!!!NN.33D&AB1::<?>QZZ\!"-=>?
 /	C
 qy33!"F!)!_)!_#AY#AY%aj%aj! #<0$~~33>>"~~!mm!" jj#& 33!"F " *!_	
 *!_ *!_ $AY $AY $AY &aj &aj &aj "  #<0!"  %~~33>>#$  #~~%& "mmjj)5/	b %]GT6JJr,   c                (    t        | ||||||||	      S N)re   )rO   rn   ro   rT   rU   rV   rX   rY   rZ   	benchmarkdeterministiccudnn_enabledr   s                r*   _convolutionr     s%      	64(JPV r,   c                    | j                   t        j                  j                  j                  j
                  k(  sJ t        j                  j                  r||fS t        | g|i |S r   )
targetrK   rc   rd   re   defaultr   ra   r   r   )fx_noder   r   s      r*   constrain_conv_to_fx_stridesr     sT    >>UYY^^77?????wwV|&w@@@@r,   )rO   r   rn   r   ro   zOptional[TensorBox]rT   zSequence[int]rU   rS   rV   rS   rX   rW   rY   rS   rZ   r9   returnz	ir.Layout)rO   r   rn   r   ro   r   rT   	List[int]rU   r   rV   r   rX   rW   rY   r   rZ   r9   )C
__future__r   	functoolsloggingtypingr   r   r   r   r   r	   r
   rK    r   r   loweringr   r   r   r   r   select_algorithmr   r   r   r   r   r   r   r   r   r   virtualizedr   	mm_commonr   r   	getLoggerr[   logrc   rd   r+   r/   kernel_configsr   platform_configsversionhippartialr   LOOP_BODY_2Dr   LOOP_BODY_3Dr   re   r   r   rP   r   rR   rr   r{   r   r   r   r>   r,   r*   <module>r      s?   #   R R R    
   ' g! yy~~ #D1"D1#T2#T2!40"D1"D1	      	== FV  !y  
8
 !		3h i4jkCH IDJKUYvB !		;x y<z{O` aPbccgR &	  ((	  )> y       	 
       $     F3. 4##$JKJKJK JK 	JK
 JK JK JK JK JK %JKZ 4$$% &(A d&&(D Er,   