
    sgQ6              
          d dl Z d dlmZmZ d dlmZ  e j                  ed      Z e j                  ed      Z e       r(d dl	Z	d dl	m
Z e	j                  	 	 d,d       Ze	j                  	 	 	 	 d-d	       Z e	j                   e	j                   dd
idd       e	j                   dd
idd       e	j                   ddidd       e	j                   ddidd      gg       e	j                  	 	 d,d              Z e	j                   e	j                   ddidd      gg       e	j                  	 	 d,d              Z e	j                   e	j                   d
d
ddd       e	j                   d
d
ddd       e	j                   ddddd       e	j                   ddddd      gg       e	j                  	 	 	 	 d.d              Zd Z e	j                   e	j                   dd
idd       e	j                   ddidd      gg dddei      e	j                  	 	 d,d              Ze	j                  	 	 d,d       Ze	j                  	 	 d,d       Ze	j                  	 	 d,d       Ze	j                  d         Ze	j                  	 	 	 	 d/d!       Ze	j                  	 	 	 	 d0d"       Ze	j                  d1d#       Ze	j                  dej:                  fd$       Ze	j                  dej:                  fd%       Zd d&l m!Z!m"Z" e	j                  	 	 d,d'       Z#e	j                  	 	 d,d(       Z$e	j                  	 	 d,d)       Z%e	j                  	 	 d,d*       Z&e	j                  	 	 d,d+       Z'yy)2    N)HAS_CUDAHAS_GPU)
has_tritonzrequires cudazrequires gpu)language
BLOCK_SIZEc                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
|	|
z   }t        j                  ||z   ||       y Nr   axismasktl
program_idarangeloadstorein_ptr0in_ptr1out_ptr
n_elementsr   pidblock_startoffsetsr   xyoutputs               W/var/www/html/venv/lib/python3.12/site-packages/torch/testing/_internal/triton_utils.py
add_kernelr            mm#J&		!Z 88#GGGg%D1GGGg%D1Q
7"F6    c                 .   t        j                  d      }||z  }|t        j                  d|      z   }||k  }	t        j                  | |z   |	      }
|dk(  r t        j                  ||z   |	      }|
|z   }n|
}t        j                  ||z   ||	       y )Nr   r
   r   twor   )r   r   r   r   ARGS_PASSEDr   r   r   r   r   r   r   r   s                r   add_kernel_with_optional_paramr&   "   s     mm#J&		!Z 88#GGGg%D1%')5AUFF
7"F6r"            )
num_stages	num_warps   @   )configskeyc                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
|	|
z   }t        j                  ||z   ||       y r	   r   r   s               r   add_kernel_autotunedr1   7   s    " mm#J&		!Z 88#GGGg%D1GGGg%D1Q
7"F6r"         c                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
|	|
z   }t        j                  ||z   ||       y r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   s               r   &add_kernel_autotuned_weird_param_orderr5   Q   s      mm#J&		!Z 88#GGGg%D1GGGg%D1Q
7"F6r"   )BLOCK_SIZE_XBLOCK_SIZE_Yc                    t        j                  d      |z  }|t        j                  d|      d d d f   z   }||k  }	t        j                  d      |z  }
|
t        j                  d|      d d d f   z   }||k  }|}|}t        j                  | |||z  z   z   |	|z        }t        j                  | |||z  z   z   |	|z        }||z   }t        j                  ||||z  z   z   ||	|z         y )Nr      r   )r   r   r   
x_elements
y_elementsr6   r7   xoffsetxindexxmaskyoffsetyindexymaskx1y0tmp0tmp1tmp2s                     r   add_kernel_2d_autotunedrG   j   s    6 --"\1299Q5ag>>#--"\1299Q5dAg>>#www"
R"8955=Iwww"
R"8955=Id{
B*r/23T55=Ir"   c                     | S )N )r.   ___s      r   _dummy_early_config_prunerL      s    r"   
      early_config_prune)r.   r/   warmuprepprune_configs_byc                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
|	|
z   }t        j                  ||z   ||       y r	   r   r   s               r   *add_kernel_autotuned_with_unsupported_argsrT      s    $ mm#J&		!Z 88#GGGg%D1GGGg%D1Q
7"F6r"   c                 $   t        j                  d      }||z  }|t        j                  d|      z   }||k  }	t        j                  | |z   |	      }
t        j                  ||z   |	      }|
|z   |z  }t        j                  ||z   ||	       y r	   r   )r   r   r   r   scaling_factorr   r   r   r   r   r   r   r   s                r   add_kernel_with_scalingrW      s     mm#J&		!Z 88#GGGg%D1GGGg%D1a%>)
7"F6r"   c                     t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }d|z  }	t        j                  ||z   |	|       y Nr   r
   r   r3   r   )
r   r   r   r   r   r   r   r   r   r   s
             r   mul2_kernelrZ      sn     mm#J&		!Z 88#GGGg%D1Q
7"F6r"   c                     t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }d|z  }t        j                  | |z   ||       y rY   r   )	ptrr   r   r   r   r   r   r   r   s	            r   mul2_inplace_kernelr]      sl     mm#J&		!Z 88#GGC'M-Q
wT2r"   c                 6    t        j                  | dk\  | d      S )Nr   )r   where)r   s    r   	zero_negsr`      s    xxQ1%%r"   c                 2   t        j                  d      }||z  }|t        j                  d|      z   }||k  }|dk(  rt        | ||       n|dk(  rt	        | | |||       t        j
                  | |z   |      }	t        j                  ||z   |	|       y )Nr   r
   r]   )r   r    r   )r   r   r   r]   r    r   r   )
r   r   r   r   
ACTIVATIONr   r   r   r   r   s
             r   indirection_kernelrc      s     mm#J&		!Z 88#..
K<'w*TGGGg%D1
7"AD1r"   c                    t        j                  d      }t        j                  d      }||z  }||z  }	|t        j                  d|      z   }
|	t        j                  d|      z   }|d d d f   |z  |
d d d f   z   }|d d d f   |z  |
d d d f   z   }t        j                  | |z         }t        j                  ||z   |dz         y )Nr   r
   r9   g       @r   )in_ptrr   in_y_strideout_y_strideX_BLOCK_SIZEY_BLOCK_SIZExidyidx_starty_start	x_offsets	y_offsetssrc_offsetsdst_offsetssrcs                  r   double_strided_kernelrs      s     mm#mm#$$bii<88	bii<88	4(;6479KK4(<7)D!G:LLggf{*+
;&c	2r"   c                    t        j                  | t        j                  d|      z         }t        j                  |t        j                  d|      z         }t        j                  |g|t         j                        }t        j
                  dd|||gt         j                  dd      }t        j                  |t        j                  d|      z   |       y )Nr   zshf.l.wrap.b32 $0, $1, $2, $3;z
=r,r, r, rTr9   )dtypeis_purepack)r   r   r   fullint32inline_asm_elementwiser   )	XYZnBLOCKr   r   szs	            r   inline_asm_kernelr   
  s    GGA		!U++,GGA		!U++,GGUGQ)%%,1I((
 	RYYq%((!,r"   c           
         t        j                  d      }||z  }t        j                  t        j                  | |gdg|g|gdg      dg      }t        j                  t        j                  ||gdg|g|gdg      dg      }||z   }	t        j                  t        j                  ||gdg|g|gdg      |	dg       y Nr   r
   r9   )baseshapestridesr   block_shapeorder)boundary_checkr   r   r   make_block_ptrr   )
x_ptry_ptr
output_ptrr   r   r   r   r   r   r   s
             r   add_kernel_with_block_ptrr     s     mm#J&GG!l$'Lc 3

 GG!l$'Lc 3

 Q
!l$'Lc 3	
r"   c                 ,   t        j                  d      }||z  }t        j                  t        j                  | |dgddg|dg|dgddg      dg      }|}t        j                  t        j                  ||dgddg|dg|dgddg      |dg       y r   r   )r   r   r   r   r   r   r   r   s           r   kernel_with_block_ptr_2dr   G  s     mm#J&GG!1oA$a('O!f 3

 
!1oA$a('O!f 3	
r"   )r   r   c                     t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        | |z   |      }	t        ||z   |      }
|	|
z   }t	        ||z   ||       y r	   r   r   s               r   add_kernel_with_importr   k  sw     mm#J&		!Z 88#7".7".Qgd3r"   c                 Z   t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
t        j                  d      dk(  r|	|
z   }n|	|
z  }t        j                  ||z   ||       y r	   r   r   s               r   cond_op_kernelr   |  s     mm#J&		!Z 88#GGGg%D1GGGg%D1==q UFUF
7"F6r"   c                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
|	|
z   }t        j                  ||z   ||       y r	   )r   r   r   r   
atomic_addr   s               r   atomic_add_kernelr     s     mm#J&		!Z 88#GGGg%D1GGGg%D1Q
g'd;r"   c                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
t	        d      D ]"  }|	|
z   }t        j
                  ||z   ||       $ d}|dkD  r,|dz  }|	|
z   }t        j
                  ||z   ||       |dkD  r+y y )Nr   r
   r   r3   r9   )r   r   r   r   ranger   )r   r   r   r   r   r   r   r   r   r   r   ir   s                r   add_4_times_kernelr     s     mm#J&		!Z 88#GGGg%D1GGGg%D1q 	;AUFHHWw&T:	; !eFAUFHHWw&T: !er"   c                    t        j                  d      }||z  }|t        j                  d|      z   }||k  }t        j                  | |z   |      }	t        j                  ||z   |      }
|	|
z   }t        j                  ||z   ||       y r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   s               r   add_kernel_out_of_order_fn2r     r!   r"   )r   tl.constexpr)r%   r   r   r   )r6   r   r7   r   )r   r   rb   r   )rh   r   ri   r   )r~   r   r   r   )(unittest&torch.testing._internal.inductor_utilsr   r   torch.utils._tritonr   
skipUnlessrequires_cudarequires_gputritonr   r   jitr    r&   autotuneConfigr1   r5   rG   rL   rT   rW   rZ   r]   r`   rc   rs   r   	constexprr   r   triton.languager   r   r   r   r   r   r   rI   r"   r   <module>r      s    D * $##Ho>"x""7N;<% ZZ7
 #7 7  ZZ7
 $7 #7 7( V__FMM<-!qIFMM<-!qIFMM<,aHFMM<,aH	
  ZZ7
 #7 7  V__FMM<,aH
 	 ZZ7 #	7 7$ V__FMM!$c:qTU FMM!$c:qTU FMM!#R8QRS FMM!#R8QRS
 " ZZJ %J %J #$J, V__FMM<-!qIFMM<,aH
 .0IJ	 ZZ7
 #7 	7  ZZ7 #7 7" ZZ7 #	7 7 ZZ3 #3 3 ZZ& & ZZ2 #	2
 #2 2$ ZZ3
 %3 %3 3& ZZ- - ZZ+

 LL+
 +
Z ZZ
 LL	
 
B ,ZZ4
 #4 4  ZZ7
 #7 7& ZZ<
 #< <  ZZ;
 #; ;, ZZ7
 #7 7[ r"   