
    sg3                        d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
Z
d dlmZmZmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ  e
j.                         defd       Z e
j.                         d	        Z e
j.                         d
efd       Z e
j.                  d      d        Z e	d       G d d             Z G d de      Zy)    )BaseBackend	GPUTarget)irpassesllvmnvidia)	dataclassN)AnyTupleOptional)Pathbinaryc                    t         j                  j                  d| j                          dd      t         j                  j                  t         j                  j                  t              d|       g}|D ]  }t         j                  j                  |      s#t         j                  j                  |      sCt        j                  |dgt        j                        }|mt        j                  d|j                  d      t        j                   	      }|||j#                  d
      fc S  t%        d|        )NTRITON__PATH bin	--version)stderrz.*release (\d+\.\d+).*utf-8flags   zCannot find )osenvirongetupperpathjoindirname__file__existsisfile
subprocesscheck_outputSTDOUTresearchdecode	MULTILINEgroupRuntimeError)r   pathsr   resultversions        R/var/www/html/venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py_path_to_binaryr1      s     	

 06;
RWW__X.v>E
  177>>#277>>##6,,c;-?
HYHYZF!))$=v}}W?U]_]i]ij&a 0001 fX.
//    c                  j    t        j                  t        d      d   dg      j                  d      } | S )Nptxasr   r   r   )r$   r%   r1   r)   )r/   s    r0   get_ptxas_versionr5   !   s2    %%w'?'BK&PQXXY`aGNr2   returnc                     t        | t              sJ t        t        | j	                  d            \  }}|dk(  rd|z   S |dk(  rd|z   S |dk(  rd|z   S t        d      )	zK
    Get the highest PTX version supported by the current CUDA driver.
    .   P      F   
   ?   z'Triton only support CUDA 10.0 or higher)
isinstancestrmapintsplitr,   )cuda_versionmajorminors      r0   ptx_get_versionrG   '   sn    
 lC(((sL..s34LE5{Ez{Ez{Ez
@
AAr2   c                     t        | d      5 }t        j                  |j                               j	                         cd d d        S # 1 sw Y   y xY w)Nrb)openhashlibsha256read	hexdigest)r   fs     r0   	file_hashrP   7   s>    	dD	 4Q~~affh'1134 4 4s   1AAT)frozenc                       e Zd ZU dZeed<   dZeed<   dZeed<   dZe	e   ed<   d	Z
eed
<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZeed<   d Zd Zy)CUDAOptions   	num_warpsr   num_ctas   
num_stagesNmaxnreg)r   r   r   cluster_dimsptx_versionTenable_fp_fusionFallow_fp8e4nvallow_fp8e4b15tf32default_dot_input_precision)r_   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namec                    t        t              j                  dz  }| j                  i nt	        | j                        }|j                  dd       s%t        j                  dt        |dz              |d<   t        j                  | dt        |j                                      | j                  dkD  r| j                  | j                  dz
  z  dk(  sJ d       y )	Nlib	libdeviceTRITON_LIBDEVICE_PATHzlibdevice.10.bcre   r   r   znum_warps must be a power of 2)r   r!   parentre   dictr   r   getenvr@   object__setattr__tupleitemsrU   )selfdefault_libdirre   s      r0   __post_init__zCUDAOptions.__post_init__Q   s    h..6 ,,4b$t?O?O:P{D1')yy1H#n_pNpJq'rK$4k6G6G6I0JK~~!t~~!9K'LQR&R 	0/	0R&Rr2   c           	      ^   t        | j                        }t        d t        |d         D              |d<   dj	                  t        |j                               D cg c]  \  }}| d|  c}}      }t        j                  |j                  d            j                         S c c}}w )Nc              3   <   K   | ]  \  }}|t        |      f  y wN)rP   ).0kvs      r0   	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>\   s     (htq!!Yq\):(hs   re   _-r   )
rn   __dict__rr   sortedr   rs   rK   rL   encoderN   )rt   	hash_dictnamevalkeys        r0   hashzCUDAOptions.hashZ   s    '	#((hviXeNfGg(h#h	- hh	@Q9RSID#4&#ST~~cjj12<<>> Ts   B)
)__name__
__module____qualname__rU   rB   __annotations__rV   rX   rY   r   rZ   rr   r[   r\   boolr]   r^   r`   r@   rc   r   rd   re   rn   rf   rh   rv   r    r2   r0   rS   rS   =   s    IsHcJ "GXc]!#L%#K!d!M4 ND '--/I %*I*.!4.KE4L#0?r2   rS   c                        e Zd Zedefd       Zdeddf fdZdefdZd Z	d Z
d	 Zed
        Zed        Zed        Zed        Zed        Zd Z ej&                         d        Z xZS )CUDABackendtargetc                      | j                   dk(  S )Nrg   )backend)r   s    r0   supports_targetzCUDABackend.supports_targetc   s    ~~''r2   r6   Nc                     t         |   |       |j                  | _        t	        | j                  t
              sJ d| _        y )Ncubin)super__init__arch
capabilityr?   rB   
binary_ext)rt   r   	__class__s     r0   r   zCUDABackend.__init__g   s6      ++$//3///!r2   c                     t         j                  j                         D ci c]  }||v s|||    }}| j                  dk\  |d<   | j                  dk  |d<   | j                  dk(  rdnd|d<   t        di |S c c}w )	NY   r]   Z   r^   i   @r   rd   r   )rS   __dataclass_fields__keysr   )rt   optsr{   argss       r0   parse_optionszCUDABackend.parse_optionsm   s    $/$D$D$I$I$KYqqTXy47
YY $2 5_!%2!59=B9NTU,-"T""	 Zs
   	A:A:c                     |j                   |j                  |j                  |j                  d   |j                  d   |j                  d   fS )Nr   r      )rU   rV   sharedrZ   )rt   metadatas     r0   pack_metadatazCUDABackend.pack_metadatat   sO    OO!!!$!!!$!!!$
 	
r2   c                 v    dd l mc mc m} d| j                  dk\  r|j
                  i}|S |j                  i}|S )Nr   convert_custom_typesr:   )triton.language.extra.cudalanguageextrarg   r   convert_custom_float8_sm80convert_custom_float8_sm70)rt   rg   codegen_fnss      r0   get_codegen_implementationz&CUDABackend.get_codegen_implementation~   sJ    11"/3"/DD++
  KOJiJi
 r2   c                 .    t        j                  |       y ry   )r   load_dialects)rt   ctxs     r0   r   zCUDABackend.load_dialects   s    S!r2   c                 v   t        j                  | j                        }|j                          t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j
                  j                  |       t        j
                  j                  |       |j                  |        | S ry   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointeradd_combineadd_canonicalizeradd_reorder_broadcastadd_cseadd_licmadd_symbol_dcerun)modr   optpms       r0   	make_ttirzCUDABackend.make_ttir   s    __S[[)
!!"%..r2#''+))"-b!r"$$R(
s
r2   c                    t        j                         }|j                  <|j                  d   |_        |j                  d   |_        |j                  d   |_        t        j                  | j                        }|j                          t        j                  j                  |d| |j                  d|j                         t        j                  j!                  |       |dz  dk\  rt        j                  j#                  |       t         j                  j$                  j'                  ||       t        j                  j)                  |       t        j                  j+                  |       t        j                  j-                  |       t        j                  j)                  |       t        j                  j/                  ||dk\         t        j0                  j3                  |       |dz  dk\  rIt        j                  j5                  |       t        j                  j7                  ||j8                         t        j                  j;                  |       t        j                  j/                  ||dk\         t        j                  j)                  |       t        j                  j=                  |       t        j                  j?                  |       t        j0                  j3                  |       t        j0                  jA                  |       |dz  d	k\  rRt         j                  j$                  jC                  |       t         j                  j$                  jE                  |       t        j0                  jG                  |       |jI                  |        |j                  |j                  |j
                  f|d
<   | S )Nr   r   r   zcuda:    r=      r:   	   rZ   )%r   ClusterInforZ   clusterDimXclusterDimYclusterDimZr   r   r   r   r   r   add_convert_to_ttgpuirrU   rV   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operandsr   r    add_combine_tensor_select_and_ifadd_pipelinerX   add_prefetchadd_reduce_data_duplicationadd_reorder_instructionsr   add_fence_insertionadd_tma_loweringr   r   )r   r   r   r   cluster_infor   s         r0   
make_ttgirzCUDABackend.make_ttgir   s   ))+''*'7'7':L$'*'7'7':L$'*'7'7':L$__S[[)
**2zl/CS]]TVX[XdXde##B'q NN))"-,,R>44R833B7,,R044R800Z25EFb!q NN;;B?NN''CNN;##B'00Z25EF44R82226//3b!$$R(q MM##77;MM##44R8''+
s$0$<$<l>V>VXdXpXp#q 
r2   c                    | j                  d      }||dxx   |z  cc<   | }t        j                  |j                        }|j	                          t
        j                  j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j                  |       t
        j                  j                  j                  ||       t
        j                  j                  j!                  |       t        j                  j#                  |       t        j$                  j'                  |       t        j$                  j)                  |       t        j$                  j+                  |       t,        j.                  j1                  dd      dk(  rt        j2                  j5                  |       |j7                  |       t9        j:                          t9        j                         }t9        j<                  ||      }t        j>                  |       |j@                  R|jC                         D ]?  }	|	jE                         r|	jG                         s%|	jI                  |j@                         A |jJ                  r4|jJ                  D 
cg c]  \  }
}|	 }}
}t9        jL                  ||       t9        jN                  |t8        jP                         | j                  d      |d<   tS        |      }~~|S c c}}
w )Nz"triton_gpu.num-warp-groups-per-ctarU   TRITON_DISABLE_LINE_INFO0ztriton_gpu.sharedr   )*get_int_attrr   r   r   r   r   r   r   %add_decompose_unsupported_conversionsr   convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryadd_to_llvmirr   add_nvgpu_to_llvmadd_arith_to_llvmirr   r   r   r   r   r   r   llvmiradd_di_scoper   r   init_targets	to_moduleset_nvvm_reflect_ftzrY   get_functionsis_declarationis_external_linkageset_nvvm_maxnregre   link_extern_libsoptimize_moduleOPTIMIZE_O3r@   )srcr   optionsr   num_warp_groupsr   r   r   llvm_modr{   r   r   r-   rets                 r0   	make_llirzCUDABackend.make_llir   s]    **+OP&[!_4!__S[[)
CCBG77;$$R(**2.11"5++B
;11"5**2.''+b!$$R(::>>4c:cAMM&&r*
s,,.>>#w/##H- ??&++- 8'')a.C.C.E&&w78 .5.A.ABltTTBEB!!(E2Xt'7'78 !--.AB(m
 Cs   %Mc           	      H   |j                   }|t        d      \  }}t        |      }t        d|      }d}|dk(  rdnd| }	d| }
t	        j
                  | ||	|
dg|j                  d	      }t        j                  d
|      }t        |      dk(  sJ |d   |d<   |dz   d|dz   }t        j                  dd| |t        j                        }t        j                  dd|      }t        j                  j                  dd      dk(  rt        d       t        |       |S )Nr4   S   znvptx64-nvidia-cudar   sm_90asm_z+ptxznvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   r   r=   r8   z\.version \d+\.\d+z	.version r   z,\s*debug|debug,\s*r   NVPTX_ENABLE_DUMPr   1z // -----// NVPTX Dump //----- //)r[   r1   rG   minr   translate_to_asmr\   r'   findalllensubr*   r   r   r   print)r   r   r   r   r[   r~   rD   llvm_ptx_versiontripleprocfeaturesr   namess                r0   make_ptxzCUDABackend.make_ptx   s4   oo-g6OA|),7K r;/&%+x3zl1C*+,##CxBSATVYVjVjlqr

FL5zQ 8$b);r>*:;ff*i},EsRTR^R^_ff+R5::>>-s3s:45#J
r2   c                 ~   t        d      \  }}t        j                  ddd      5 }t        j                  ddd      5 }|j                  |        |j	                          |j
                  dz   }t        j                  j                  d	      rd
nd}	|j                  rd
nd}
|dk(  rdnd}t        j                  j                  dd      dk(  r*| |	 |
 d| | |j
                   d| d|j
                   }n)| |	 |
 d| | |j
                   d| d|j
                   }	 t        j                  |dd       	 t        j&                  j)                  |j
                        rt        j*                  |j
                         t        j&                  j)                  |j
                        r t        j*                  |j
                         	 t        |d       5 }|j                         }d d d        t        j&                  j)                  |      rt        j*                  |       d d d        d d d        S # t        j                  $ r}t        |j
                        5 }|j                         }d d d        n# 1 sw Y   nxY w|j                  dk(  rt!        d       |j                  dt"        j$                  z   k(  rt!        d|j
                   d       t!        d|j                   d       d }~ww xY w# t        j&                  j)                  |j
                        rt        j*                  |j
                         t        j&                  j)                  |j
                        r t        j*                  |j
                         w w xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   S xY w)!Nr4   Fwz.ptx)deletemodesuffixrz.logz.or   r   z
 -lineinfoz --fmad=falser   za  DISABLE_PTXAS_OPTr   r  z  -v --opt-level 0 --gpu-name=sm_z -o z 2> z -v --gpu-name=sm_T)shellcheck   z$Internal Triton PTX codegen error: 
   zPlease run `ptxas z+` to confirm that this is a bug in `ptxas`
z`ptxas` failed with error code z: 
rI   )r1   tempfileNamedTemporaryFilewriteflushr   r   r   r   r\   r$   r   CalledProcessErrorrJ   rM   
returncoder,   signalSIGSEGVr   r"   remove)r   r   r   r   r4   r~   fsrcflogfbin	line_infofmadr  cmdelog_filelogrO   r   s                     r0   
make_cubinzCUDABackend.make_cubin  s1   "7+q((CO #	 SW''u3vN#	 RVJJsOJJL99t#D jjnn-GHlI--2?D'2-T3Fzz~~1373>	{4&0PQ[P\]c\deienendoostxsyy}  C  H  H  ~I  J	{4&0B:,vhW[W`W`Vaaefjekkoptpypyoz{)s$d; 77>>$)),IIdii(77>>$)),IIdii(dD! !Q!ww~~d#		$G#	  #	 H ) 00 	b$))_ *"--/C* * *<<3&&)Nse'TUU\\S6>>%99&,TYYK7cdgchik k ')HVZ[^Z_'`aa	b 77>>$)),IIdii(77>>$)),IIdii( -! !A#	  #	  #	 H s   N2 C#N%$H9=BN%N+<N%'N29K?K:!I;	2	K:;J A:K::K??LBNN%N"N%%N/	*N22N<c                 b      fd|d<    fd|d<    fd|d<    fd|d<    fd	|d
<   y )Nc                 *    j                  | |      S ry   )r   r   r   r   rt   s     r0   <lambda>z(CUDABackend.add_stages.<locals>.<lambda><  s    t~~c8W/U r2   r   c                 @    j                  | |j                        S ry   )r   r   r4  s     r0   r5  z(CUDABackend.add_stages.<locals>.<lambda>=      XwX\XgXg0h r2   ttgirc                 @    j                  | |j                        S ry   )r   r   r4  s     r0   r5  z(CUDABackend.add_stages.<locals>.<lambda>>  s    t~~c8WVZVeVe/f r2   llirc                 @    j                  | |j                        S ry   )r  r   r4  s     r0   r5  z(CUDABackend.add_stages.<locals>.<lambda>?  s    dmmC7TXTcTc.d r2   ptxc                 @    j                  | |j                        S ry   )r1  r   r4  s     r0   r5  z(CUDABackend.add_stages.<locals>.<lambda>@  r7  r2   r   r   )rt   stagesr   s   ` `r0   
add_stageszCUDABackend.add_stages;  s0    Uvhwfvduhwr2   c                 8    t               }| d| j                   S )Nr   )r5   r   )rt   r/   s     r0   r   zCUDABackend.hashB  s     #%!DOO,--r2   )r   r   r   staticmethodr   r   r   r
   r   r   r   r   r   r   r   r  r1  r?  	functools	lru_cacher   __classcell__)r   s   @r0   r   r   a   s    (	 ( ("y "T "#S #
"   & &P . .`  > & &Pi Y. .r2   r   ) triton.backends.compilerr   r   triton._C.libtritonr   r   r   r   dataclassesr	   rB  typingr
   r   r   rK   r'   r  r%  r   r$   pathlibr   rC  r@   r1   r5   rB   rG   rP   rS   r   r   r2   r0   <module>rJ     s    ; 8 8 !  ' '  	   	   0C 0 0   
 BS B B T4 4
 $ ?  ?  ?Fd.+ d.r2   