
    sg3                        d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ ej                  j!                  ej                  j#                  e            Zej                  j'                  ed      gZej                  j'                  ed      Zd	gZ e j.                         d
        Z e j.                         d        Zd Z G d de      Zd Zd Z G d de      Z G d de      Z y)    N)Path)_build)get_cache_manager)	GPUTarget)	GPUDriverincludelibcudac            	         t        j                  d      } | r| gS t        j                  ddg      j	                         }|j                         D cg c]  }d|v s|j                         d    }}|D cg c]!  }t         j                  j                  |      # }}t        j                  d      }|r^|s\|j                  d      D cg c]B  }t         j                  j                  t         j                  j                  |d            sA|D }}d}|r|d	t        |      z  z  }|d
z  }n
|dz  }|dz  }t        d |D              sJ |       |S c c}w c c}w c c}w )NTRITON_LIBCUDA_PATHz/sbin/ldconfigz-plibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      K   | ]A  }t         j                  j                  t         j                  j                  |d              C yw)r   N)ospathexistsjoin).0r   s     P/var/www/html/venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py	<genexpr>zlibcuda_dirs.<locals>.<genexpr>'   s,     Sdrww~~bggll4@ASs   AA	)r   getenv
subprocesscheck_outputdecode
splitlinessplitr   dirnamer   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr+      sO   yy!67 !!""$4d#;<CCED *.):UnPT>TDJJLUDU,01SBGGOOC 1D1))$564288=sPRPWPWP\P\]`bpPqArss
&C2SY>>KKMM>>SdSSXUXXSK V1 ts   	EE7&EAEEc                  $    t         gt               S N)libdevice_dirr+        r   library_dirsr1   +   s    +LN++r0   c           	         t        j                  | j                  d            j                         }t	        |      }|j                  | d      }|t        j                         5 }t        j                  j                  |d      }t        |d      5 }|j                  |        d d d        t        |||t               t        t               }t        |d      5 }|j#                  |j%                         | dd      }d d d        d d d        dd l}	|	j(                  j+                  ||      }
|	j(                  j-                  |
      }|
j.                  j1                  |       |S # 1 sw Y   xY w# 1 sw Y   uxY w# 1 sw Y   yxY w)	Nzutf-8z.sozmain.cwrbT)binaryr   )hashlibsha256encode	hexdigestr   get_filetempfileTemporaryDirectoryr   r   r   openwriter   r1   include_dir	librariesputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcnamekeycache
cache_pathtmpdirsrc_pathfso	importlibspecmods               r   compile_module_from_srcrU   0   sM   
..G,
-
7
7
9Cc"E4&-J((* 	Lfww||FH5Hh$ hYWBb$ L1"YYqvvxD6dYK
L	L >>11$
CD
..
)
)$
/CKKC J L L	L 	Ls<   (-FE)'4F&E5F)E2	.F5E>	:FF
c                   $     e Zd Z fdZd Z xZS )	CudaUtilsc                 d    t        | d      st        t        |   |       | _        | j                  S )Ninstance)hasattrsuperrW   __new__rY   )cls	__class__s    r   r\   zCudaUtils.__new__J   s*    sJ' C8=CL||r0   c                 Z   t        t        t        j                  j	                  t
        d            j                         d      }|j                  | _        |j                  | _        |j                  | _	        |j                  | _
        |j                  | _        |j                  | _        y )Nzdriver.c
cuda_utils)rU   r   r   r   r   r   	read_textload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_1d_tma_descriptorfill_2d_tma_descriptor)selfrT   s     r   __init__zCudaUtils.__init__O   s~    %d277<<+L&M&W&W&Y[gh??%(%>%>",/,L,L)$'$<$<!&)&@&@#&)&@&@#r0   )__name__
__module____qualname__r\   ri   __classcell__r^   s   @r   rW   rW   H   s    
Ar0   rW   c                 >    | d   dk(  ryddddddd	d
dddddddd|    S )Nr   *CUdeviceptrint32_tint8_tint16_tint64_tuint32_tuint8_tuint16_tuint64_tfloatdouble)i1i8i16i32i64u1u8u16u32u64fp16bf16fp32f32fp64r/   tys    r   	ty_to_cppr   ^   sQ    	!u|  	!
 
r0   c                    dj                  d |j                         D              }d }d }dj                  |j                         D cg c]  } | ||             c}      }d|z   }t        |      dkD  r)ddj                  d |j                         D              z   nd}	|j	                         D 
cg c]	  }
|
| vs|
 }}
d	t        |      dkD  rd|z   nd d
dj                  d |D               ddj                  |j                         D 
cg c]  \  }
} ||       d|
 d c}}
       d| d|	 ddj                  |j                         D 
cg c]  \  }
}|d   dk(  rd|
 d|
 d|
 d|
 d	nd  c}}
       dt        |      dkD  r)ddj                  d |j                         D              z   nd d}|S c c}w c c}
w c c}}
w c c}}
w )Nz, c              3   B   K   | ]  \  }}t        |       d |   yw)z argNr   r   ir   s      r   r   z make_launcher.<locals>.<genexpr>w   s#     S2Yr]O4s3Ss   c                 *    | d   dk(  ryt        |       S )Nr   rp   	PyObject*r   r   s    r   _extracted_typez&make_launcher.<locals>._extracted_typey   s    a5C<}r0   c                 &    dddddddddd	d
dd|    S )NOrP   dlbhr   BHIK)r   rz   r{   longrs   rt   rr   ru   rw   rx   rv   ry   r/   r   s    r   	format_ofz make_launcher.<locals>.format_of~   s:    
  	r0    	iiiKKOOOOr   c              3   ,   K   | ]  \  }}d |   yw)z&_argNr/   r   s      r   r   z make_launcher.<locals>.<genexpr>   s      LB5 Ls   a	  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction functionz) {
  void *params[] = { c              3   &   K   | ]	  }d |   yw)z&argNr/   )r   r   s     r   r   z make_launcher.<locals>.<genexpr>   s      <4s <s   a   };
  if (gridX*gridY*gridZ > 0) {
    if (num_ctas == 1) {
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    } else {
      CUlaunchAttribute launchAttr[2];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 2;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
   z _argz; z
  if(!PyArg_ParseTuple(args, "z", &gridX, &gridY, &gridZ, &_stream, &_function,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hook ai  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  // raise exception asap
  rp   zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;z;
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_functionc              3   H   K   | ]  \  }}|d    dk(  rd| dnd|   yw)r   rp   ptr_infoz.dev_ptr_argNr/   r   s      r   r   z make_launcher.<locals>.<genexpr>3  s       dx  ]b  ]^  `b  A  BC  D  FI  I  go  pq  or  rz  e{  QU  VW  UX  OY  eY  dxs    "aW  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }

  // return None
  Py_INCREF(Py_None);
  return Py_None;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)r   itemsvalueslenkeys)	constants	signatureids	arg_declsr   r   r   args_formatformat	args_listr   paramsrI   s                r   make_launcherr   t   s    		SARSSI
  ''IDTDTDVWb9_R%89WXK;&FPST]P^abPbtyy L)//:K LLLhjI #)@AQi-?a@F@.}\ RU  V_  R`  cd  Rd  ~B  EN  ~N  jl  }m myy <V <<= W>n 88Y__=NOEAr#$E!B/OPQ R  &x (RR[Q\ ]( 99  R[  Ra  Ra  Rc  d  IN  IJ  LNoqrsotx{o{&qc);A3bCSTUSVVjk  BD  D  d  e  f fS |  @I  |J  MN  |N  TX  [_  [d  [d  dx  fo  fu  fu  fw  dx  [x  Tx  TV  SW )WHCR J_ X
 AN P. ds   
G6	G GG#G c                       e Zd Zd Zd Zy)CudaLauncherc                    dt        d      rj                  j                  n	t               i}t        d      rj                  n	t               }fd}|j                         D ci c]  \  }} ||      | }}}j                  j                         D ci c]  \  }} ||      | }}}t        |||      t        d      }	|	j                  | _
        y c c}}w c c}}w )Nids_of_const_exprsfnr   c                 r    t        | t              r%j                  j                  j	                  |       S | S r-   )
isinstancer    r   	arg_namesindex)r   rI   s    r   <lambda>z'CudaLauncher.__init__.<locals>.<lambda>e  s*    As9KCFF,,2215 QR r0   __triton_launcher)rZ   r   
constexprstupler   dictr   r   r   rU   launch)
rh   rI   metadatar   r   cst_keyrK   valuer   rT   s
    `        r   ri   zCudaLauncher.__init__b  s    #'#t:LSVV%6%6RWRYZ%,S+%>CMMDF	R;D??;LMZS%WS\5(M	M;>==;N;N;PQZS%WS\5(Q	QIy#6%c+>?jj	 NQs   *CC$c                 (     | j                   |i | y r-   )r   )rh   argskwargss      r   __call__zCudaLauncher.__call__l  s    T$V$r0   N)rj   rk   rl   ri   r   r/   r0   r   r   r   `  s    !%r0   r   c                   4     e Zd Z fdZd Zed        Z xZS )
CudaDriverc                 V    t               | _        t        | _        t        |           y r-   )rW   utilsr   launcher_clsr[   ri   )rh   r^   s    r   ri   zCudaDriver.__init__r  s    [
(r0   c                 ~    | j                         }| j                  |      }|d   dz  |d   z   }d}t        d||      S )Nr   
          r
   )get_current_deviceget_device_capabilityr   )rh   device
capability	warp_sizes       r   get_current_targetzCudaDriver.get_current_targetw  sK    ((*//7
]R'*Q-7
	Y77r0   c                  r    dd l } | j                  j                         xr | j                  j                  d u S )Nr   )torchr
   is_availableversionhip)r   s    r   	is_activezCudaDriver.is_active~  s,    zz&&(Hemm.?.?4.GHr0   )rj   rk   rl   ri   r   staticmethodr   rm   rn   s   @r   r   r   p  s#    
8 I Ir0   r   )!	functoolsr   r6   r   r;   pathlibr   triton.runtime.buildr   triton.runtime.cacher   triton.backends.compilerr   triton.backends.driverr   r   r   realpath__file__r   r?   r.   r@   	lru_cacher+   r1   rU   objectrW   r   r   r   r   r/   r0   r   <module>r      s     	     ' 2 . ,
''//"''**84
5ww||GY/0We,H	  0 , ,0A A,
,iX%6 % I Ir0   