
    bid                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ ej"                  j%                  ej"                  j'                  e            Zej"                  j+                  ed      gZej"                  j+                  ed	      Zd
gZda e j4                         d        Z e j4                         d        Z G d de      Zd ZddddddZ ddddddZ!dZ" e#e"      Z$d Z% e&d  e'd      D              Z(de(d<   de(d<   de(d<   d Z)d  Z* G d! d"e      Z+ G d# d$e      Z,y)%    N)Path)knobs)compile_module_from_src)_allocation)	GPUTarget)	GPUDriverincludelibcudac            	         t         j                  j                  x} r| gS t        j                  ddg      j                  d      }|j                         D cg c]  }d|v s|j                         d    }}|D cg c]!  }t        j                  j                  |      # }}t        j                  d      }|r^|s\|j                  d      D cg c]B  }t        j                  j                  t        j                  j                  |d            sA|D }}d	}|r|d
t        |      z  z  }|dz  }n
|dz  }|dz  }t        d |D              sJ |       |S c c}w c c}w c c}w )Nz/sbin/ldconfigz-pignore)errorslibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      K   | ]A  }t         j                  j                  t         j                  j                  |d              C yw)r   N)ospathexistsjoin).0r   s     W/var/www/html/engine/venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py	<genexpr>zlibcuda_dirs.<locals>.<genexpr>)   s,     Sdrww~~bggll4@ASs   AA	)r   nvidialibcuda_path
subprocesscheck_outputdecode
splitlinessplitr   r   dirnamegetenvr   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr/      sW    <<4444 !!""$4d#;<CC8CTD *.):UnPT>TDJJLUDU,01SBGGOOC 1D1))$564288=sPRPWPWP\P\]`bpPqArss
&C2SY>>KKMM>>SdSSXUXXSK V1 ts   	E#E>&E AE%E%c                  $    t         gt               S N)libdevice_dirr/        r   library_dirsr5   -   s    +LN++r4   c                   $     e Zd Z fdZd Z xZS )	CudaUtilsc                 d    t        | d      st        t        |   |       | _        | j                  S )Ninstance)hasattrsuperr7   __new__r9   )cls	__class__s    r   r<   zCudaUtils.__new__9   s*    sJ' C8=CL||r4   c                 x   t        t        t        j                  j	                  t
        d            j                         dt               t        t              }|j                  a
|j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        y )Nzdriver.c
cuda_utilssrcnamer5   include_dirs	libraries)r   r   r   r   r   r"   	read_textr5   rD   rE   PyCUtensorMapload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_tma_descriptor)selfmods     r   __init__zCudaUtils.__init__>   s    %RWW\\':67AAC%%
 ))??%(%>%>",/,L,L)$'$<$<!#&#:#: r4   )__name__
__module____qualname__r<   rO   __classcell__r>   s   @r   r7   r7   7   s    
;r4   r7   c                     | d   dk(  ry| j                  d      ryi ddddd	d
dddddddddddddddddddddddddd|    S )Nr   *CUdeviceptr
tensordescCUtensorMapi1int8_ti8i16int16_ti32int32_ti64int64_tu1uint8_tu8u16uint16_tu32uint32_tu64uint64_tfp16doublebf16fp32f32fp64	nvTmaDesc)
startswith)tys    r   	ty_to_cppru   T   s    	!u|	}}\"hh 	y 	y	
 	y 	i 	i 	z 	z 	z 	 	 	 	x 	  	]!" 	#
 
r4   rg   ri   rk   )rl   rn   ro   rp   rq   	pack_fp16	pack_bf16	pack_fp32	pack_fp64iiiKKppOOOOOOc                    fd}fdfdfd ||j                               }t        |      D ci c]  \  }}||
 }}}dj                  |j                         D cg c]
  } |       c}      }t        |z   }	g }
|j                         D ]  } ||
        t        |
      D ci c]  \  }}||
 }}}t	        |      dkD  r)ddj                  d |j                         D              z   nd}g }|j                         D ]P  \  }}|d	k(  r|t        v r|j                  t        |    d
|        2|j                  t        |       d
|        R dj                  |      }g }|j                         D ]u  \  }}|d   dk(  r|j                  d| d       $|t        v r|j                  d| d       B|dk(  r|j                  d|        \|d	k7  sb|j                  d|        w t        t	        |            }d}|j                         D cg c]  \  }}|d   dk(  rd| d| d| d| d	 }}}|j                         D cg c]  \  }}|dk(  rd| d| d| d }}}|j                         D cg c])  \  }}|t        v rt        |    d| dt        |    d| d+ }}}|j                         D cg c]  \  }}|d	k7  sd|  }}}|j                  d        |j                  d!       d"t	        |      dkD  rd|z   nd d#dj                  |       d$|j                  |j                         D cg c]  \  }} |       d| d% c}}       d&|	 d'| d(|j                  |       d|j                  |       d|j                  |       d)t	        |      dkD  rddj                  |      z   nd d*}|S c c}}w c c}w c c}}w c c}}w c c}}w c c}}w c c}}w c c}}w )+Nc                    g }d}| D ]1  }t        |t              r|j                  d      r
r
|   nd }|dz  }t        j                  d|      }|j                  d      }|j                  d      }|j                  d      dz   }|J|j                  d|z          t        d|z        D ]  }	|j                  d        |j                  d	       n|j                  d
       t        |      D ]  }	|j                  d        t        |      D ]  }	|j                  d        !|j                  |       4 
r|t        
      k(  sJ |S )Nr   rX      ztensordesc<([^[>]*)\[([^]]*)\]   ,rV   ra   rZ   rr   r_   )

isinstancer$   rs   rematchgroupcountappendrangelen)	signatureoutputtensordesc_idxsigmetar   dtypeshapendim_tensordesc_metas             r   _expand_signaturez(make_launcher.<locals>._expand_signature   sC     	#C#s#|(D:I~6t!#!CSIAA{{3'!+<MM#+. #1t8_ -e,-MM$'MM+.t )AMM%()t )AMM%() c"9	#< #nO8L&LLLr4   c                 j    t        | t              r| D ]  } ||        y |j                  |        y r1   )r   tupler   )r   r   x_flatten_signatures      r   r   z)make_launcher.<locals>._flatten_signature   s4    c5! ."1f-. MM#r4   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ryt	        |       S )Nr   []r   rV   z	PyObject*	constexprrr   )r   r   r   mapru   )rt   val_extracted_types     r   r   z&make_launcher.<locals>._extracted_type   sT    b% ((334Cse1:a5C<++}r4   c                     t        | t              r!dj                  t        |             }d| dS | d   dk(  ry| dv ry| j	                  d      ryd	d
ddddddddd
t        |          S )N ()r   rV   Or   rX   dlbhiLBHIK)
rm   longr[   r^   r`   rb   rd   rg   ri   rk   )r   r   r   r   rs   ru   )rt   r   	format_ofs     r   r   z make_launcher.<locals>.format_of   s    b% ''#i,-Cse1:a5C<++==&
 B- 	r4   r   r   z, c              3   ,   K   | ]  \  }}d |   yw)z&_argNr3   )r   r   rt   s      r   r   z make_launcher.<locals>.<genexpr>   s      LB5 Ls   r   z argrV   ptr_infoz.dev_ptr_arg_storagerr   z*tma_ptrz
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z _argz_storage = z(_argz);z&argz&global_scratchz&profile_scratcha  
#include "cuda.h"
#include <dlfcn.h>
#include <stdbool.h>
#include <stdlib.h>
#define PY_SSIZE_T_CLEAN
#include <Python.h>

typedef struct {
  PyObject_HEAD;
  _Alignas(128) CUtensorMap tensorMap;
} PyCUtensorMapObject;

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratchz) {
  void *params[] = { a   };
  if (gridX*gridY*gridZ > 0) {
    // 4 attributes that we can currently pass maximum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }
    CUlaunchConfig config;
    config.gridDimX = gridX;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;

    if (num_ctas != 1) {
      config.gridDimX *= clusterDimX;
      config.gridDimY *= clusterDimY;
      config.gridDimZ *= clusterDimZ;
    }

    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;

    if (launch_pdl != 0) {
      CUlaunchAttribute pdlAttr = { .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }

    if (launch_cooperative_grid != 0) {
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }

    if (num_ctas != 1) {
      CUlaunchAttribute clusterAttr = {};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = clusterDimX;
      clusterAttr.value.clusterDim.y = clusterDimY;
      clusterAttr.value.clusterDim.z = clusterDimZ;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }

    config.numAttrs = num_attrs;

    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static PyObject* data_ptr_str = NULL;
static PyObject* py_tensor_map_type = NULL;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
  if (!ret) {
    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
    ptr_info.valid = false;
    goto cleanup;
  }
  if (!PyLong_Check(ret)) {
    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
    ptr_info.valid = false;
    goto cleanup;
  }
  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
  if(!ptr_info.dev_ptr)
    return ptr_info;
  uint64_t dev_ptr;
  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
  if (status == CUDA_ERROR_INVALID_VALUE) {
      PyErr_Format(PyExc_ValueError,
                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
      ptr_info.valid = false;
  } else if (status != CUDA_SUCCESS) {
      CUDA_CHECK(status);  // Catch any other cuda API errors
      ptr_info.valid = false;
  }
  ptr_info.dev_ptr = dev_ptr;
cleanup:
  Py_XDECREF(ret);
  return ptr_info;

}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {
    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
    return NULL;
}

  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
  if (align_128 != 0) {
    PyErr_Format(PyExc_ValueError, "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
    return NULL;
  }
  return map;
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  PyObject *profile_scratch_obj = NULL;
  ;z
  if(!PyArg_ParseTuple(args, "aM  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hookaT  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  CUdeviceptr profile_scratch = 0;
  if (profile_scratch_obj != Py_None) {
    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
    if (!profile_scratch_info.valid) {
      return NULL;
    }
    profile_scratch = profile_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratchap  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  data_ptr_str = PyUnicode_InternFromString("data_ptr");
  if(data_ptr_str == NULL) {
    return NULL;
  }
  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
  if (driver_mod == NULL) {
    return NULL;
  }
  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
  if (py_tensor_map_type == NULL) {
    return NULL;
  }

  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)values	enumerater   _BASE_ARGS_FORMATr   itemsFLOAT_STORAGE_TYPEr   ru   r   FLOAT_PACK_FUNCTION)	constantsr   r   r   expand_signaturer   srt   args_formatformatflat_signaturer   	args_listarg_decl_list	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsfloat_storage_declsrB   r   r   r   s     `                   @@@r   make_launcherr      s   %N. ))9)9);<"+,<"=>$!QA>I>''93C3C3EFR9R=FGK,FN! 03/0"+N";<$!QA<I<PST]P^abPbtyy L)//:K LLLhjI M" <2##  $6r$:#;4s!CD  IbM?$qc!:;< 		-(I" 	22a5C<%%8&<=%%%%QCx&89;%%n5;%%QCj1	2 3y>"F G __&Ara5C< !#5aS1#=MaSPdeI  foetetev\a\]_a qc!3A3oaSWI  __&Ar## b!
"%s+6I"6M5NeTUSVVXY 
 '0oo&7MUQ2;LQCjMFM
MM#$
MM$%5cj x{  |E  xF  IJ  xJ  dh  kt  dt  PR  cS Syy() y*r <<	@QRuq"OB'(aS2RST U  &x (Q R[P[ %\J <<	 
<<	 
<<#$% &Y BE  FX  BY  \]  B]  Z^  ae  aj  aj  k}  a~  Z~  ce  Yf 2fyNC^
 JI ?F =8

 Nd Ss5   O*O"7O')!O- O3.O9O?#O?Pc              #   $   K   | ]  }||f 
 y wr1   r3   )r   r   s     r   r   r   [  s     :1A:s      
      	   c           
      B   |L| j                   g| j                  | j                  | j                  dk(  | j                  | j                  S |d   }|d   }|d   }|d   }|d   }| j                  }| j                  }|d   dk(  sJ | j                  dk(  rdnd	}	|rt	        |      }|dxx   d
z  cc<   t
        j                  j                  j                  j                  j                  | j                   j                         ||t        |   ||||	      }
|
g||S )Nnanswizzle	elem_size	elem_type
block_size
fp4_paddedr   r}   r   r~   )baser   stridespaddinglisttritonruntimedriveractiveutilsrL   data_ptrTMA_DTYPE_DEVICE_TO_HOST)argmetadatar   r   r   r   r   r   r   r   cu_tensor_maps              r   make_tensordesc_argr   a  s2    c399cs{{cCKK54Hc399cWZWbWbccy!G%I%I,'J,'JIIEkkG2;!;;%'aQGUb	Q	NN))0066JJ +	M ,E,G,,r4   c           
      v    t        d |j                         D              }|s S t        t        |j                               D cg c]*  \  }}t	        |t
              s|j                  d      s)|, c}}      rt              t              k(  sJ sd gt              z   fd}|S c c}}w )Nc              3   b   K   | ]'  }t        |t              xr |j                  d        ) yw)rX   N)r   r$   rs   )r   r   s     r   r   z)wrap_handle_tensordesc.<locals>.<genexpr>  s)     rX[jc2Ss~~l7SSrs   -/rX   c                      t        | d t               }d}t        | t        d        D ]>  \  }}|v r$|j                  t	        ||                |dz  }.|j                  |       @  | S )Nr   r}   )r   _BASE_ARGS_FORMAT_LENr   extendr   r   )args
final_argsr   r   r   launchertensordesc_indicesr   s        r   innerz%wrap_handle_tensordesc.<locals>.inner  s    $5 567
%:%; <= 	'FAs&&!!"5c?>;Z"[\!#!!#&	' $$r4   )r%   r   setr   r   r$   rs   r   )r   r   r   has_tensor_desc_argr   r   r   r   s   ` `    @r   wrap_handle_tensordescr     s    r_h_o_o_qrr"9#3#3#56pvq#*S#:NSVSaSabnSopr#o"6#>P:Q"QQQ&3'9#::	% L! 	qs   	B5
"B5
4B5
c                       e Zd Zd Zd Zy)CudaLauncherc                    t        d      rj                  n	t               }fd}|j                         D ci c]  \  }} ||      | }}}j                  j                         D ci c]  \  }}||
 }}}t        |dd       }t        |||      t        dt               t        t              }	t        j                  t        j                  |j                  d      | _        t#        |	j$                  ||      | _        |j&                  | _        |j(                  | _        |j*                  | _        |j,                  | _        |j.                  | _        |j0                  | _        y c c}}w c c}}w )Nr   c                 t    t        | t              r&j                  j                  j	                  |       fS | S r1   )r   r$   fn	arg_namesindex)r   rB   s    r   <lambda>z'CudaLauncher.__init__.<locals>.<lambda>  s-    Z3=OSVV--33A69 UV r4   r   __triton_launcherrA   r}   )r:   r   dictr   r   getattrr   r   r5   rD   rE   	functoolsreduceoperatormulcluster_dimsnum_ctasr   launchglobal_scratch_sizeglobal_scratch_alignprofile_scratch_sizeprofile_scratch_alignlaunch_cooperative_grid
launch_pdl)
rM   rB   r   r   arg_idxidxvaluer   r   rN   s
    `        r   rO   zCudaLauncher.__init__  s7   %,S+%>CMMDF	V;D??;LMZS%WS\5(M	M25--2E2E2GHJCS%ZH	H!(,=tDIy/B%$%%
 "((x7L7LaP,SZZOT#+#?#? $,$A$A!$,$A$A!%-%C%C"'/'G'G$"--' NHs   E0/E6c                 .     fd} | j                    j                  t        j                        } | j                   j
                  t        j                        }	  j                  | j                   j                  ||	g	|  y )Nc                 x    | dkD  r4z  z  }|	j                   z  | z  }|j                         } |||
      S y Nr   )r  get)sizealign	allocator	grid_size
alloc_sizealloc_fngridXgridYgridZrM   streams         r   allocate_scratchz/CudaLauncher.__call__.<locals>.allocate_scratch  sH    ax!EME1	&6=
$==?
E6::r4   )
r  r  r   
_allocatorr  r  _profile_allocatorr  r	  r
  )
rM   r  r  r  r  functionr   r  global_scratchprofile_scratchs
   `````     r   __call__zCudaLauncher.__call__  s    	 	 *$*B*BDD]D]_j_u_uv*4+D+DdF`F`+6+I+IKE5%4;W;WY]YhYh"O	<6:	<r4   N)rP   rQ   rR   rO   r"  r3   r4   r   r   r     s    .0<r4   r   c                   b     e Zd Z fdZd Zd Zd Zed        Zde	de	fdZ
d	 Zd
 Zd Z xZS )
CudaDriverc                 V    t               | _        t        | _        t        |           y r1   )r7   r   r   launcher_clsr;   rO   )rM   r>   s    r   rO   zCudaDriver.__init__  s    [
(r4   c                 ~    | j                         }| j                  |      }|d   dz  |d   z   }d}t        d||      S )Nr   r   r}       r   )get_current_deviceget_device_capabilityr   )rM   device
capability	warp_sizes       r   get_current_targetzCudaDriver.get_current_target  sK    ((*//7
]R'*Q-7
	Y77r4   c                 J    dd l }|j                  d| j                               S )Nr   r   )torchr+  r)  rM   r0  s     r   get_active_torch_devicez"CudaDriver.get_active_torch_device  s    ||FD$;$;$=>>r4   c                 "    dd l }|j                  S r  )r0  r   r1  s     r   get_device_interfacezCudaDriver.get_device_interface  s    zzr4   c                      	 dd l } | j                  j                         xr | j                  j                  d u S # t
        $ r Y yw xY w)Nr   F)r0  r   is_availableversionhipImportError)r0  s    r   	is_activezCudaDriver.is_active  sC    	::**,L%--2C2Ct2KL 		s   7: 	AArt   returnc                     t        |      S r1   )ru   )rM   rt   s     r   map_python_to_cpp_typez!CudaDriver.map_python_to_cpp_type  s    }r4   c                     ddl m} |S )Nr   )do_bench)triton.testingr?  )rM   r?  s     r   get_benchmarkerzCudaDriver.get_benchmarker  s
    +r4   c                 b    dd l }d}|j                  t        |dz        |j                  d      S )Nr   i      r   )r   r+  )r0  emptyint)rM   r0  
cache_sizes      r   get_empty_cache_for_benchmarkz(CudaDriver.get_empty_cache_for_benchmark  s.    
 '
{{3zQ/uyy{PPr4   c                 $    |j                          y r1   )zero_)rM   caches     r   clear_cachezCudaDriver.clear_cache  s    r4   )rP   rQ   rR   rO   r.  r2  r4  staticmethodr:  r$   r=  rA  rG  rK  rS   rT   s   @r   r$  r$    sN    
8?    Qr4   r$  )-r   r   r   r   r   r   pathlibr   r   triton.runtime.buildr   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r"   realpath__file__r   rD   r2   rE   rG   	lru_cacher/   r5   objectr7   ru   r   r   r   r   r   r   r   r   r   r   r   r   r$  r3   r4   r   <module>rV     sy     	   	   8 & . ,
''//"''**84
5Wi01We,H	  . , ,; ;:
4     $ -. Wv  :b	::         $-N2(<6 (<V/ /r4   