
    i1                     R   d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 de j                  fdZdeddfdZde j                  fdZ	 	 	 	 dd	ed
edee   dedee   dee   defdZ G d d      Z G d d      Z	 ddeeef   deee      deeeedf   f   fdZ	 ddedededefdZy)    N)AnyOptionalUnion)_get_device_indexreturnc                  |    t         j                  dk(  rt        j                  d      S t        j                  d      S )Nwin32z
nvcuda.dllzlibcuda.so.1)sysplatformctypesCDLL     K/var/www/html/engine/venv/lib/python3.12/site-packages/torch/cuda/_utils.py_get_cuda_libraryr      s,    
||w{{<(({{>**r   resultc                     | dk(  ry t        j                         }t               }|j                  | t        j                  |             |j
                  |j
                  j                         nd}t        d|       )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr   cuGetErrorStringbyrefvaluedecodeRuntimeError)r   err_strlibcudaerror_messages       r   _check_cudar      sn    {ooG!GVV\\'%:;")--";AU  m_5
66r   c                      t        t        j                  j                  j	                  d      d         } t
        j                  dk(  rd|  dg}nd|  dg}|D ]  }	 t        j                  |      c S  t        d      # t        $ r Y 2w xY w)	N.r   r	   nvrtc64_z0_0.dllzlibnvrtc.so.zlibnvrtc.soz Could not find any NVRTC library)
inttorchversioncudasplitr
   r   r   r   OSError)major_version
nvrtc_libslib_names      r   _get_nvrtc_libraryr,       s    **005a89M
||w}oW-


 =/*

  	;;x((
 4
55  		s   B	BBkernel_sourcekernel_namecompute_capabilityheader_codecuda_include_dirsnvcc_optionsc           
         ddl }t               ddt        ddffd}| j                         j	                  d      sd|  } |r	|dz   | z   }n| }|j                  d	      }	|M|j                  j                  |j                  j                               }
|
j                   |
j                   }g }|j                  d
| j                                |r)|D ]$  }|j                  d| j                                & |r'|D ]"  }|j                  |j                  d	             $ ddlm} |D cg c]
  }|dk7  s	| }}|j                  |D cg c]  }|j                  d	       c}       t        |      }t!        j"                  |z  | }t!        j$                         } |j'                  t!        j(                  |      |	| dj                         ddd             j+                  |||      }|k7  rt!        j,                         }j/                  |t!        j(                  |             t!        j0                  |j2                        }j5                  ||       t7        d|j2                  j9                                t!        j,                         } |j;                  |t!        j(                  |                   t!        j0                  |j2                        } |j=                  ||             j?                  t!        j(                  |             |j2                  S c c}w c c}w )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        header_code (str, optional): Additional header code to prepend to the kernel source
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC

    Returns:
        str: The compiled PTX code
    r   Nr   r   c                     | k7  rot        j                         }j                  | t        j                  |             |j                  |j                  j                         nd}t        d|       y )Nr   r   )r   r   nvrtcGetErrorStringr   r   r   r   )r   r   r   NVRTC_SUCCESSlibnvrtcs      r   check_nvrtcz#_nvrtc_compile.<locals>.check_nvrtcT   so    ]"oo'G((g1FG ==, $$&) 
 m_=>> #r   z
extern "C"zextern "C" 
utf-8z--gpu-architecture=sm_z-I)COMMON_NVCC_FLAGSz--expt-relaxed-constexprz.cuzKernel compilation failed:
) 
torch.cudar,   r#   strip
startswithencoder&   get_device_propertiescurrent_devicemajorminorappendtorch.utils.cpp_extensionr;   extendlenr   r   c_void_pnvrtcCreateProgramr   nvrtcCompileProgramc_size_tnvrtcGetProgramLogSizecreate_string_bufferr   nvrtcGetProgramLogr   r   nvrtcGetPTXSizenvrtcGetPTXnvrtcDestroyProgram)r-   r.   r/   r0   r1   r2   r$   r8   full_sourcesource_bytespropsoptions	directoryoptionr;   flagnvrtc_compatible_flagsnum_optionsoptions_arrayprogreslog_sizelogptx_sizeptxr6   r7   s                            @@r   _nvrtc_compilerb   3   s   0  "#H M	?C 	?D 	?  ++L9%m_5 !D(=8# %%g.L !

001J1J1LM %}U[[M: GNN+,>+?@GGIJ * 	6INNR	{+2245	6 " 	3FNN6==12	3 < +d6P.P  NN5KLTDKK(LM g,K__{2W=M ??D##LLm3&&(	
	 
&
&t[-
HC m??$''fll8.DE))(..9##D#.9#)):J:J:L9MNOO  H((v||H/EFG

%
%hnn
5C$$T3/0  d!3499S Ms   5
L6 L6L;c                   @    e Zd Zdej                  ddfdZdeddfdZy)_CudaModulemoduler   Nc                      || _         i | _        y N)_module_kernels)selfre   s     r   __init__z_CudaModule.__init__   s    02r   name_CudaKernelc           	         || j                   v r| j                   |   S ddlm}  |       }t        j                         }	 t        |j                  t        j                  |      | j                  |j                  d                   t        || j                        }|| j                   |<   |S # t        $ r}t        d| d      |d }~ww xY w)Nr   )r   r:   zNo kernel named 'z' in this module)ri   torch.cuda._utilsr   r   rH   r   cuModuleGetFunctionr   rh   r?   rm   r   AttributeError)rj   rl   r   r   funckernelerrs          r   __getattr__z_CudaModule.__getattr__   s    4== ==&& 	8#% 	V++LL&dkk'6J
 !t||4F"(DMM$M 	V #4TF:J!KLRUU	Vs    A.B/ /	C8CC)__name__
__module____qualname__r   rH   rk   strru   r   r   r   rd   rd      s/    3v 34 3V V Vr   rd   c                       e Zd ZdZdej
                  dej
                  ddfdZ	 	 	 	 	 ddeeeef   deeeef   d	e	e
   d
ede	e   ddfdZy)rm   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    rr   re   r   Nc                      || _         || _        y rg   )rr   re   )rj   rr   re   s      r   rk   z_CudaKernel.__init__   s    	r   gridblockargs
shared_memstreamc                    ddl }|j                  j                  j                         }|sg }g }g }	|D ]O  }
t	        |
|j
                        r|
j                  s'|
j                  r|
j                         st        d      t        j                  |
j                               }|j                  |       |	j                  t        j                  |             t	        |
t              r:t        j                   |
      }|	j                  t        j                  |             t	        |
t"              r;t        j$                  |
      }|	j                  t        j                  |             ;t'        dt)        |
              t        j                  t+        |	      z         }t-        |	      D ],  \  }}
t        j.                  |
t        j                        ||<   . |ddl}|j                  j3                         }t5        |j7                  | j8                  |d   |d   |d   |d   |d   |d   ||j:                  |d             y)a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type:       )r$   r&   _utilsr   
isinstanceTensoris_cudais_cpu	is_pinned
ValueErrorr   rH   data_ptrrD   r   r#   c_intfloatc_float	TypeErrortyperG   	enumeratecastr<   current_streamr   cuLaunchKernelrr   _as_parameter_)rj   r|   r}   r~   r   r   r$   r   processed_argsc_argsargptrr   r   c_args_arrayis                   r   __call__z_CudaKernel.__call__   s   & 	**##557D 13 	KC#u||,{{CJJ3==?$Y  ooclln5%%c*fll3/0C%S)fll512C' ..-fll734"=d3i[ IJJ-	K2 #f+58' 	@FAs$kk#v?LO	@ >ZZ..0F""		QQQaaa%%	
r   )r   r   r   r   Nr   N)rv   rw   rx   __doc__r   rH   rk   tupler#   r   listr   r   r   r   r   rm   rm      s    V__ foo $  &/&/# $P
CcM"P
 S#s]#P
 tn	P

 P
 P
 
P
r   rm   ra   kernel_namesc           
      8   ddl }t               }t        | t              r| j	                  d      } t        j                         }|j                  j                         }|5  t        |j                  t        j                  |      |              ddd       |st        |      S i }|D ]c  }t        j                         }t        |j                  t        j                  |      ||j	                  d                   t        ||      ||<   e |S # 1 sw Y   xY w)a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr:   )r<   r   r   ry   r?   r   rH   r&   r   r   cuModuleLoadDatar   rd   rp   rm   )	ra   r   r$   r   re   r   kernelsrl   rr   s	            r   _cuda_load_moduler   (  s        !G #sjj! __FZZ&&(F	 IG,,V\\&-A3GHI 6"" G 2 ''T"FDKK,@	

 $D&12 N!I Is    /DDdeviceoptional	allow_cpuc                    t        | t              r| S t        | t              rt        j                  |       } t        | t        j                        r;|r| j
                  dvr+t        d|        | j
                  dk7  rt        d|        t        j                  j                         s0t        | t        j                  j                        r| j                  S t        | ||      S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )r&   cpuz(Expected a cuda or cpu device, but got: r&   z!Expected a cuda device, but got: )r   r#   ry   r$   r   r   r   jitis_scriptingr&   idx_torch_get_device_index)r   r   r   s      r   r   r   X  s      &#&#f%&%,,'{{/1 #KF8!TUU[[F"@IJJ99!!#fejj//0::"68Y??r   )N NNrg   )FF)r   r
   typingr   r   r   r$   torch._utilsr   r   r   r   r#   r   r,   ry   r   bytesrb   rd   rm   dictr   boolr   r   r   <module>r      sK    
 ' '  F+6;; +	7 	7 	76FKK 6, )-(,#'yyy !y 	y
  ~y 4.y yxV V:Y
 Y
z AE-	sEz	-*249*=-
;S-/001-b <A@@@48@@r   