
    bi>U                        d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ d
efdZde
j@                  fdZ! ejD                         d        Z# ejD                         de$fd       Z%de$fdZ& ejD                         de$fd       Z' ejD                  d      d        Z(de$fdZ) ed       G d d             Z* G d de      Z+y)    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 >    dt         t        t        t        f   fd}|S )Nreturnc                     | j                   j                  }|j                   j                  }||k(  sJ d       |dk(  ryy)Nz%lhs and rhs bitwidth must be the same   )   r       )r   r      )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidths       Y/var/www/html/engine/venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibilityz-min_dot_size.<locals>.check_dot_compatibility   sB    9999|+T-TT+1    )r   int)r   r"   s     r!   min_dot_sizer%      s!    uS#s]7K  #"r#   r   c                  6    t         j                  j                  S N)r
   r	   ptxas r#   r!   	get_ptxasr*   "   s    <<r#   c                      t         j                  j                  } | | S t        j                  t               j                  dg      j                  d      }|S )Nz	--versionutf-8)r
   r	   mock_ptx_version
subprocesscheck_outputr*   pathdecode)mock_verversions     r!   get_ptxas_versionr4   &   sI    ||,,H%%y{'7'7&EFMMgVGNr#   c                    t        | t              sJ t        t        | j	                  d            \  }}|dk(  r|dk  rd|z   S d|z   dz
  S |dk(  rd|z   S |dk(  rd	|z   S |d
k\  rd}||d
z
  dz  z   |z   S t        d| z         )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P   r      F   
   ?      Z   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr$   splitRuntimeError)cuda_versionmajorminorbase_ptxs       r!   ptx_get_versionrI   /   s    
 lC(((sL..s34LE5{19::>!{Ez{Ez{52:++e33
X[gg
hhr#   archc                 `    | j                   }|t               j                  }t        |      }|S r'   )ptx_versionr*   r3   rI   )optionsrJ   rL   rE   s       r!   get_ptx_version_from_optionsrN   G   s0    %%K {**%l3r#   c                 @    t        | |      }t        d|      }d| }|S )NV   z+ptx)rN   min)rM   rJ   rL   llvm_ptx_versionfeaturess        r!   get_featuresrT   O   s0    .w=K 2{+&'(HOr#   c                     t        | d      5 }t        j                  |j                               j	                         cd d d        S # 1 sw Y   y xY w)Nrb)openhashlibsha256read	hexdigest)r0   fs     r!   	file_hashr]   ]   s>    	dD	 4Q~~affh'1134 4 4s   1AA
capabilityc                 "    | dk\  rdnd}d|  | S )Nr?   a sm_r)   )r^   suffixs     r!   sm_arch_from_capabilityrd   c   s!    "$S"FVH%%r#   T)frozenc                   z   e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   d	Z	e
e   ed
<   dZeed<   d	Zeed<   d	Zeed<   d	Ze
e   ed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZee   ed<   dZeed<   dZee   ed<   d	Zeed<   d	Zeed<   dZeed<   d Zeed!<   dZeed"<   d	Z eed#<   d$Z!eed%<   d& Z"d' Z#y	)(CUDAOptions   	num_warpsr   num_ctas   
num_stagesr   	warp_sizeNmaxnreg)r   r   r   cluster_dimsrL   ptx_optionsir_overrideTenable_fp_fusionFlaunch_cooperative_grid
launch_pdl)fp8e5fp8e4b15supported_fp8_dtypesr)   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)ry   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowrJ   ra   instrumentation_modec                    t        t              j                  dz  }| j                  i nt	        | j                        }|j                  dd       s-t        j                  j                  xs t        |dz        |d<   t        j                  | dt        |j                                      | j                  dkD  r| j                  | j                  dz
  z  dk(  sJ d       y )Nlib	libdevicezlibdevice.10.bcr   r   r   znum_warps must be a power of 2)r   __file__parentr   dictgetr
   r	   libdevice_pathrA   object__setattr__tupleitemsri   )selfdefault_libdirr   s      r!   __post_init__zCUDAOptions.__post_init__   s    h..6 ,,4b$t?O?O:P{D1',||'B'B'mc.[lJlFmK$4k6G6G6I0JK~~!t~~!9K'LQR&R 	0/	0R&Rr#   c           	      ^   t        | j                        }t        d t        |d         D              |d<   dj	                  t        |j                               D cg c]  \  }}| d|  c}}      }t        j                  |j                  d            j                         S c c}}w )Nc              3   <   K   | ]  \  }}|t        |      f  y wr'   )r]   ).0kvs      r!   	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>   s     (htq!!Yq\):(hs   r   _-r,   )
r   __dict__r   sortedjoinr   rX   rY   encoder[   )r   	hash_dictnamevalkeys        r!   hashzCUDAOptions.hash   s    '	#((hviXeNfGg(h#h	- hh	@Q9RSID#4&#ST~~cjj12<<>> Ts   B)
)$__name__
__module____qualname__ri   r$   __annotations__rj   rl   rm   rn   r   ro   r   rL   rp   rA   rq   rr   boolrs   rt   rw   r   rx   rz   r}   r~   r   r   r   r   r   rJ   r   r   r   r)   r#   r!   rg   rg   i   s   IsHcJIs "GXc]!#L%#KK!%K#%!d!$)T)J'<%*<46%uSz6'--/I %*I*.!4.KE4L#"t"D# "#"0?r#   rg   c                        e Zd ZdZedefd       Zd ZdefdZ	deddf fdZ
defdZd	 Zd
 Zdeeef   fdZd Zed        Zed        Zd Zd Zd Zd Zd Z ej6                         d        Z xZS )CUDABackendNr   c                      | j                   dk(  S )Nr   )backend)r   s    r!   supports_targetzCUDABackend.supports_target   s    ~~''r#   c                     d}t        j                  ||      }|st        d|       t        |j	                  d            S )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r   )re	fullmatch
ValueErrorr$   group)r   rJ   patternmatchs       r!   _parse_archzCUDABackend._parse_arch   s@    Wd+GyQRR5;;q>""r#   r   c                 B    | j                  |j                        }d| S )Ncuda:)r   rJ   )r   rM   r^   s      r!   get_target_namezCUDABackend.get_target_name   s#    %%gll3
zl##r#   c                 2    t         |   |       d| _        y )Ncubin)super__init__
binary_ext)r   r   	__class__s     r!   r   zCUDABackend.__init__   s     !r#   c                    dt         j                  j                  xs d| j                  j                   i}|j                  t        j                  j                         D ci c]  }||v s||   |||    c}       t        | j                  |d               }|j                  dd      dkD  r|dk  rt        d| d      d|vrFt        t        j                        }|d	k\  r|j                  d
       t!        t#        |            |d<   d|vr
|dk\  rd|d<   d|vrt         j$                  j&                  |d<   |dk(  rdnd|d<   t        di |S c c}w )NrJ   smrj   r   r?   zBnum_ctas > 1 requires NVIDIA SM90+ (Hopper). Current target is sm_zM. This configuration will fail. Please set num_ctas=1 or target an SM90+ GPU.rw   Y   fp8e4nvrx   )rv   rr   i   @r   r~   r)   )r
   runtimeoverride_archr   rJ   updaterg   __dataclass_fields__keysr$   r   r   r   setrw   addr   r   languagedefault_fp_fusion)r   optsargsr   r^   rw   s         r!   parse_optionszCUDABackend.parse_options   so   33NDKK<L<L;M7NO)I)I)N)N)PuATUY]T]aefgahatQQZuv))$v,78
88J"Q&:? !66@\ BNO Q Q "-#&{'G'G#H R$((3+08L1M+ND'(.d:R<J89T)',~~'G'GD#$9Cr9Iq,-"T""/ vs   #	E-E3Ec                     |j                   |j                  |j                  |j                  d   |j                  d   |j                  d   fS )Nr   r      )ri   rj   sharedro   )r   metadatas     r!   pack_metadatazCUDABackend.pack_metadata   sO    OO!!!$!!!$!!!$
 	
r#   c                     dd l mc mc m} t	        | j                  |j                              }|dk\  r|j                  n|j                  t        | j                        d}|S )Nr   r9   )convert_custom_typesr%   )triton.language.extra.cudar   extrar   r$   r   rJ   convert_custom_float8_sm80convert_custom_float8_sm70r%   r   )r   rM   r   r^   codegen_fnss        r!   get_codegen_implementationz&CUDABackend.get_codegen_implementation   sV    11))',,78
 0:R/?D++TEdEd%

 r#   c                     ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r!   get_module_mapzCUDABackend.get_module_map   s    819==r#   c                     t        j                  |       t        j                  r t        j                  j                  |       y y r'   )r	   load_dialectsr   instrumentation)r   ctxs     r!   r   zCUDABackend.load_dialects   s2    S!&&''55c: 'r#   c                    t        j                  | j                        }|j                          t        j
                  j                  |       t        j                  j                  |       |dz  dk  rt        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j
                  j                  |       t        j                  j                  |       |j!                  |        | S )Nr<   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optr^   pms        r!   	make_ttirzCUDABackend.make_ttir   s    __S[[)
!!"%..r2aKK@@D''+#))"-b!$$R(##B'
s
r#   c                    |j                   H| j                  dt        j                  | j                        j                  |j                                t        j                         }|j                  <|j                  d   |_	        |j                  d   |_
        |j                  d   |_        t        j                  | j                        }|j                         }t        j                  j!                  |d| |j"                  d|j$                         t        j&                  j)                  |       |dz  dk\  rt        j&                  j+                  |       t        j                  j,                  j/                  ||       t        j&                  j1                  |       t        j&                  j3                  |       t        j&                  j5                  |       t        j&                  j1                  |       t        j&                  j7                  ||d	k\         t        j                  j,                  j9                  |       t        j                  j;                  |       |dz  d
v rFt        j&                  j=                  |       t        j>                  jA                  |       t        j                  jC                  |       t        j>                  jA                  |       t        j&                  jE                  |       t        j                  jF                  jI                  ||jJ                  |       t        j&                  jM                  ||jJ                         t        j&                  jO                  |       t        j&                  jQ                  ||jJ                  |       n|dz  dk\  rt        j&                  j=                  |       t        j>                  jA                  |       t        j                  jC                  |       t        j&                  jS                  |       t        j&                  jU                  |d       t        j                  j,                  jW                  |       t        j&                  jM                  ||jJ                         t        j&                  jO                  |       t        j&                  jY                  ||jJ                         t        j&                  jQ                  ||jJ                  |       t        j&                  jE                  |       t        j&                  jU                  |d       t        j                  j,                  j[                  |       nt        j                  jC                  |       t        j>                  jA                  |       t        j                  j;                  |       t        j&                  j]                  |       t        j&                  j7                  ||d	k\         t        j&                  j_                  |       t        j                  j,                  ja                  |       t        j&                  j1                  |       t        j                  j,                  jc                  |       t        j&                  je                  |       t        j&                  jg                  |       t        j                  j;                  |       t        j>                  ji                  |       |dz  dk\  r)t        j                  j,                  jk                  |       t        j                  j,                  jm                  ||       t        j                  j,                  jo                  |       t        j>                  jq                  |       t        j>                  js                  |       t        j>                  jA                  |       |ju                  |        |j                  |j                  |j                  f|d<   | jw                         }||d<   | S )Nzttg.maxnregr   r   r   r   r   r<   r   r9   )r   r   FTr   ro   tensordesc_meta)<rn   set_attrr   builderr   get_int32_attrr	   ClusterInforo   clusterDimXclusterDimYclusterDimZr   r   r   r   add_convert_to_ttgpuirri   rj   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operands add_optimize_descriptor_encodingadd_loop_aware_cseadd_fuse_nested_loopsr   r   add_triton_licm add_combine_tensor_select_and_ifhopperadd_hopper_warpspecrl   add_assign_latenciesadd_schedule_loopsadd_pipelineadd_optimize_accumulator_initadd_hoist_tmem_allocadd_promote_lhs_to_tmemadd_warp_specializeadd_remove_tmem_tokensadd_prefetchadd_coalesce_async_copyadd_optimize_tmem_layoutsadd_interleave_tmemadd_reduce_data_duplicationadd_reorder_instructionsr   add_tma_loweringadd_fence_insertionadd_lower_mmaadd_sccpr   r   get_tensordesc_metadata)r   r   r   r^   cluster_infor   dump_enabledr   s           r!   
make_ttgirzCUDABackend.make_ttgir   si    ;;"LL

3;;(?(N(Ns{{([\))+''*'7'7':L$'*'7'7':L$'*'7'7':L$__S[[)(**2zl/CS]]TVX[XdXde##B'q NN))"-,,R>44R833B7,,R044R800Z25EF@@D&&r*v%NN004MM++B/KK''+MM++B/NN;;B?MM  44RVNN//CNNCNN--b1NN''CNNLI2#NN004MM++B/KK''+NN88<NN//E:MM##;;B?NN//CNNCNN--b1NN..r3>>BNN''CNNLINN;;B?NN//D9MM##::2>KK''+''+&&r*##B'00Z25EF..r299"=44R833B72226//3&&r*$$R(q MM##44R833B
C--b1r"b!''+
s$0$<$<l>V>VXdXpXp#q 557&5"#
r#   c                 $   |}t        j                  |j                        }|j                          t        j
                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       |j                  |       |j!                         |d<   |S )Nr   )r   r   r   r   r   gluonr   add_resolve_auto_encodingsr   r#  r   r  r   r  r  r   r$  )r   srcr   rM   r^   r   r   s          r!   gluon_to_ttgirzCUDABackend.gluon_to_ttgirE  s    __S[[)
  $//3r"&&r*&&r*77;
s&)&A&A&C"#
r#   c                 
   t        || j                  j                        }|}t        j                  |j
                        }|j                          t        j                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j                  j                  j                  |||       t        j                  j                  j!                  |       t"        j$                  j&                  rt        j                  j)                  |       t        j                  j+                  |       t        j                  j                  j-                  ||       t.        j0                  r+t.        j0                  j3                  d||j
                         t        j                  j                  j5                  |||       t        j6                  j9                  |       t        j6                  j;                  |       t        j                  j                  j=                  |       t        j                  j                  j?                  |       t        j6                  j9                  |       t        j6                  j;                  |       t        j6                  jA                  |       t        j                  jC                  |       t"        j$                  jD                  st        jF                  jI                  |       t.        j0                  r+t.        j0                  j3                  d||j
                         |jK                  |       tM        jN                          tM        j
                         }t"        j$                  jP                  rtS        d      tM        jT                  ||      }	tW        |      }
tY        || j                  j                        }d}t        jZ                          tM        j\                  |	||
|       t        j^                  |	       |j`                  rIt        jb                  |	      r4|j`                  D cg c]  \  }}|	 }}}tM        jd                  |	|       tM        jf                  |	tL        jh                         |jk                  d      }|||d<   |jk                  d      |d<   |jk                  d	      |d
<   |jk                  d      |d<   |jk                  d      |d<   |jk                  d      xs d|d<   |jk                  d      xs d|d<   tm        |	      }~	~|S c c}}w )Nttgpuir_to_llvmirllvmir_to_llvmzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudazttg.total-num-warpsri   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_alignzttg.profile_scratch_memory_sizer   profile_scratch_sizez$ttg.profile_scratch_memory_alignmentr   profile_scratch_align)7rN   r   rJ   r   r   r   r   r   r  r  add_allocate_warp_groupsconvertadd_scf_to_cfr	   add_allocate_shared_memory_nvr  add_allocate_tensor_memoryr
   compilationenable_experimental_consanadd_concurrency_sanitizer"add_allocate_global_scratch_memoryadd_proxy_fence_insertionr   r   patchadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   add_nvvm_to_llvmdisable_line_infollvmiradd_di_scoper   r   init_targetsenable_asanrD   	to_modulerd   rT   set_short_ptrattach_datalayoutset_nvvm_reflect_ftzr   has_extern_depslink_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrrA   )r   r+  r   rM   r^   rL   r   r   r   llvm_modprocrS   tripler   r0   pathstotal_num_warpsrets                     r!   	make_llirzCUDABackend.make_llirU  s   27DKK<L<LM__S[[)
77;//3$$R(;;B
KX::2>77NN44R899"=99"jI&&''--.A2s{{S++B
KH''+b!11"5;;B?''+b!$$R(''+  22MM&&r*&&''--.>CKKP
s,,.((km m>>#w/&z2)9)9:&xx@##H-6#9#9(#C.5.A.ABltTTBEB!!(E2Xt'7'78 **+@A&$3H[! --l; # 0 01I J*-*:*:;[*\&'+.+;+;<a+b'(+.+;+;<]+^+cbc'(,/,<,<=c,d,ihi()(m
' Cs   =U-c           	         t        || j                  j                        }d}t        |      }t	        || j                  j                        }t        j                  ||||g |j                  d      }	t        j                  d|	      }
t        |
      dk(  sJ |
d   |d<   |dz   d|dz   }t        j                  d	d
| |	t        j                        }	t        j                  dd| |	t        j                        }	t        j                  dd|	      }	t        j                  j                  rt!        d       t!        |	       |	S )Nr0  Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r   r   r   r<   r6   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*ra   z // -----// NVPTX Dump //----- //)rN   r   rJ   rd   rT   r   translate_to_asmrr   r   findalllensub	MULTILINEr
   r	   
dump_nvptxprint)r   r+  r   r   r^   rL   rU  rT  rS   rX  namess              r!   make_ptxzCUDABackend.make_ptx  s$   238H8HI&&z2T[[%5%56##CxSEYEY[`a

FL5zQ 8$b);r>*:;ff*i},EsRTR^R^_ff';zl)CSPRP\P\]ff+R5<<""45#J
r#   c           
         t               j                  }t        j                  ddd      5 }t        j                  ddd      5 }|j	                  |       |j                          |j                  dz   }g }	t        j                  j                  r|	dd	gz  }	n't        j                  j                  r|	d
gz  }	n|	dgz  }	|j                  rg ndg}
t        |      }t        j                  j                  rddgng }|j                  r|j                  j                  d      ng }|g|	|
d||d| |j                  d|}	 t!        j"                  |dd|       t        j                  j$                  r7t'        |j                        5 }t)        |j+                                d d d        t,        j                  j/                  |j                        rt-        j0                  |j                         t,        j                  j/                  |j                        rt-        j0                  |j                         t'        |d      5 }|j+                         }d d d        t,        j                  j/                  |      rt-        j0                  |       d d d        d d d        S # 1 sw Y   xY w# t         j2                  $ r}t'        |j                        5 }|j+                         }d d d        n# 1 sw Y   nxY wt,        j                  j/                  |j                        rt-        j0                  |j                         |j4                  dk(  rd}n2|j4                  dt6        j8                  z   k(  rd}nd|j4                   }| d ddj;                  |       d}t)        d| d| d       t=        |      d }~ww xY w# 1 sw Y   wxY w# 1 sw Y   HxY w# 1 sw Y   S xY w) NFwz.ptx)deletemoderc   rz.logz.oz	-lineinfoz-suppress-debug-infoz-gz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
zC

================================================================
z

zy
================================================================
please share the reproducer above with Triton project.
rV   )r*   r0   tempfileNamedTemporaryFilewriteflushr   r
   r;  rE  r	   disable_ptxas_optrr   rd   rp   rC   r.   r   dump_ptxas_logrW   rb  rZ   osexistsremoveCalledProcessError
returncodesignalSIGSEGVr   r   )r   r+  r   r   r^   r(   fsrcflogfbin
debug_infofmadrJ   disable_optptx_extra_options	ptxas_cmdlog_fileelogerrorr\   r   s                        r!   
make_cubinzCUDABackend.make_cubin  s     ((CO G	 SW''u3vNG	 RVJJsOJJL99t#DJ  22{,BCC
//tf$
 {m+
--2N3CD*:6D 38,,2P2P=#.VXK ?Boo 5 5c :SU "%)+/2=@QU`ae`fSgimirirI$(ydS<<..dii /Hhmmo./ 77>>$)),IIdii(77>>$)),IIdii(: dD! !Q!ww~~d#		$OG	  G	 P O/ / 00 ($))_ *"--/C* * *77>>$)),IIdii(<<3&?E\\S6>>%994E=all^LE!7 #--0E 2++.88I+>*?rC       !''5(8! !IG	  G	  G	 P s   O0DO#
AJ<J/,BJ<O#O!<O#O0/J94J<<OO%K?	6	O?LC
OOO#O O##O-	(O00O:c                      j                  j                        |t        j                  k(  r fd|d<    fd|d<   n|t        j                  k(  r
 fd|d<    fd|d<    fd|d	<    fd
|d<   y )Nc                 ,    j                  | |      S r'   )r   r+  r   r^   rM   r   s     r!   <lambda>z(CUDABackend.add_stages.<locals>.<lambda>  s    4>>#xQXZd3e r#   r   c                 ,    j                  | |      S r'   )r'  r  s     r!   r  z(CUDABackend.add_stages.<locals>.<lambda>  s    DOOCSZ\f4g r#   ttgirc                 ,    j                  | |      S r'   )r,  r  s     r!   r  z(CUDABackend.add_stages.<locals>.<lambda>  s    D4G4GXW^`j4k r#   c                 ,    j                  | |      S r'   )rY  r  s     r!   r  z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8WV`/a r#   llirc                 T    j                  | |j                  j                        S r'   )rd  r   rJ   r+  r   rM   r   s     r!   r  z(CUDABackend.add_stages.<locals>.<lambda>  s#    dmmC7TXT_T_TdTd.e r#   ptxc                 T    j                  | |j                  j                        S r'   )r  r   rJ   r  s     r!   r  z(CUDABackend.add_stages.<locals>.<lambda>  s#    XwX\XcXcXhXh0i r#   r   )r   rJ   r   TRITONGLUON)r   stagesrM   r   r^   s   ` ` @r!   
add_stageszCUDABackend.add_stages  se    %%gll3
x&eF6NgF7O'kF7Oaveuiwr#   c                 L    t               }| d| j                  j                   S )Nr   )r4   r   rJ   )r   r3   s     r!   r   zCUDABackend.hash
  s&    #%!DKK,,-..r#   )r   r   r   r   staticmethodr   r   r   rA   r   r   r   r   r   r   r   r   r   r   r   r'  r,  rY  rd  r  r  	functools	lru_cacher   __classcell__)r   s   @r!   r   r      s    O(	 ( (#$# $"y "T "#S #6
>S*_ 5 >;
    M M^ FP,JX	j Y/ /r#   r   ),triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r	   tritonr
   triton.runtime.errorsr   dataclassesr   r  typingr   r   r   r   typesr   rX   r   rr  r}  rx  r.   pathlibr   r%   
NvidiaToolr*   r  r4   r$   rI   rN   rT   r]   rd   rg   r   r)   r#   r!   <module>r     s-   E E 8 8  , !  - -   	   	  # #5##    iS i i.  
 
 
 T4 4
& & $)? )? )?Xw/+ w/r#   