
    biQ                         d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlmZ d	efd
Zd Zd Z ed       G d d             Z G d de      Zy)    )BaseBackend	GPUTargetLanguage)irpassesllvmamd)knobs)	dataclass)AnyDictTuple)
ModuleTypeN)Pathtargetc                     d S )Nc                      y)N   r   r    )lhs_typerhs_types     V/var/www/html/engine/venv/lib/python3.12/site-packages/triton/backends/amd/compiler.py<lambda>z"get_min_dot_size.<locals>.<lambda>   s        r   r   s    r   get_min_dot_sizer      s
     0/r   c                     t         j                  j                  | dk(  xs | dk(  xr |du S t         j                  j                  S )Ngfx942gfx950T)r
   r	   use_block_pingpong)archuse_async_copys     r   is_pingpong_schedule_enabledr$      sI    --5 HM!1!Ln6L X;@99;W;WXr   c                 t    t         j                  j                  | dk(  S t         j                  j                  S )Nr   )r
   r	   use_in_thread_transposer"   s    r   is_in_thread_transpose_enabledr(      s.    !&!B!B!JDHqPUPYPYPqPqqr   T)frozenc                   `   e Zd ZU dZeed<   dZeed<   dZeed<   dZeed<   dZ	e
ed	<   d
Zeed<   dZeed<   dZeed<   dZeed<   dZee   ed<   dZee   ed<   dZeed<   dZee   ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<   d Zeed!<   d"Zeed#<   d$Zeed%<   d& Z d' Z!y)(
HIPOptions   	num_warpsr   waves_per_eu   
num_stagesnum_ctasNextern_libsr   cluster_dimsFdebugTsanitize_overflowr"   )fp8e4nvfp8e5fp8e5b16fp8e4b8supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypesieeedefault_dot_input_precision)r<   allowed_dot_input_precisionsenable_fp_fusionlaunch_cooperative_gridr   matrix_instr_nonkdimkpackallow_flush_denormmax_num_imprecise_acc_defaulthipbackend_name instrumentation_modenoneschedule_hintc                    t        | j                  dd       }|dk\  rdnd}t        j                  | d|       | j                  dkD  r| j                  | j                  dz
  z  dk(  sJ d	       | j                  d
k(  rI| j
                  dk7  r:t        j                  d| j
                   d       t        j                  | dd       t        t              j                  dz  }| j                  i nt        | j                        }dD ]  }t        || dz        ||<    t        j                  | dt        |j                                      y )N   
       @   	warp_sizer   r   znum_warps must be a power of 2r    zckpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = z7 will be overwritten to 1 to make transitioning easier.rB   lib)ocmlocklz.bcr2   )intr"   object__setattr__r-   rB   warningswarnr   __file__parentr2   dictstrtupleitems)self	gfx_majorrQ   default_libdirr2   rR   s         r   __post_init__zHIPOptions.__post_init__G   sG   		!B(	#r/Br	4i8~~!t~~!9K'LQR&R 	0/	0R II!

aMMuvz  wA  wA  vB  By  z tWa0h..6 ,,4b$t?O?O:P# 	AC">se3K#?@K	A4k6G6G6I0JKr   c           	          dj                  | j                  j                         D cg c]  \  }}| d|  c}}      }t        j                  |j                  d            j                         S c c}}w )N_-zutf-8)join__dict__r_   hashlibsha256encode	hexdigest)r`   namevalkeys       r   hashzHIPOptions.hashZ   s]    hh9L9L9NOID#4&#OP~~cjj12<<>> Ps   A4
)"__name__
__module____qualname__r-   rU   __annotations__r.   r0   r1   r2   r\   r3   r^   r4   boolr5   r"   r]   r:   r   r;   r=   r>   r?   r@   rA   rB   rC   rD   rF   rH   rJ   rc   rp   r   r   r   r+   r+      s   IsL#JHcK#L%#E4"t"D#
 (S%*R46%uSz6'--/9 %*9!d!$)T) !#!E3N$$)*!3*L# "#"  M3L&?r   r+   c                   F    e Zd ZdZedefd       Zdeddf fdZdefdZ	de
fdZd Zd	 Zdeeef   fd
Zd Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zed        Zd Z ej:                         d        Z xZS )
HIPBackendNr   c                      | j                   dk(  S )NrE   )backendr   s    r   supports_targetzHIPBackend.supports_targetb   s    ~~&&r   returnc                 j    t         |   |       t        |j                  t              sJ d| _        y )Nhsaco)super__init__
isinstancer"   r]   
binary_ext)r`   r   	__class__s     r   r   zHIPBackend.__init__f   s+     &++s+++!r   c                      d|j                    S )Nhip:r'   r`   optionss     r   get_target_namezHIPBackend.get_target_namek   s    gll^$$r   c                 \   dt         j                  j                  xs | j                  j                  i}|j                  dd      dkD  rt        d      | j                  j                  dk(  rBt        t        j                        }|j                  dh       t        t        |            |d<   d|vr%t        t        t        j                              |d<   | j                  j                  d	k(  rCt        t        j                        }|j                  d
dh       t        t        |            |d<   d|vrt         j                  j                   |d<   |j                  t        j"                  j%                         D ci c]  }||v r||   |||    c}       t        di |S c c}w )Nr"   r1   r   z'num_ctas > 1 not supported for AMD GPUsr   tf32r>   r:   r    r8   r9   r;   r?   r   )r
   runtimeoverride_archr   r"   get
ValueErrorsetr+   r>   updater^   sortedr:   r;   languagedefault_fp_fusion__dataclass_fields__keys)r`   optsargsr>   r;   ks         r   parse_optionszHIPBackend.parse_optionsn   su   33Gt{{7G7GH88J"Q&FGG ;;x'+.z/V/V+W((//938@\9]3^D/0!-+0
8W8W1X+YD'(;;x'03J4`4`0a--44j)5LM8=fEf>g8hD45T)',~~'G'GD#$)H)H)M)M)O ;AT	d1g&9 QZ ; 	<!D!!;s   F)c                     |j                   |j                  |j                  |j                  d   |j                  d   |j                  d   fS )Nr   r   r/   )r-   r1   sharedr3   )r`   metadatas     r   pack_metadatazHIPBackend.pack_metadata   sO    OO!!!$!!!$!!!$
 	
r   c                 0    dt        | j                        iS )Nmin_dot_size)r   r   r   s     r   get_codegen_implementationz%HIPBackend.get_codegen_implementation   s     0 =>>r   c                     ddl m} d|iS )Nr   )	libdeviceztriton.language.extra.libdevice)triton.language.extra.hipr   )r`   r   s     r   get_module_mapzHIPBackend.get_module_map   s    719==r   c                     t        j                  |       t        j                  r t        j                  j                  |       y y N)r	   load_dialectsrw   instrumentation)r`   ctxs     r   r   zHIPBackend.load_dialects   s2    #%%&&44S9 &r   c                     dd l }d}t        | d      r| j                         |k  S t        | |j                        r-t        | d      r!| j                         j                         |k  S y)Nr   i	ptr_rangeuntyped_storageF)torchhasattrr   r   Tensorr   size)argr   
MAX_INT_32s      r   is_within_2gbzHIPBackend.is_within_2gb   s]    
3$==?j00c5<<(WS:K-L&&(--/:==r   c                 H    t        j                  |       }d| v r|ddggz  }|S )NSztt.pointer_rangerO   )r   
parse_attr)descrets     r   r   zHIPBackend.parse_attr   s1    $$T*$;',--C
r   c                     t        j                  | |fi |}t        j                  j                  r|dk(  rt
        j                  |       r|dz  }|S )Ntensorr   )r   get_arg_specializationr
   r	   use_buffer_opsrw   r   )r   tykwargsr   s       r   r   z!HIPBackend.get_arg_specialization   sJ    00bCFC 99##h:;S;STW;X3JC
r   c                    t        j                  | j                        }|j                          t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j!                  |       |j#                  |        | S r   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_triton_licmadd_symbol_dceadd_loop_unrollrun)modr   r   pms       r   	make_ttirzHIPBackend.make_ttir   s    __S[[)
!!"%..r2<<R@''+#))"-b!##B'$$R(##B'
s
r   c                    t        j                  | j                        }|j                          t        j
                  j                  |d|j                   |j                  |j                  |j                         |j                  |        t        j                  | j                        }|j                          t        j                  j                  |       t        j                  j                  |       t        j                  j                  |       t         j                  j                  j#                  ||j                  |j$                  |j&                         t        j                  j                  |       t         j                  j                  j)                  |       t        j                  j+                  |d       t         j                  j                  j-                  |       t        j                  j/                  |       t        j0                  j3                  |       t        j
                  j5                  |       t        j0                  j3                  |       t6        j                   j8                  }t6        j                   j:                  }t6        j                   j<                  }t?        |j                  |      }t         j                  j                  jA                  ||jB                  ||||       |r4t         j                  j                  jE                  ||j                         t        j0                  j3                  |       |jF                  jI                         dk7  r4t         j                  j                  jK                  ||jF                         t        j                  j+                  |d       t        j                  j                  |       t        j                  jM                  |       tO        |j                        rHt         j                  j                  jQ                  |       t        j                  j                  |       t         j                  j                  jS                  |       |rC|jB                  dkD  r4t         j                  j                  jU                  ||jB                         t6        j                   jV                  rt         j                  j                  jY                  |       t        j0                  j3                  |       t         j                  j                  j[                  ||j                  t6        j                   j\                         t         j                  j                  j_                  |       t        j0                  j3                  |       t        j0                  ja                  |       t        j0                  jc                  |       |r4t         j                  j                  je                  ||j                         |j                  |        | S )Nr   TrI   r   )3r   r   r   r   r   r   add_convert_to_ttgpuirr"   r-   rQ   r1   r   ttgpuiradd_coalesceadd_remove_layout_conversionsadd_optimize_thread_localityr	   add_accelerate_matmulrA   rB   add_optimize_epilogueadd_optimize_dot_operandsadd_hoist_layout_conversionsadd_fuse_nested_loopsr   r   r   r
   global_prefetchlocal_prefetchr#   r$   add_stream_pipeliner0   add_coalesce_async_copyrJ   lowerinsert_instruction_sched_hintsadd_reduce_data_duplicationr(   add_in_thread_transposeadd_reorder_instructionsadd_block_pingpongr   add_canonicalize_pointersadd_convert_to_buffer_opsuse_buffer_atomicsadd_fold_true_cmpir   r   add_update_async_wait_count)r   r   r   r   r   r   r#   r!   s           r   
make_ttgirzHIPBackend.make_ttgir   s   __S[[)
**2gll^/DgFWFWY`YjYj+2+;+;	=
s__S[[)
##B'44R833B7

00W\\7C_C_ahanano44R8

00400T:

77;,,R0''+##B'''+))3311119',,W

..r73E3EXfhv/A	CJJ66r7<<H''+  &&(F2JJ==b'BWBWX00T:44R82226)',,7JJ66r:NN88<

33B7'"4"4q"8JJ11"g6H6HI99##JJ88<MM++B/JJ88W\\599KgKgh

--b1''+b!$$R(JJ::2w||L
s
r   c                    | }t        j                  |j                        }|j                          t        j
                  j                  |       t        j
                  j                  |       t        j                  j                  |       t        j                  j                  |       t        j
                  j                  |       t        j                  j                  |       |j                  |       |S r   )r   r   r   r   r   gluonr   add_resolve_auto_encodingsr   add_sccpr   add_loop_aware_cser   r    add_combine_tensor_select_and_ifr   )srcr   r   r   r   s        r   gluon_to_ttgirzHIPBackend.gluon_to_ttgir  s    __S[[)
  $//3r"&&r*&&r*77;
s
r   c                 V   | }t        j                  |j                        }|j                          d}t        j
                  j                  j                  ||j                  |       t
        j                  j                  |       t
        j                  j                  |       t        j
                  j                  j                  |       t        j                  r+t        j                  j                  d||j                         d}t        j
                  j                  j!                  ||j                  |       t
        j"                  j%                  |       t
        j"                  j'                  |       t
        j                  j)                  |       t
        j                  j+                  |       t
        j"                  j%                  |       t
        j"                  j'                  |       t
        j"                  j-                  |       |j.                  j1                         dk7  r?t        j
                  j                  j3                  ||j                  |j4                         t        j                  r+t        j                  j                  d||j                         t6        j8                  j:                  st
        j<                  j?                  |       t        j
                  j                  jA                  ||       |jC                  |       tE        jF                          tE        j                         }tE        jH                  ||      }t	        jJ                  |       d}	t6        j8                  jL                  rd}	tE        jN                  |t        jP                  |j                  |	       t	        jR                  ||j                         t	        jT                  |d       t	        jV                  |d	d
       t	        jV                  |dd       t	        jV                  |dd
       t	        jV                  |d|jX                  dk(         |j[                         D 
cg c]  }
|
j]                         r|
 }}
|d   j_                  t        j`                         |d   jc                  dd|jd                  |jX                  z          |d   jc                  d|jf                          |jh                  rdnd}|d   jc                  d|       t6        j8                  jL                  r'|d   jk                  d       |d   jm                          t	        jn                  |d          t6        j8                  jL                  r\tq        tr              jt                  dz  }tw        |dz        tw        |dz        tw        |dz        g}tE        jx                  ||       ne|jz                  rY|jz                  D cg c]  \  }}t	        j|                  ||      s|  }}}t        |      dkD  rtE        jx                  ||       tE        j                  |tD        j                  |j                  dg |j                         t	        j                  |j                        r<|d   j                  d       |d   j                  d       |d   j                  d       t6        j                  j                  rt	        j                  |d          | j                  d      |d<   | j                  d      xs d|d<   | j                  d       xs d!|d"<   t	        j                  |       t	        j                  |       tw        |      S c c}
w c c}}w )#Nr   ttgpuir_to_llvmirTrI   llvmir_to_llvmrG   +xnacki  __oclc_finite_only_optF__oclc_correctly_rounded_sqrt32__oclc_unsafe_math_opt__oclc_wavefrontsize64rP   zamdgpu-flat-work-group-sizez1,zamdgpu-waves-per-euzpreserve-signr<   zdenormal-fp-math-f32rR   z
asanrtl.bczocml.bczockl.bczamdgpu-no-workgroup-id-xzamdgpu-no-workgroup-id-yzamdgpu-no-workgroup-id-zz
ttg.sharedr   zttg.profile_scratch_memory_sizeprofile_scratch_sizez$ttg.profile_scratch_memory_alignmentr   profile_scratch_align)Jr   r   r   r   r	   r   r   add_optimize_lds_usager"   convertadd_scf_to_cfadd_index_to_llvmiradd_allocate_shared_memoryrw   r   patchadd_to_llvmirr   r   r   add_cf_to_llvmiradd_arith_to_llvmirr   rJ   r   lower_instruction_sched_hintsr0   r
   compilationdisable_line_infollvmiradd_di_scopeadd_builtin_func_to_llvmirr   r   init_targets	to_moduleattach_target_tripleenable_asanattach_datalayoutTARGET_TRIPLEset_isa_versionset_abi_versionset_bool_control_constantrQ   get_functionsis_declarationset_calling_convCALLING_CONV_AMDGPU_KERNELadd_fn_attrr-   r.   rC   add_fn_target_featureadd_fn_asan_attrset_all_fn_arg_inregr   rZ   r[   r]   link_extern_libsr2   need_extern_liblenoptimize_moduleOPTIMIZE_O3r?   has_architected_sgprsremove_fn_attrscalarize_packed_fops#add_scalarize_packed_fops_llvm_passget_int_attrcleanup_bitcode_metadatadisable_print_inline)r   r   r   r   r   custom_lds_size_HIPBackend__HIP_FTZr   llvm_modtarget_featuresfnfnsdenormal_moderb   pathsrm   paths                    r   	make_llirzHIPBackend.make_llir  s   __S[[)
 

11"gllOT$$R(**2.

55b9%%&&,,-@"ckkR 	

((W\\9E''+b!''+**2.''+b!$$R(  &&(F2JJ<<RwOaOab %%&&,,-=r3;;O  22MM&&r*

55b)D
s 	,,.>>#w/  *((&Ox):):GLL/Z 	Hgll3Hc*%%h0H%P%%h0QSWX%%h0H%P%%h0H'J[J[_aJab %224PbB<M<M<OrPPA > >?A8Bw?P?PQXQbQb?b>c:de 	A0W5I5I4JL+2+E+E6A1=A((F((2F##%
 	  Q(((!(^22U:NN\12NY./NY./E
 !!(E2  .5.A.AiltTSEXEXYacgEhTiEi5zA~%%h6Xt'7'7r2wOgOgh $$W\\2F!!"<=F!!"<=F!!"<=99**33CF; !--l;+.+;+;<]+^+cbc'(,/,<,<=c,d,ihi()$$X. 	  *8}w Q@ js   8^ ^ ^%3^%c           	         t        j                  d|       }t        |      dk(  sJ |d   |d<   g }|j                  dk(  r|j	                  d       d|j
                  v rdnd	}t        j                  | t        j                  |j
                  |||j                  d
      }t        j                  j                  rt        d       t        |       |S )Nz3define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)r   r   rm   	attentionzsink-insts-to-avoid-spillsgfx11z-real-true16rG   Fz!// -----// AMDGCN Dump //----- //)refindallr!  rJ   appendr"   r   translate_to_asmr	   r  r?   r
   dump_amdgcnprint)r   r   r   namesflagsfeaturesamdgcns          r   make_amdgcnzHIPBackend.make_amdgcn  s    
 

QSVW5zQ 8
   K/LL56%,%<>"&&sC,=,=w||XW\^e^v^v',.99  56&Mr   c                 l   d}t         j                  j                  rd}t        j                  | |j
                  |      }t        j                         5 }t        j                         5 }t        |j                  d      5 }|j                  |       d d d        t        j                  |j                  |j                         d d d        t        |j                  d      5 }|j                         }	d d d        d d d        	S # 1 sw Y   txY w# 1 sw Y   NxY w# 1 sw Y   +xY w# 1 sw Y   	S xY w)NrG   r   wbrb)r
   r	  r  r	   assemble_amdgcnr"   tempfileNamedTemporaryFileopenrm   write
link_hsacoread)
r   r   r   r.  r}   tmp_outtmp_infd_infd_outr   s
             r   
make_hsacozHIPBackend.make_hsaco  s    ((&O##CG((* 	$g,,. :&&++t, 'KK&'v{{GLL9: gllD) $Vkkm$	$ 
' ': :$ $	$ 
sT   D))D D2DD)"D3D)D
DD	D)D&	"D))D3c                      |t         j                  k(  r fd|d<    fd|d<   n|t         j                  k(  r	 fd|d<    fd|d<    fd|d	<    fd
|d<   y )Nc                 *    j                  | |      S r   )r   r   r   r   r`   s     r   r   z'HIPBackend.add_stages.<locals>.<lambda>  s    4>>#xQX3Y r   r   c                 *    j                  | |      S r   )r   rT  s     r   r   z'HIPBackend.add_stages.<locals>.<lambda>  s    DOOCSZ4[ r   ttgirc                 *    j                  | |      S r   )r   rT  s     r   r   z'HIPBackend.add_stages.<locals>.<lambda>  s    D4G4GXW^4_ r   c                 *    j                  | |      S r   )r4  rT  s     r   r   z'HIPBackend.add_stages.<locals>.<lambda>  s    t~~c8W/U r   llirc                 *    j                  | |      S r   )rB  rT  s     r   r   z'HIPBackend.add_stages.<locals>.<lambda>  s    1A1A#xQX1Y r   rA  c                 *    j                  | |      S r   )rQ  rT  s     r   r   z'HIPBackend.add_stages.<locals>.<lambda>  s    Xw0W r   r}   )r   TRITONGLUON)r`   stagesr   r   s   ` ` r   
add_stageszHIPBackend.add_stages  sR    x&YF6N[F7O'_F7OUvYxWwr   c                     | j                    S r   r   )r`   s    r   rp   zHIPBackend.hash  s    ++r   ) rq   rr   rs   r   staticmethodr   rz   r   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r4  rB  rQ  r_  	functools	lru_cacherp   __classcell__)r   s   @r   rw   rw   _   sM   O'	 ' '"y "T "
%# %"S "4
?>S*_ 5 >
:
          8 8t   A AF  .  X Y   r   rw   )triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r	   tritonr
   dataclassesr   typingr   r   r   typesr   ri   rG  r8  rb  rX   pathlibr   r   r$   r(   r+   rw   r   r   r   <module>rl     sw    E E 5 5  ! # #    	   0Y 0X
r $=? =? =?@n  n r   