
    i	                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZ d dl m Z  d dl	m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dlm5Z5 d dl6Z6d dl7Z7d dl8m9c m:Z; d d	l<m=Z= d d
l>m?Z? d dl@mAZA d dlBmCZC d dl8mDZDmEZE ddgZFd dlGmHZHmIZImJZJmKZK e+rVd dlmLZLmMZMmNZN d dl7mOZOmPZPmQZQ d dlRmSZS d dlTmUZU d dlVmWZW ddlXmYZY ddlZm[Z[ ddl\m]Z] ddl^m_Z_m`Z`maZambZbmcZcmdZd ddlemfZf ddlgmhZhmiZi g dZj e,d      Zkej                  dd       Zmd d lnmoZo d d!lpmqZq d d"lrmsZs d d#ltmuZu d d$lvmwZw d d%lxmyZy d d&lzm{Z{m|Z|m}Z}m~Z~mZ d d'lmZmZ d d(lmZmZ dd)lmZ dd*lmZ ej                  d+k(  Z ej                  e      Ze7j                  j!                  ed,      Z e,d-      Zee6j(                  e6j(                  f   Ze)e-e7j,                  ee7j                  f      Zd.d/d0Zd1Zd1Zd1Zd2Zd3Zeedz
  z  d k(  red4k\  sJ d5       dd6Zd d7Z G d8 d9e6jB                        Z ejF                  d:;       G d< d=             Zd!d"d>Z	 d!	 	 	 	 	 	 	 d"d?Zej                  d#d@       Zd$dAZd%dBZd&dCZd'dDZ	 	 	 	 	 	 d(dEZd)dFZ	 	 	 	 d*dGZd+dHZ	 	 	 	 d,dIZd-dJZdK f	 	 	 	 	 d.dLZ	 	 	 	 	 	 	 	 d/dNZd0d1dOZ	 	 d2	 	 	 	 	 	 	 	 	 d3dPZ	 	 	 	 	 d4	 	 	 	 	 	 	 	 	 	 	 	 	 d5dQZd6dRZd7dSZd8dTZd9dUZd:dVZ e1dW      Z e,dXd:Y      Z G dZ d[e*e&eef         Zd;d\Z	 	 	 	 d<d]Z	 	 	 	 	 	 d=d^Z	 	 	 	 	 	 d>d_Z	 d?	 	 	 	 	 d@d`Z	 	 	 	 	 	 dAdaZÐdBdbZĐdCdcZŐdDddZƐdEdeZǐdFdfZȐdGdgZɐdHdhZʐdIdiZːdJdjZ	 	 	 	 dKdkZ͐dLdlZΐdMdmZd dlZАdNdnZg ZdMedo<   dOdpZԐdNdqZej                  	 	 	 dP	 	 	 	 	 	 	 dQdr       ZeZeZeZڐdRdsZ	 	 	 	 	 	 dSdtZ ej                  d4      dTdu       Z G dv dwe(      ZejF                   G dx dy             Z G dz d{      Z G d| d}e      Zej                  dUd~       Z G d d      Z G d de      Zej                  dVdWd       Zej                  dXd       Zej                  d#d       ZdXdZ	 d?	 	 	 	 	 	 	 dYdZ	 	 	 	 	 	 dZdZd[dZd[dZddd:d	 	 	 	 	 	 	 	 	 d\dZddd]dZddd]dZd^dZd_dZe-ee6j(                  f   Zded<   ej                  d`d       Zej                  d`d       Zej                  dad       Zej                  dbd       Zej                  dcd       ZdddZd^dZd^dZdddZdddZ	 	 	 	 	 	 	 	 dedZ	 	 	 	 df	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dgdZd#dZ  G d d      Z	 	 	 	 	 	 	 	 dhdZ	 	 	 	 	 	 	 	 dhdZdidZdjdZdkdZ	 	 	 	 	 	 	 	 dkdZ	 	 	 	 	 	 	 	 dldZej                  	 	 	 	 	 	 dmd       Z		 d?	 	 	 	 	 dndZ
dodZdpdZdqdZdqdZdrdZdsdZej                  dtd       ZdXdZej                  dXd       Zej                  dud       Zej                  dXd       ZdXdZdvdZdwdZd#dZd#dZdxdZdJdZ G d dej:                        Z	 	 	 	 	 	 	 	 	 	 dydZdzdZ 	 	 	 	 dzdÄZ!	 d?	 	 	 	 	 d{dĄZ"d|dńZ#d}dƄZ$d}dǄZ%	 	 	 	 	 	 d~dȄZ&	 	 	 	 	 	 	 	 ddɄZ'dʄ f	 	 	 	 	 	 	 	 	 	 	 dd˄Z(d̄ f	 	 	 	 	 	 	 	 	 	 	 dd̈́Z)dd΄Z*ddτZ+ejF                   G dЄ dѫ             Z,ej                  dd҄       Z-ddӄZ.ddԄZ/ddՄZ0ddքZ1	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddׄZ2dd؄Z3ddلZ4ddڄZ5ddۄZ6	 	 	 	 	 	 	 	 dd܄Z7dd݄Z8	 	 	 	 	 	 	 	 ddބZ9dd߄Z:	 d?	 	 	 	 	 	 	 ddZ;	 	 	 	 	 	 ddZ<ddZ=	 	 	 	 	 	 ddZ>d#dZ?ddZ@ddddddddZAeAj                         D  ci c]  \  } }|| 
 c}} ZC ej                  d      ZEddZFddZGddZHddZIej                  dd       ZJejF                   G d d             ZKi ZLded<   	 	 	 	 	 	 	 	 ddZM eC       ZNded<   ddZOddZPddZQ e,d      ZR e,d      ZS G d d eeReSf         ZT e0d:      d?d:d;dd       ZUddZV G d dej:                        ZWej                  dd       ZXd#dZYddZZdd	Z[dd
Z\d#dZ]ddZ^dZ_ddZ`ddZaddZb	 	 d	 	 	 	 	 	 	 	 	 ddZcddZdd#dZeddZf	 	 d	 	 	 	 	 	 	 ddZgddZh ejF                  d:;       G d d             Zie$de#f   Zje$ejeigejf   Zk G d d      Zl el       ZmddZnddZoyc c}} w (      )annotationsN)
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)datasheet_tops)DeviceProperties)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTc                     t         D  cg c]#  } t        t        |       j                         s"| % }} t	        |      dk  sJ t	        |      dk(  rd}|S |j                         }|S c c} w )Nr3   r   rC   )	GPU_TYPESgetattrtorchis_availablelenpop)x
avail_gpusgpu_types      O/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_inductor/utils.pyget_gpu_typerS   i   sg    &K'%*;*H*H*J!KJKz?aZA-vHO 4>>>3CHO Ls
   #A'A')get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32
perf_hints_Tz.cubinz.spv)rC   rE         @      zmust be power of 2c                *    | t         z   dz
  t          z  S )z/Round up to the nearest multiple of ALIGN_BYTESr3   )ALIGN_BYTES)nbytess    rR   _alignrp      s    [ 1$44    c                   t        | t        j                  t        j                  f      r#t	        t        t        | j                              S t        | t              xs! t        j                  | t              t        k(  S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdrn   )vs    rR   ry   ry      sQ    !eii+,3{AFF+,,aK599Q#<#KKrq   c                  *    e Zd ZdZdZdZedd       Zy)r{   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr3   Tc                    t        |t        t        j                  f      rt	        t        |            S t        |      r|S y N)rs   intrt   Integerrp   ry   )clsvalues     rR   evalz
align.eval   s6    ec5==12#e*%%uL rq   N)r   
sympy.ExprreturnzOptional[sympy.Expr])__name__
__module____qualname____doc__nargs
is_integerclassmethodr    rq   rR   r{   r{      s!    FEJ rq   r{   Tfrozenc                  :    e Zd ZU dZded<   ded<   ded<   ded<   y	)
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__r   rq   rR   r   r      s$    
 	G -,-- rq   r   c           
         |         t         j                  j                          t        j                  t	        d      t         j
                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t        |	      D cg c]"  }t         j                  j                  d      $ }}t        |	      D cg c]"  }t         j                  j                  d      $ }}t         j                  j                  t         j                  j                  j                  g      5 }
t         j                  j                          t        |	      D ]q  }|j                          ||   j                          t         j                  j                   j                  d	      5   |         d
d
d
       ||   j                          s t         j                  j                          t        j"                  t%        ||      D cg c]  \  }}|j                  |       c}}      }d
d
d
       t        j&                        j)                         }t*        j-                  d       t*        j-                  
j/                         j1                  dd             t3        |
j5                         D cg c]A  }|j6                  t8        j                  k(  r"t;        j<                  d|j>                        |C c}      }|r"|tA        j&                  d |D              dz  z  }t*        j-                  d|       |S c c}w c c}w # 1 sw Y   xY wc c}}w # 1 sw Y   3xY wc c}w )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArC   dtypedeviceTenable_timing   r3   
activitiesRunCudaModuleN
raw eventsself_device_time_totalsort_by	row_limitzfused_abs_max_\dc              3  4   K   | ]  }|j                     y wr   device_time_total.0events     rR   	<genexpr>zfp8_bench.<locals>.<genexpr>	  s     QE33Q        @@profiling results: %s ms)!rK   rC   synchronizeemptyr   float16Eventrecordrangezero_elapsed_timemaxprofilerprofileProfilerActivityCUDAnvtxtensorzipmeanitemlogdebugkey_averagestablerW   eventsdevice_typerV   rematchname
statistics)fnwarmuprepcachestart_event	end_event_estimate_msn_warmupn_repeatpisetimesresr   filtered_eventss                     rR   	fp8_benchr      sT    D	JJKKJu}}VLE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 BGxQA5::##$#7QKQ?DXO!!!!5OIO			NN++00
 
  
 
 


 x 	"AKKMN!!#&&7 aL!	" 	

 +.{I+FG41aQ^^AG

" **U

 
 
"CIIlIIann$$-EQS$TU 	
!!Z__4HH0%**=I	 	
	O OOQQQ	

 II(#.JO RO 
 H
 
*	
sE   "'P'PA9P2=PAP2P,9P2AP?P)$P22P<c                L    |         t         j                  j                          t        j                  t	        d      t         j                  d      }t         j                  j                  d      }t         j                  j                  d      }|j                          t        d      D ]  }|j                           |          |j                          t         j                  j                          |j                  |      dz  }t        dt	        ||z              }t        dt	        ||z              }	t        |      D ]	  } |          t         j                  j                          t         j                  j                  t         j                  j                  j                  g      5 }
t        |	      D ]  }|j                           |          t         j                  j                          d	d	d	       t        j!                  d
       t        j!                  
j#                         j%                  dd             t'        |
j)                         D cg c]0  }|j*                  t,        j                  k(  r|j.                  dk7  r|2 c}      }t1        |      |	z  dk7  rt3        dt1        |      |	      t1        |      |	z  }t'        t5        |      D cg c]  \  }}||z  dk7  r| c}}      }|j7                          |j#                         }t        j!                  d       t        j!                  |j%                  d             t9        d |D              dz  |	z  }t        j!                  d|       |S # 1 sw Y   xY wc c}w c c}}w )r   r   rC   r   Tr   r   r3   r   Nr   r   r   r   zContext Syncr   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %szprofiling time breakdown)r   c              3  4   K   | ]  }|j                     y wr   r   r   s     rR   r   z+do_bench_using_profiling.<locals>.<genexpr>b  s     A%e%%Ar   r   r   )rK   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rW   r   r   rV   r   rM   RuntimeError	enumerate_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   r   num_event_per_groupactual_eventsr   s                    rR   do_bench_using_profilingr     s    D	JJKKJuyyHE **"""6K

  t 4I1X 
 	JJ**959K 1c&;./0H1c#+,-H 8_ 
 
JJ			NN++00
 
  
 ! 
x 	AKKMD		 	

 ! IIlIIann$$-EQS$TU 	
  JOO3

n8T 	
O ?h&!+- 	
 	
 o.9 &o6	
5&&!+ 	
M !..0MII()IIm!!B!/0
A=A
AF
JX
UCII(#.J_! !$	
	
s   6AN$5N(N 
Nc                    	 ddl m}  t        j                  j	                  dd       | d uxr% t        t        t        j                  dd       d      S # t        $ r Y yt        $ r}dt        |      v sJ Y d }~yd }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr   Fztorchvision::nms does not exist)torchvision.opsr   rK   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrJ   opsImportErrorr   str)r   r   s     rR   has_torchvision_roi_alignr   g  s|    -667I6R$ 
EII}d3[*
 	
   0CF:::s   AA 	A?A?&A::A?c                b   | t        j                  d      j                  S t        | t              rt        j                  |       } | j
                  dvrZ| j                  Nt        | j
                        }t        j                  | j
                  |j                  j                               S | S )Ng        )cpumeta)index)
rK   r   r   rs   r   typer   rT   Workercurrent_devicer   device_interfaces     rR   decode_devicer  w  s    ~||C '''&#f%{{/)fll.B3FKK@||FKK/?/F/F/U/U/WXXMrq   c                |    t        j                  t        j                  | t        j
                  j                        S r   )	functoolsreduceoperatormulrt   SOne)its    rR   sympy_productr    s#    HLL"eggkk::rq   c           	         t        |       t        |      k(  sJ t        j                  t        d t	        | |      D                    S )Nc              3  ,   K   | ]  \  }}||z    y wr   r   )r   abs      rR   r   zsympy_dot.<locals>.<genexpr>  s     >daAE>s   )rM   rt   expandr   r   )seq1seq2s     rR   	sympy_dotr    s8    t9D	!!!<<>c$o>>??rq   c                \    | D ci c]  }t        |      | c}j                         S c c}w r   )r   values)r  rO   s     rR   uniquer    s'     !BqE1H!((**!s   )c           
     n   t        | t        j                        st        |t        j                        r2t        t        j                  |       t        j                  |            S t        | t
              rt        |t
              s$J |  dt        |        d| dt        |              t        | |      S )Nz: , )rs   rt   ExprrZ   sympifyr   r   runtime_ceildiv)numberdenoms     rR   re   re     s     &%**%E5::)Fu}}V,emmE.BCC fc"z%'= ("T&\N"UG2d5k];= 65))rq   c                f   | yt        |       j                  d      d   }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|j                  t        |j	                               D ci c]  }|| c}       t        | t               r| S d'||    S c c}w )(Nz*i8.r   booli1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64*)r   splitupdatelistr  rs   )key	dtype_strtysr}   s       rR   _type_ofrN    sR   
 {Cs#B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /C4 JJd3::<01112S#&3@aI/?,@@ 2s   
B.c                R    | D cg c]  }t        j                  |       c}S c c}w )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    )rt   r  lstr   s     rR   convert_shape_to_inductorrR    s!     '**EMM!***s   $c                    ddl m} t        | t              r| S t        | t        j
                        rt        |       S |j                  j                  j                  j                  | d      S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r3   VN)hint)
virtualizedrU  rs   r   rt   r   graphsizevars	shape_envcreate_symintnode)r   rU  s     rR   convert_to_symintr\    se      a 	

 !U]]+ F	 !!++==ad=Krq   c                >    | D cg c]  }t        |       c}S c c}w )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    )r\  rP  s     rR   convert_shape_to_symintr^    s     +..Qa ...s   c                N    t        d | j                  j                  D              S )z-
    Does this op overload have aliasing
    c              3  8   K   | ]  }|j                   d u  y wr   )
alias_infor   r  s     rR   r   zis_view.<locals>.<genexpr>  s     FAq||4'Fs   )any_schema	argumentsops    rR   is_viewrh    s     F1E1EFFFrq   c                     yNFr   )r   s    rR   <lambda>rk        rq   c                   | j                   dk(  syt        | j                  t        j                  j
                        s| j                  t        j                  u syt        t        j                  j
                  | j                        }|t        j                  u st        |      rt        fd| j                  D              S t        j                  j                  |j                  v xs  |      S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  6   K   | ]  }t        |        y wr   )is_pointwise_use)r   uis_pointwise_fns     rR   r   z#is_pointwise_use.<locals>.<genexpr>  s     KA#A7Ks   )rg  rs   targetrK   _ops
OpOverloadr  getitemr   rh  rw   usersTag	pointwisetags)userr  rs  s    ` rR   rp  rp    s     66_$3::uzz445xGWGW9W%**''4F!!!WV_KKKK99&++-H1HHrq   	list[Any]c           	        t         j                  j                         g dfd} j                  | gt	        t         j
                  |||f       }t        | j                  j                        dk(  r2t        | j                  j                  d   j                        dk(  r|f}j                  |       t         j                  j                  i       }|fS )Nc                `    j                  |        j                  dt                     S )Narg)appendplaceholderrM   )r  g
graph_argss    rR   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg  s,    #}}s3z?"3455rq   r3   r   Tensor)r  torch.Tensorr   r2   )rK   fxGraphrn  r#   r  rM   rd  returnsr   r   outputr1   )rs  rz   kwargsr  nodegmr  r  s         @@rR   gen_gm_and_inputsr    s     	A%'J6 1??u||^dF^LD 	FNN""#q(&&q)../8;wHHTN			b!	$Bz>rq   c                h    | dk(  ry t        |       }|j                         r|j                          y y Nr   )rT   rL   r   r  s     rR   r   r      s4    /7$$&$$& 'rq   c                    t        |       t        j                  d       t        j                         }t        |      D ]  } | | }t        |        t        j                         }J ||z
  S )Ni9  )r   rK   manual_seedtimeperf_counterr   )modelexample_inputsr   r   t0r   resultt1s           rR   timedr  (  sr     	d				B5\ 'F 
			B7Nrq   c                    t        j                  t        |      D cg c]  }t        | |||       c}      }t        j                  |      |z  }t        ||z  d       |j                         S c c}w )Nz.6f)rK   r   r   r  medianprintr   )	r  r  r   repeatbaseliner   r   timingstooks	            rR   print_performancer  :  sg     ll>CFmLuneV	4LG << 5(D	TH_S!#99;	 	Ms   A1c                H     t        | |             t        | |fd       y)zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   r   )r  s   rR   rk  z#precompute_method.<locals>.<lambda>M  s     rq   N)rJ   setattr)objmethodr  s     @rR   precompute_methodr  J  s     !WS&!#FC(rq   c                *    |D ]  }t        | |        y)zFReplace methods with new methods that returns a precomputed constants.N)r  )r  methodsr  s      rR   precompute_methodsr  P  s     '#v&'rq   c                <    t        | |kD        t        | |k        z
  S r   )r   )r  r  s     rR   cmpr  V  s    q1u:AE
""rq   c                ~    t        | t              r| g|z  S t        |       dk(  r t        |       | d   g      |z  S | S )Nr3   r   )rs   r   rM   r   )rO   sizes     rR   pad_listliker  Z  sC    !SsTz
1v{tAw!v%%Hrq   c                D    t        |       dk(  rg S dd}t        | |      S )Nr   c                n    t        | t              r| S ddlm} t        | |      sJ | j	                         S )Nr3   )rA   )rs   r   	schedulerrA   get_name)elemrA   s     rR   	sort_funcztuple_sorted.<locals>.sort_funcg  s1    dC K0$ 1222}}rq   rK  )r  rh   r   r   )rM   sorted)rO   r  s     rR   tuple_sortedr  c  s&    
1v{	 !##rq   PRV)	covariantc                  &    e Zd Zedd       ZddZy)CachedMethodc                     y r   r   )r   s    rR   clear_cachezCachedMethod.clear_cachex  s    ),rq   c                     y r   r   selfrz   r  s      rR   __call__zCachedMethod.__call__{  rl  rq   N)r   r   r   None)rz   P.argsr  P.kwargsr   r  )r   r   r   staticmethodr  r  r   rq   rR   r  r  w  s    , ,Drq   r  c           	         | j                   }d| dd| i}t        d| d d dj                         |        t        j                  |       || d         }d
fd	}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfc                8    t        |       rt        |        y y r   )r   delattrr  rK  s    rR   r  z"cache_on_self.<locals>.clear_cache  s    4D# rq   )r  r   r   r  )r   execlstripr  wrapsr  )r   r   ctxwrapperr  rK  s        @rR   cache_on_selfr    s    ;;DtfF
C *CF  E "' (+e ,			 FH "ioob!#n&=">?G &GNrq   c           
     ^   ddl m} t        | t              rgt	        j
                  t        j                  | D cg c]0  }t        |d      r"|j                  r|j                  j                  2 c}t                     S t        | |j                        r| j                  S t               S c c}w )Nr3   irr  ) r  rs   rJ  r  r  r  or_r   r  originsr!   r:   )node_scheduler  r  s      rR   aggregate_originsr    s     -&LL *4(TYY 		!!
 L
 	
 
M2??	3$$$|s   5B*
c                   t        |       }|dk(  rq|D cg c]Q  }|j                  dk(  r@d|j                  v r2|j                  d   #|j                  d   j                  j                  S }}t        t        |            }n|dk(  rg }|D ]y  }|j                  dk(  sd|j                  v s"|j                  d   d   }t        |d   t              r|j                  |d          \|j                  |d   j                         { t        t        |            }n5|dk(  r*|D cg c]  }|j                  dk(  s|j                    }}nt        |}dj                  d	g|z         S c c}w c c}w )
Noriginal_atenrn  rK   source_fn_stackr   r3   inductor_noder   fused)r  rg  r   _overloadpacketr   r  r!   rs   r   r  r   NotImplementedErrorjoin)r  descriptive_namesall_originsoriginsources	source_fns         rR   get_fused_kernel_namer    sm    $M2KO+ &
yyO+6;;.O,8	 KK(88AA
 
 G,-	g	%! 	:FyyO+0AV[[0P"KK(9:2>	ilC0NN9Q<0NN9Q<#8#89	: G,-	o	-&1
"VYY/5QFKK
 
 "!G88WI'((5
(
s   AE(%E-:E-c                	    t        |       }|D cg c]  }|j                  dk(  s| }}t        j                  t              }t        j                  t              }dt        |      rt        d |D              }t        |      dk(  r_|d   j                  t        d      s/t        j                        D 	ci c]  \  }}	|	|
 }
}}	|
_        |j                  fd       |D ]  }d	|j                  v rO|j                  d	   @t        |j                  d	   j                        }||   j!                  |j"                         d
|j                  v so|j                  d
   d   j"                  }||   j!                  |j"                          dnd}|j$                   d| ddj'                  |j)                                ddj'                  |j)                                d}|j$                   dg}t+        |j-                               D ]@  \  }}|j!                  |j$                   d| ddj'                  t+        |                    B dddlm |j!                  |j$                   d       t               }g }t3        | j4                        sddlm} 	 	 	 	 	 	 d'fd}d(d d) fd}| D ]  }	t        |	d      r|	j:                  t        |	j:                  d      r|	j:                  j<                  |	j:                  j<                  D ]  }|j"                  |v r|j?                  |j"                         |j                  jA                  |j"                        }|U |||j"                        \  }}|j!                  |j$                   d| d ||       d| d        t        |	j:                  d       s|	j:                  jB                  )|	j:                  jB                  D ]T  }|j                  jA                  |j"                        }|+ |||j"                        \  }}|j!                  d!|z          V  |D ]2  }|j!                  |j$                   d|jE                  d"#              4 |j!                  |j$                   d$d%j'                  |              |d&j'                  |      fS c c}w c c}	}w )*aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    rn  Nc              3  4   K   | ]  }|j                     y wr   )rX  )r   ns     rR   r   z&get_kernel_metadata.<locals>.<genexpr>  s     "Cq177"Cr   r3   r   )_inductor_kernel_metadata_node_to_idx_mapc                "    j                   |    S r   )r  )r  single_graphs    rR   rk  z%get_kernel_metadata.<locals>.<lambda>  s    lTTUVW rq   r  r  	from_nodezTopologically SortedUnsorted z Source Nodes: [r  z], Original ATen: []z" Source node to ATen node mapping:z   z => r  z Graph fragment:rT  c                >   t        | j                        rAt        | j                  j                        r!| j                  j                  j                  }n| j                  }||}n|j
                  }	 | j                         }||fS # t        $ r d }Y ||fS w xY wr   )rs   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutr  )bufferrw_namer  r   layoutr  s        rR   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbll3
KK9 #)++"2"2">">K"("4"4K&"D&++D"#..0F V|# + "!FV|#"s   7B BBc           	     d    ddj                  | D cg c]  }t        |       c}       dS c c}w )N[r  r  )r  r   )shaperO   s     rR   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape/  s-    499e%<c!f%<=>a@@%<s   -
c                    | y | j                          } | j                         }| j                   }dt        | j                      | | | dS )Nr  ")r  strider   r    r   )r  shape_annotationstride_annotationdevice_annotationr  s       rR   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout2  sl    >&5fkk&B%C '6v}}'E&F!'-}}o! FLL123C2D()*;)<A?rq   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)r  z2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]r  r   r   ztuple[str, ir.Layout | None])r  zIterable[int]r   r   )r  zir.Layout | Noner   r   )#r  rg  collectionsdefaultdictrJ  rM   r!   rX  r   r   nodesr  sortr   r   r  r  r   commentr  keysr  itemsr  r  rs   r:   rW  rU  r  r  addtry_get_bufferr  format_node)!r  r  r  r  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsidxr  node_to_idx_mapr  rK  sort_strmetadatadetailed_metadataoriginal_noder  	all_reads
all_writesrU  r  r
  rr  
input_namer  woutput_namer   r  r  r  s!                                 @@@rR   get_kernel_metadatar,    s   $ $M2K+6W&)):VfWNW ,,T2N$006
 L
>""CN"CC}")!,22L<)TU8A,BTBT8U"Vfc11c6"V"VIXFW     2dii'DIIo,F,Rdii0@@ACs#**4995$))#))K(+00C3&&tyy12 *6)A%zH??
1XJ&6tyyATATAV7W6X Y99%7%<%<%>?@	C  $OO,,NOP &~';';'= > 
u  s=/diiu6N5OP	

   GOO#44D!EF%/\	 "
-9&$J$UX$-$(A
 # =q-0AMM4I1=='2q}}7J7J7V]]00 66Y.$!aff-!"!7!7!?!>$-<VQVV-L*
F)00&/tJ<z.v677Mj\YZ\ AMM84,,8]]11 =!"!7!7!?!>$)8)HQ"))#*;<=-=< # 	D$$??#3t'7'7PT'7'U&VW	
 	  GOO#4Jsxx
?S>T!UVTYY0111w X #Ws   SS Sc                    t        |       } t        |       }| rV| j                         }|j                  D ]4  }|r	 ||      r||vs|j	                  |       | j                  |       6 | rV|S )zJReturns the set of nodes whose values depend on those within initial_queue)rJ  r!   rN   rw  r  r  )initial_queueskip_filterdominated_setr  users        rR   dominated_nodesr2  f  sz    
 'M}-M
  "JJ 	+D{40=(!!$'$$T*	+  rq   c                4  	 ddl m d	fd	t        |      \  }}|D cg c]  } 	|      s|j                   }}t        |       \  }}|D cg c]  } 	|      s|j                   }}t	        t        j                  g ||       S c c}w c c}w )Nr3   r  c                F   t        | j                        r | j                        S t        | j                        r | j                        S t        | j                        xr9 t        | j
                  j                  j                  j                  f       S r   )	rs   r  r  r  r;   ComputedBufferInputsKernelInputBufferTemplateBuffer)r  r  is_unrealized_nodes    rR   r9  z*gather_origins.<locals>.is_unrealized_node  s    a&%aff--a'%aff--!RYY' 

!!!!	1
 -
 	
rq   )r  r;   r   r"  )r  r  r"   r  r!   	itertoolschain)
rz   r  kwargs_flattenr   valkwargs_originsargs_flattenargs_originsr  r9  s
           @@rR   gather_originsrA  z  s     
" %V,NA-;Wc?QRU?VckkWNW"4(OL!+7SC;Mc;RCKKSLSiooE|EnEFF XSs   BBB Bc                J    dddfddfddfd |       S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    c                    t        | t        j                        xr, t        | j                        dk(  xr | j                  d   dk(  S )N   r   r   )rs   rt   MulrM   rz   )exprs    rR   is_neg_leadzsympy_str.<locals>.is_neg_lead  s:    tUYY'VC		Na,?VDIIaLTVDV	
rq   c                `   t        | t        j                        rt        | j                        dk(  rO | j                  d         r: | j                  d          d | j                  d   j                  d          S dj                  t        | j                              S  |       S )NrD  r3   r   z - z + )rs   rt   ru   rM   rz   r  rx   )rF  rG  sympy_str_muls    rR   sympy_str_addz sympy_str.<locals>.sympy_str_add  s    dEII& 499~"{499Q<'@'		!56c-		RSHYHYZ[H\:]9^__zz#mTYY"?@@ &&rq   c                    t        | t        j                        rE |       rd | j                  d          S dj	                  t        | j                              S  |       S )N-r3   z * )rs   rt   rE  rz   r  rx   )rF  rG  sympy_str_atoms    rR   rI  z sympy_str.<locals>.sympy_str_mul  s[    dEII&4  >$))A,7899zz#ndii"@AA!$''rq   c                   t        | t        j                        r| j                  S t        | t        j                  t        j
                  f      rd |        dS t        | t        t        t        t        f      rC| j                  j                   ddj                  t        t        | j                               dS t!        |       S )N()r  )rs   rt   Symbolr   ru   rE  r^   r[   r\   r]   funcr   r  rx   	sympy_strrz   r   )rF  rJ  s    rR   rM  z!sympy_str.<locals>.sympy_str_atom  s    dELL)99uyy%))45}T*+1--(HMNii(()499SDII5N+O*PPQRRt9rq   )rF  r   r   r"  rF  r   r   r   r   )rF  rG  rJ  rM  rI  s    @@@@rR   rS  rS    s$    

	'	( rq   c                    ddl m} t        j                  r3t	        |j
                  dd       x}r|j                  dk7  rt        |       S t        j                         S )Nr3   rT  current_node
index_expr)
rW  rU  rd   compute_all_boundsrJ   interpreterrs  ra   rb   unknown)r   rU  fx_nodes      rR   get_bounds_index_exprr\    sN     	!!~tDDWDNNl*5!!""$$rq   c                    | d   dk(  S )Nr   r(  r   )prefixs    rR   prefix_is_reductionr_    s    !9rq   c                J    | t         j                  k7  sJ t        | |dd      S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)r`   SIZEr_   )r^  r   s     rR   sympy_index_symbol_with_prefixrf    s)     TYY vsDdCCrq   c                N    | xs t         j                  xr t         j                  S r   )rd   debug_index_assertsassert_indirect_indexing)checks    rR   generate_assertrk    s    /V//TV5T5TTrq   c                F    | d   dk7  sJ t        j                  | dd      S )ra  r   r   Trb  )rt   rQ  r   s    rR   sympy_index_symbolrn    s)     7c>> <<d==rq   c                    	 	 	 	 	 	 dd}t        j                  |       j                  |j                         D ci c]  \  }}| |||       c}}      S c c}}w )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    c                    t        | t        j                        sJ t        |t              r,t        j                  || j
                  | j                        S |S )Nrb  )rs   rt   r  r   rQ  r   is_nonnegative)replacedreplacements     rR   	to_symbolzsympy_subs.<locals>.to_symbol  sP     (EJJ///k3'<< ++$33  rq   )rr  r   rs  zUnion[sympy.Expr, str]r   sympy.Symbol)rt   r  xreplacer  )rF  replacementsrt  kr}   s        rR   
sympy_subsry    sf    +A	 ==''(4(:(:(<=1IaO	= =s   A
c                    t        | t        j                        xs^ t        | t        j                        xrB t	        d t        j                  | j                         | j                               D              S )Nc              3  2   K   | ]  }t        |        y wr   is_symbolicr   rO   s     rR   r   zis_symbolic.<locals>.<genexpr>  s     N1AN   )	rs   rK   r/   r  rc  r:  r;  r  r  )r  s    rR   r}  r}    sS    a& 1ell# 	ON	!((*(MNNrq   c                 &    t        d | D              S )Nc              3  2   K   | ]  }t        |        y wr   r|  rb  s     rR   r   z"any_is_symbolic.<locals>.<genexpr>  s     ,!{1~,r  rc  )rz   s    rR   any_is_symbolicr    s    ,t,,,rq   c                T   ddl m} t        g d      }t        j                         r|j                  d       | j                  j                  D ]  }t        |j                        |v r|c S t        j                  j                  j                  slt        |j                  t        j                  j                        r>t        j                   j"                  j$                  |j                  j&                  v r|c S |j(                  j+                  d      x} ||      s|c S  y )Nr   )r'   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr=  )%torch.fx.experimental.symbolic_shapesr'   r!   rK   $are_deterministic_algorithms_enabledrI  rX  r  r   rs  	_inductorrd   graph_partitionrs   rt  ru  r   rx  cudagraph_unsaferz  r   get)r  r'   forbidden_setr  r=  s        rR   %get_first_incompatible_cudagraph_noder    s     L	
M  113	
"  t{{},K &&664;;

(=(=>--1A1AA
 K99==''C49Ns9SK" rq   c                    t        t        t        | j                  j                                    }|j
                  dk(  sJ |S )z$Get the output node from an FX graphr  )nextiterreversedrX  r  rg  )r  	last_nodes     rR   output_noder  S  s6    T(288>>234I<<8###rq   c                    | j                   j                  d      }t        d |D              }t        |       j                  d   }t        |t              r|n|f}t        d |D              }||z  S )Nr  rf  c              3     K   | ]P  }t        |j                  j                  d       t        j                        r|j                  d    j
                   R ywr=  N)rs   r   r  rK   r  r   )r   r  s     rR   r   z"get_all_devices.<locals>.<genexpr>\  sB      9diimmE*ELL9 			%9s   AAr   c              3     K   | ]t  }t        |t        j                  j                        rNt        |j                  j                  d       t        j                        r|j                  d    j                   v ywr  )rs   rK   r  r2   r   r  r  r   )r   r  s     rR   r   z"get_all_devices.<locals>.<genexpr>d  sS      7c588==)sxx||E*ELL9 	7s   A:A<)rX  
find_nodesr!   r  rz   rs   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rR   get_all_devicesr  Z  s}    ++}+=.8 9%9 /M "o""1%G$We4w7*H,6 77 -K ;&&rq   c                    t        t        j                  j                               D ]'  } | j	                  d      st        j                  |    }|j
                  j                         D ]  }|j	                  d      st        ||      }t        |t        j                  j                  j                  j                        sZ|j                  D ]i  }t        |t        j                  j                  j                  j                        s<|j                  j                   j"                  j%                          k  t        j                  | = * dt        j                  v rRt        j                  d   }t'        |j(                  j*                  j,                        `|j(                  j*                  `t1        j2                          y )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rJ  sysmodulesr  
startswith__dict__rJ   rs   rK   r  runtimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r   driveractiveutilsinstancegccollect)module_namem	attr_namer  r  r  s         rR   unload_xpu_triton_pydsr  p  sR   CKK,,./ %%%&NOKK$* 	<I##I. I.EOO33EEVV #)"8"8 <%"!OO33EEYY #MM--1199;<	< KK$!%& #++-kk12""(()2JJ#JJLrq   _registered_cachesc                    t        | d      rt        | j                        st        |  d      t        j                  |        | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r  r  s    rR   clear_on_fresh_cacher    s?    
 3&hs.Gu$GHIIc"Jrq   c                 :    t         D ]  } | j                           y)z&
    Clear all registered caches.
    N)r  r  r  s    rR   clear_cachesr    s     " rq   c              #    K   t                ddlm}  |t        j                  |            	 t
        j                  j                  t        j                  di      5  t        j                  d        |t        j                  j                  d            }t
        j                  j                  t        j                  d|i      5  d t        | t              rt        |       dk(  sJ d	       t        j                  j!                  |      rtt        j"                  |      }| j%                  |D ci c]D  }d
|vr>|t        j                  j'                  t        j                  j                  ||            F c}       ddd       ddd       |rUt)               r(t*        j,                  j/                         r
t1                t3        j4                  t)               fd       t                yc c}w # 1 sw Y   xxY w# 1 sw Y   |xY w# t6        $ r t        j9                  d        w xY w# t                w xY ww)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)dirTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictz.lockc                4    t         j                  d|      S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)rR  pathr  inductor_cache_dirs      rR   rk  zfresh_cache.<locals>.<lambda>  s    S[[@&% 6A 6 rq   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r  torch._inductor.cpp_builderr  tempfilemkdtempr   patchdictosenvironr   r   r  r  rs   rM   existslistdirrI  getsize
is_windowsrK   rE   rL   r  shutilrmtree	Exceptionr  )cache_entriesr  deleter  triton_cache_dirfilesfr  s          @rR   fresh_cacher    s     ND1(2B2Bs2KL)ZZ__JJ24FG
 	 II35GH7/:  .@BR-ST mT2}-2W4WW2ww~~&67 "

+; <%,, */$%#*!#3 !"277??277<<@PRS3T#U U	( |		 6 6 8&(MM" )l  	5 	 	H  >@RS 	sn   -I0H !A-HA-H;A	G=HHAH 2I=HH	HHH !H;;H> >I

Ic           	         | j                   }t        t        |             }t        t	        t        ||d                  S )NT)rK  reverse)__getitem__r   rM   rJ  r  r  )seqgettera_rs      rR   argsortr    s1    __F
C/C>?@@rq   c           	     2    d fd}t        |      D cg c]9  \  }}|t        |t        j                        r|j                  j
                  n|f; }}}t        |t        j                  |            }|D cg c]  \  }}|	 }}}|S c c}}w c c}}w )Nc                n    | \  }}|\  }}dfd} |||k        ry |||kD        ry||k  ry||kD  ryy)Nc                N    t        | t              r| S j                  | d      S )NT)size_oblivious)rs   r"  evaluate_expr)rF  rZ  s    rR   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate  s(    $%**4*EErq   r   r3   r   )rF  z%Union[bool, torch.SymInt, sympy.Expr]r   r"  r   )r  r  a_idxa_valb_idxb_valr  rZ  s          rR   r  zargsort_sym.<locals>.cmp  sT    uu	F
 EEM"EEM"
 5=5=rq   r  )r  tuple[int, sympy.Expr]r  r  r   r   )	r   rs   rK   r/   r  rF  r  r  
cmp_to_key)rZ  r  r  r   r   exprsr   r  s   `       rR   argsort_symr    s    4  nC 
Z5<<8affkka@E  5i22378E %&fc1c&F&M
 's   >B<Bc                t    | t         j                  k(  ryt        j                  d|       j                         S )Nrl   r   r   )rK   rF  r   element_sizer  s    rR   get_dtype_sizer    s-     ;;r'4466rq   c                      e Zd ZU ded<   y)LineContextr   contextNr   r   r   r   r   rq   rR   r  r    s    Lrq   r  c                  "    e Zd ZU ded<   ded<   y)ValueWithLineMapr   r   zlist[tuple[int, LineContext]]line_mapNr   r   rq   rR   r  r    s    J++rq   r  c                      e Zd ZdZdddZej                  dd       ZddZddZ	ddZ
ddZddZdd	Zdd
ZddZ	 	 	 	 ddZdd dZdd!dZdd!dZ	 d"	 	 	 	 	 d#dZd$dZddZd%dZd&dZy)'IndentedBuffer   c                     g | _         || _        y r   )_lines_indent)r  initial_indents     rR   __init__zIndentedBuffer.__init__(  s    GI%rq   c              #  b   K   | j                   }	 || _         d  || _         y # || _         w xY wwr   )tabwidth)r  r  prevs      rR   set_tabwidthzIndentedBuffer.set_tabwidth,  s,     }}	!$DM DMDDMs   /# /	,/c                   t               }d}g }| j                  D ]  }t        |t              r
 |       }|1t        |t              r|j                  ||j                  f       K|}t        |t              sJ |j                  |       |j                  d       |d|j                  d      z   z  } t        |j                         |      S )Nr3   r  )r   r  rs   DeferredLineBaser  r  r  r   writecountr  getvalue)r  bufr   linemaplilines         rR   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap5  s    j13++ 	&B"./t<B,2::/dC(((IIdOIIdOTZZ%%%A	&  88rq   c                6    | j                         j                  S r   )r  r   r  s    rR   r  zIndentedBuffer.getvalueI  s    '')///rq   c                f   t               }| j                  D ]  }t        |t              r
 |       }|t        |t              r.|}t        |t
              sJ |j                  d      r|j                  |d d        h|j                  |       |j                  d        |j                         S )N\r   r  )	r   r  rs   r  r  r   endswithr  r  )r  r  r  r  s       rR   getrawvaluezIndentedBuffer.getrawvalueL  s    j++ 	 B"./t<B,dC(((}}T"		$s)$		$		$	   ||~rq   c                8    | j                   j                          y r   )r  clearr  s    rR   r!  zIndentedBuffer.clear`  s    rq   c                ,    t        | j                        S r   )r"  r  r  s    rR   __bool__zIndentedBuffer.__bool__c  s    DKK  rq   c                :    d| j                   | j                  z  z  S )Nr  )r	  r  r  s    rR   r^  zIndentedBuffer.prefixf  s    dllT]]233rq   c                &    | j                  d       y )Nr  	writeliner  s    rR   newlinezIndentedBuffer.newlinei  s    trq   c                   t        |t              r| j                  j                  |       y t        |t              r9| j                  j                  |j                  | j                                      y |j                         r.| j                  j                  | j                          |        y | j                  j                  d       y Nr  )rs   r  r  r  r  with_prefixr^  stripr  r  s     rR   r'  zIndentedBuffer.writelinel  s    dK(KKt$./KKt//>?ZZ\KK$++-78KKr"rq   c                4    |D ]  }| j                  |        y r   r&  )r  linesr  s      rR   
writelineszIndentedBuffer.writelinesv  s      	!DNN4 	!rq   c                H     t         j                  d fd       } |       S )Nc               3     K   xj                    z  c_         	 d  xj                    z  c_         y # xj                    z  c_         w xY wwr   r	  )offsetr  s   rR   r  z"IndentedBuffer.indent.<locals>.ctx}  s9     LLF"L'&&s   A4 AAAr   Iterator[None])
contextlibcontextmanager)r  r4  r  s   `` rR   indentzIndentedBuffer.indent|  s$    		"	"	' 
#	' urq   c                .    | xj                   |z  c_         y r   r3  r  r4  s     rR   	do_indentzIndentedBuffer.do_indent      rq   c                .    | xj                   |z  c_         y r   r3  r;  s     rR   do_unindentzIndentedBuffer.do_unindent  r=  rq   c           	        t        |t              rt        d      }|j                  D ]E  }t        |t              r|st        |t        |      t        |j                               z
        }G t        j                  |      rd}|j                  D ]P  }t        |t              r| j                  j                  |       /t        j                  | |t        |      d         R y t        j                  |      }|r|j                         }|sy |j                         }|j!                  d      D ]  }| j                  |        y )Ninfr   r  )rs   r  floatr  r  minrM   r  mathisinfr  r'  r   textwrapdedentrstriprH  )r  
other_coder,  rG  r  r   s         rR   splicezIndentedBuffer.splice  s    j.15\F")) I!$4 TS5G)GHFI zz&!")) HdK0KK&&t,",,T4F3FG	H "4J'..0
#**,J%%d+ "q!"rq   c                    t        | j                        }| j                  D cg c]
  } ||       c}|_        |S c c}w N)r
  )r  r	  r  )r  rR  r   r  s       rR   rx   zIndentedBuffer.map  s4    DLL9-1[[9Td4j9

 :s   >c                @    t        |        d| j                          dS )NrO  rP  )r   r  r  s    rR   __repr__zIndentedBuffer.__repr__  s     t*Qt}}/q11rq   c                    | j                   |j                   k(  sJ t        | j                         }|j                  | j                         |j                  |j                         |S rL  )r	  r  r0  r  )r  otherr   s      rR   __add__zIndentedBuffer.__add__  sK    ||u}},,,DLL9t{{#u||$
rq   c                    || j                   v S r   )r  )r  new_lines     rR   containszIndentedBuffer.contains  s    4;;&&rq   Nr   )r
  r   r   r  )r  r   r   r6  )r   r  r   r   r   r  r   r"  )r  z)Union[LineContext, DeferredLineBase, str]r   r  )r/  z3Sequence[Union[LineContext, DeferredLineBase, str]]r   r  r   )r4  r   r   'contextlib.AbstractContextManager[None])r4  r   r   r  )F)rI  zUnion[IndentedBuffer, str]r,  r"  r   r  )rR  zCallable[[Any], Any]r   r  )rP  r   r   r  )rS  z)Union[DeferredLineBase, LineContext, str]r   r"  )r   r   r   r  r  r7  r8  r  r  r  r  r!  r#  r^  r(  r'  r0  r9  r<  r?  rJ  rx   rN  rQ  rT  r   rq   rR   r  r  %  s    H& ! !9(0(!4#!H!	!	 EJ"4"=A"	"2
2'rq   r  c                  (     e Zd Zd fdZddZ xZS )FakeIndentedBufferc                "    t         |           y r   )superr  )r  	__class__s    rR   r  zFakeIndentedBuffer.__init__  s    rq   c                V    |dk(  rt         j                  | |      S t        d| d      )Nr^  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )r  r   s     rR   ra  z#FakeIndentedBuffer.__getattribute__  s;    ;**466!$ (= =
 	
rq   rW  )r   r   r   r   )r   r   r   r  ra  __classcell__r^  s   @rR   r[  r[    s    
rq   r[  c               #     K   t         j                  t         j                  }} 	 d  | |ct         _        t         _        y # | |ct         _        t         _        w xY wwr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rR   restore_stdout_stderrri    s@     %(ZZNN@!/
CJ
CJs   !AA  A AAc                  P    e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zy
)r  z.A line that can be 'unwritten' at a later timec                6    |j                         sd}|| _        y r*  )r,  r  r-  s     rR   r  zDeferredLineBase.__init__  s    zz|D	rq   c                    t         )zJReturns either self.line or None to indicate the line has been 'unwritten'r  r  s    rR   r  zDeferredLineBase.__call__      !!rq   c                    t         )z3Returns a new deferred line with the same conditionrm  r-  s     rR   	_new_linezDeferredLineBase._new_line  rn  rq   c                @    | j                  | | j                         S r   rp  r  )r  r^  s     rR   r+  zDeferredLineBase.with_prefix  s    ~~455rq   c                T    | j                  | j                  j                               S r   )rp  r  r  r  s    rR   r  zDeferredLineBase.lstrip  s    ~~dii..011rq   c                >    | j                  | j                  |         S r   rr  )r  r   s     rR   r  zDeferredLineBase.__getitem__  s    ~~dii.//rq   c                ,    t        | j                        S r   )r"  r  r  s    rR   r#  zDeferredLineBase.__bool__  s    DIIrq   c                ,    t        | j                        S r   )rM   r  r  s    rR   __len__zDeferredLineBase.__len__  s    499~rq   N)r  r   )r   zUnion[str, None])r  r   r   r   )r^  r   r   r   )r   r   )r   zUnion[int, slice]r   r   rX  r   r   )r   r   r   r   r  r  rp  r+  r  r  r#  rw  r   rq   rR   r  r    s-    8
""620rq   r  c                  4     e Zd ZdZd fdZddZddZ xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`c                @    t         |   |       || _        || _        y r   )r]  r  rK  value_fn)r  rK  r|  r  r^  s       rR   r  zDelayReplaceLine.__init__  s     rq   c                j    | j                   j                  | j                  | j                               S r   )r  replacerK  r|  r  s    rR   r  zDelayReplaceLine.__call__  s#    yy  4==?;;rq   c                D    t        | j                  | j                  |      S r   )rz  rK  r|  r-  s     rR   rp  zDelayReplaceLine._new_line  s    $-->>rq   )rK  r   r|  zCallable[[], str]r  r   rV  )r  r   r   rz  )r   r   r   r   r  r  rp  rb  rc  s   @rR   rz  rz    s    @!
<?rq   rz  c                   t        | t        j                        r| }nt        j                  t               |       }t	        j
                  |      }t        j                  j                  rC|j                  J |j                  dk  s|j                  dk(  rt        j                  d       yy|j                  dk(  rdnd}|j                  }||k  rt        j                  d	||d
       yy)N	   
   z6GPU arch does not support max_autotune_gemm mode usageFTrE   ri   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rs   rK   r   rS   r   createversionhipmajorr   r  r   multi_processor_count)index_or_devicer   propr  r  s        rR   
is_big_gpur    s    /5<<0 lno>""6*D }}zz%%%::>TZZ2-KKPQKK5(bbG**I7:%I> 	 	
 rq   c                     t         j                  j                         r(t         j                  j                         j                  S t         j
                  j                  d      j                  S )NrC   )rK   rE   rL   get_device_propertiesgpu_subslice_countrC   r  r   rq   rR   get_max_num_smsr    sF    yyyy..0CCC::++F3IIIrq   c                     t         j                  j                         syt         j                  j                  t         j                  j	                               } | j
                  dk(  S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rK   rC   rL   r  r  r  )device_propertiess    rR   
using_b200r  %  sJ     ::""$

889R9R9TU""b((rq   c                     t         j                  j                         r
t               S t         j                  j                         } t               | | z
  S dz
  S )zFHandle experimental carveout if set otherwise return hardware SM countr   )rK   rE   rL   r  r   _get_sm_carveout_experimental)carveouts    rR   get_num_smsr  /  sJ     yy  xx557HH,@HHaHHrq   c                    ddl m}m} |
t               }|j	                  d      }|| z  t
        z  } |||| |j                               S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r3   )r4   WorkspaceZeroModeF)r  	zero_moder   
outer_name)codegen.commonr4   r  r  	from_boolTMA_DESCRIPTOR_SIZEunique_name)num_tma_descriptorsr   num_programsr4   r  r  r  s          rR   get_tma_workspace_argr  8  sZ     @"}!++E2I--0CCD+<++-	 rq   c                    | j                   |vr!t        j                  d| j                   |       t        | j                  j
                        xr% | j                   |v xr t        | j                        S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r   r  )r  allowed_layout_dtypess     rR   _use_template_for_gpur  L  sf     ||00		RLL!	
 	v}}!!" 	&LL11	&v}}%rq   c                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w Nr  )upperrd   max_autotune_gemm_backendsrH  r,  backendrO   s     rR   _use_autotune_backendr  \  M    ==?!<<BBDJJ3O	      Ac                    | j                         t        j                  j                         j                  d      D cg c]  }|j	                          c}v S c c}w r  )r  rd   max_autotune_conv_backendsrH  r,  r  s     rR   _use_conv_autotune_backendr  b  r  r  F)enable_int32enable_float8check_max_autotunec                  ddl m}m} t        j                  t        j
                  t        j                  g}|r>t        j                  t        j
                  t        j                  t        j                  g}|r/|j                  t        j                  t        j                  g       t        | j                  j                        xr t        | |      xs) | j                  j                  dk(  xr | j                  |v xrS t         j"                  xs t         j$                  xs | xr* t'        d      xr  || j                  |j(                        S )Nr3   )BackendFeaturehas_backend_featurer   TRITON)r  r  r  rK   r   r2  r4  r<  extendr,  r-  r  r   r   r  r   rd   max_autotunemax_autotune_gemmr  TRITON_TEMPLATES)r  r  r  r  r  r  layout_dtypess          rR   use_triton_templater  h  s    D]]ENNEMMBMu{{Se1153D3DEF v}}))* A)&-@O ""e+M0M
	P   VF$<$<VDV@V
	P "(+
	P  ~/N/NOrq   )
add_guardsc                     ddl m} ddlm d	fdd
 fdd
fd |       xr t	        fd|D              S )u  
    Return True iff *all* supplied tensors satisfy the CUDA-12.9 TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 2 ≤ rank ≤ 5
      * dtype ∈ {FP16, BF16, FP8-E4M3FN}
      * Every logical size ≥ 2
      * Base pointer 16-byte aligned
      * All "outer" dims have 16-byte aligned strides
      * The “inner” dim has stride 1 (contiguous)
      * For FP8 tensors, inner dim ≥ 32
    r   )has_triton_tma_devicer3   rT  c                X    j                   j                  j                  | t              S r   )rX  rY  statically_known_multiple_ofTMA_ALIGNMENT)
expr_bytesrU  s    rR   _alignedzcan_use_tma.<locals>._aligned  s     ww<<ZWWrq   c                   | j                         }| j                         }t        |      }| j                         }|j                  }|dk  s|dkD  ry|t
        j                  t
        j                  t
        j                  fvry| j                         j                  j                  v ryrKj                  j                  j                  |      }j                  j                  j                  |      }nd|D cg c]'  }j                  j                  j                  |      ) }}|D 	cg c]'  }	j                  j                  j                  |	      ) }}	t        fd|D              ryt!        |      D 
	cg c]-  \  }
}	j                  j                  j#                  |	d      r|
/ }}
}	t        |      dk7  ry|d   }t!        |      D ]  \  }
}	|
|k(  r |	|z        r y ||   } ||z        sy|t
        j                  k(  r'j                  j                  j%                  |d      syyc c}w c c}	w c c}	}
w )	NrD  r   Fc              3  l   K   | ]+  }j                   j                  j                  |d         - ywrD  N)rX  rY  statically_known_geq)r   r   rU  s     rR   r   zBcan_use_tma.<locals>._is_tma_compatible_default.<locals>.<genexpr>  s+     P1177##88A>>Ps   14r3   r       T)get_size
get_striderM   	get_dtypeitemsizerK   r   r2  r,  r  rX  unaligned_buffersrY  guard_int_seqsymbolic_hintrc  r   statically_known_equalsr  )rO   sizesstridesrankr   r  sizes_i	strides_ir   str   inner	inner_idx	inner_dimrU  r  r  s                 rR   _is_tma_compatible_defaultz/can_use_tma.<locals>._is_tma_compatible_default  s   

,,.5z>> !8tax 8K8KLL ::<177444gg&&44U;G((66w?IBGHQqww''55a8HGHFMN))77;NIN PPP
 #9-
2ww77A> 
 

 u:?!H	 y) 	EArI~BM*		 I&		H,- E'''0@0@0U0Ur1
 G IN
s   >,H<0,I2Ic                D   | j                         }|D cg c]'  }j                  j                  j                  |      ) }}t	        |      D cg c]-  \  }}j                  j                  j                  |d      r|/ }}}t        |      dk7  ryyc c}w c c}}w )Nr3   FT)r  rX  rY  r  r   r  rM   )rO   r  r  r  r   r  rU  s         rR   _is_tma_compatible_xpuz+can_use_tma.<locals>._is_tma_compatible_xpu  s    ,,.BIJBQWW%%33B7J	J #9-
2ww77A> 
 

 u:? K
s   ,B2Bc              3     K   | ]5  }|j                         xj                  dk7  r |      n |       7 y w)NrE   )
get_devicer   )r   r  r  r  m_devices     rR   r   zcan_use_tma.<locals>.<genexpr>  sK      +  &H/8==E3I 	#1%#A&	'+s   ;>)r  Union[int, sympy.Expr]r   r"  rO   r;   r   r"  )torch.utils._tritonr  rW  rU  rw   )r  matricesr  rU  r  r  r  r  s   `  @@@@@rR   can_use_tmar    sD     :X:x !" s + 	+ ( rq   c                x    t        d |D              xr' t        |d| ixr t        j                  j                  S )Nc              3  T   K   | ]   }t        |j                               d k(   " ywr  )rM   r  )r   r  s     rR   r   z*use_triton_tma_template.<locals>.<genexpr>  s      5qC

"5s   &(r  )rw   r  rd   r  enable_persistent_tma_matmul)r  r  s     rR   use_triton_tma_templater    s9    5H55 	79j9	7MM66rq   c                <   ddl m} |j                  j                  j	                  ||z  |z  d      }|dk  s|t
        j                  j                  k  ryddlm	} t        j                  j                  ryt        j                  t        j                  t        j                  g}t!        | |      xr/ t
        j"                  xs t
        j$                  xr t'        d      }|r6 |       s/t(        j+                  d	t
        j                  j,                         y|S )
Nr3   rT  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)rW  rU  rX  rY  	size_hintrd   rC   cutlass_backend_min_gemm_sizecodegen.cuda.cutlass_utilsr  rK   r  r  r   r2  r<  r  r  r  r  r   r  cutlass_dir)	r  r  r  rx  rU  	gemm_sizer  r  r   s	            rR   use_cutlass_templater    s      **1q519r*BIA~V[[%N%NN> }} ]]ENNEKK@Mfm4 	-  <F$<$<	-!),  !#KK4 ''	 Jrq   c                    t         j                  j                  j                         }|dk(  ry| j                         |j	                  d      D cg c]  }|j                          c}v S c c}w )z8Check if CUTLASS should be used for the given operation.ALLTr  )rd   rC   cutlass_enabled_opsr  rH  r,  )op_nameenabled_opsrO   s      rR   _use_cutlass_for_opr    sU    ++11779Ke==?+2C2CC2HIQqwwyIIIIs   A,r   _IntLikec           
        ddl m} t        j                  j                  }t
        j                  j                   xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                   S )Nr   rT  )torch._inductor.virtualizedrU  rd   r  decompose_k_thresholdrK   r  r  rX  rY  statically_known_truert   AndGeaot_modecpp_wrapper)r  r  rx  rU  r  s        rR   use_decompose_k_choicer	     s    -"MM?? MM 	$GG22II1A561A56
	$    	$ ###
rq   c           
        t         j                  j                  }ddlm} t        t        j                  j                        xr |j                  j                  j                  t        j                  t        j                  ||| z        t        j                  |||z                    xr0 |j                  j                   xr |j                  j                    S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   rT  )rd   rocmcontiguous_thresholdr  rU  r"  rK   r  r  rX  rY  r  rt   r  r  r  r  )r  r  rx  r  rU  s        rR   use_contiguousr  3  s     ";;;; . 	U]] 	$GG22II01450145
	$    	$ ###
rq   c                   t         j                  j                  }g d}t        |t        j
                        r|j                  s|S |dk(  rg S t        | t        j
                        r| j                  r&t        |t        j
                        r|j                  sd}nt        || z  ||z        }d}t	        j                  |      }|D cg c]  }||k  r||k\  r| }}g g g }}
}	|D ]Z  }||z  }|dk  r||dz
  z  dk(  r|dk\  r|	j                  |       0|dz  dk(  r|
j                  |       J|j                  |       \ t         j                  dk(  r|	|
z   |z   S |	|
z   |z   }|d | S c c}w )	N)ri   r  rk   rj      r   r  rD  rj   r3   r  
EXHAUSTIVE)rd   r  num_decompose_k_splitsrs   rt   r  	is_numberrC  divisorsr  max_autotune_gemm_search_space)r  r  rx  k_splits_limitdefault_k_splitsmax_k_splitmin_k_splitr  divisorpow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitss                  rR   get_k_splitsr   K  s    ]]99N .!UZZ 	1		1ejj!!++1ejj!!++!q&!q&)K~~a H  k!g&< 	H  =?B>) %Q 3; EAI!#$$Q'RZ1_%%a( !!!$%" ,,< #55FF#&88>IK''=s   
E,c                T    t         j                  j                  |       j                  S r   )rK   rC   r  gcnArchNamer   s    rR   _rocm_native_device_arch_namer$    s    ::++F3???rq   c                     	 dd l } ddlm}m} ddlm} t        j                  j                  | j                        }||||fS # t        $ r dd}dd} G d d      }d }Y %w xY w)	Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationc                     g S r   r   r   rq   rR   r&  z*try_import_ck_lib.<locals>.gen_ops_library      Irq   c                     g S r   r   r   rq   rR   r'  z.try_import_ck_lib.<locals>.gen_ops_preselected  r*  rq   c                      e Zd Zy)*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   r   rq   rR   r(  r-    s    rq   r(  )r   r|  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr&  r'  ck4inductor.universal_gemm.opr(  r  r  dirname__file__r   )r.  r&  r'  r(  package_dirnames        rR   try_import_ck_libr4    sl    	
	
 ''//+*>*>? O-@/QQ  			 	 s   ;A A#"A#c                   t         j                  st         j                  syt        j                  j
                  sy| j                  j                  dk(  syt        | j                        }t         j                  j                  D ci c]  }|j                  d      d   | c}xs |j                  d      d   |i}|j                         t         j                  j                  z  D cg c]  }||   	 }}|sy| j                  t        j                  t        j                   t        j"                  fvryt%               \  }}}}|st&        j)                  d       yt        j*                         r|t         j                  _        t         j                  j,                  st&        j)                  d       y|t         j                  j,                  k7  rt&        j)                  d       yyc c}w c c}w )	NFrC   :r   z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)rd   r  r  rK   r  r  r   r   r$  r  archrH  r  ck_supported_archr   r   r2  r4  r4  r   r  	is_fbcodeck_dir)r  native_archrx  requested_archsrequested_supported_archsck_package_dirnamer   s          rR   use_ck_templater?    s   6#;#;====' 0>K39;;3C3CDaqwws|A)D #q!;IO
 !%%'&++*G*GG! 	! ! %||EMM5>>5==II"3"51aBC/;;BCV[[///01= E!s   G-,G2c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr3   rT  CKr   r  r   rW  rU  r  r?  rX  rY  r  r  r  r  rx  rU  s        rR   use_ck_gemm_templaterD    sR     	d# 	CF#	CGG&&q1uqy2&>Brq   c                    ddl m} t        d      xr= t        |       xr0 |j                  j
                  j                  ||z  |z  d      dkD  S )Nr3   rT  CKTILEr   r  r   rB  rC  s        rR   use_ck_tile_gemm_templaterG    sR     	h' 	CF#	CGG&&q1uqy2&>Brq   c                2    t        d      xr t        |       S )NrA  )r  r?  r  s    rR   use_ck_conv_templaterJ    s    %d+G0GGrq   c                |    t         j                  xs t         j                  xr | j                  j                  dk(  S r  )rd   r  r  r   r   rI  s    rR   _use_template_for_cpurL    s2    7v77&
--


%&rq   c                    ddl m} t        |j                  |      sJ t	        | ||d      xr |j                  j                         S )Nr3   )r<   F)require_constant_mat2)r  r<   rs   r  use_cpp_gemm_templateis_contiguous)r  mat1mat2r<   s       rR   use_cpp_bmm_templaterS    sE     dkk6*** 	fdDN 	(KK%%'rq   c                `   ddl m} ddlm} ddlm}	 ddlm}
 t        |       rt        d      syt        j                  j                  sy|j                         t        j                  t        j                   fv }t        j"                  t        j$                  t        j&                  t        j                  g} |
|||r| j(                  nd ||      \  }}}} }}t+        ||f      ryt-        ||j.                        r|j1                         } |	|j                               \  }} |d	||||j                         |j                         |t3               | |

      }dd}| j(                  |v xr= |d uxr7  ||      xr- t-        ||j4                        xr |j7                         xs | S )Nr3   r  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtypemat2_transposeduse_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refq_group_sizec                N    | j                          | j                         d   dk(  S )Nr   r3   )freeze_layoutr  rO   s    rR   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride13  s"    	||~b!Q&&rq   r  )r  r  codegen.cpp_micro_gemmrU  codegen.cpp_utilsrV  kernel.mm_commonrW  rL  r  rd   cppweight_prepackr  rK   rC  r8  r4  r2  halfr   has_free_symbolsrs   BaseViewunwrap_viewparallel_num_threadsr  is_module_buffer)r  rQ  rR  rZ  rN  is_woq_int4rb  r  rU  rV  rW  	int8_gemmr  r  r  rx  r_  r   r\  rf  s                       rR   rO  rO     s    9M) (0Ee0L::$$ U[[%**$==I]]ENNEJJLM")"+&,,'#Aq!VT4 A$$!@AQROL!"			NN$^^%!(*!J'
 	% 	Cd"	C%	C tR]]+	C ""$A,A(Arq   c                 b    t         j                  xs t         j                   xs t        d      S )NATEN)rd   r  r  r  r   rq   rR   use_aten_gemm_kernelsrv  @  s-    7v77 '	v	&'rq   c                  T    e Zd ZU  ej                  d      Zded<   ddZddZd	dZ	y)
DebugDirManagerr   r   prev_debug_namec                @    t        t        j                        | _        y r   )r  rx  counterr   r  s    rR   r  zDebugDirManager.__init__J  s    ../rq   c                    t         j                  j                  j                  | _        | j                   d| j
                   | _        | j                  t         j                  j                  _        y )N_tmp_)rK   _dynamord   debug_dir_rootry  r   new_namer  s    rR   	__enter__zDebugDirManager.__enter__M  sM    $}}33BB//0dggY?.2mm+rq   c                    t        j                  | j                         | j                  t        j
                  j                  _        y r   )r  r  r  ry  rK   r~  rd   r  )r  rz   s     rR   __exit__zDebugDirManager.__exit__R  s*    dmm$.2.B.B+rq   NrW  )rz   r   r   r  )
r   r   r   r:  r  r{  r   r  r  r  r   rq   rR   rx  rx  F  s(    iooa G0<
Crq   rx  c                    ddl m} g dfd}t        j                  j	                  |d|      5  t
        j                  j                           | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr3   r7   c                (    j                  |        y r   r  codesource_codess    rR   save_output_codez*run_and_get_code.<locals>.save_output_code`      D!rq   r  r  r   r   r  rX  r8   r   r  r`  rK   r~  reset)r   rz   r  r8   r  r  r  s         @rR   run_and_get_coder  W  su    
 % L" 
		=*<>N	O %T$V$% <% <s   'A$$A0c                    t        | g|i |\  }}g }|D ]6  }|j                  t        j                  d|t        j                               8 ||fS )Nz	'''.*?''')r  r  r   findallDOTALL)r   rz   r  r  r  kernelsr  s          rR   run_and_get_kernelsr  i  sZ     ,B@@@FLG Brzz,bii@AB7?rq   c                &     d fd}t        |      S )Nc                 R            } | j                         j                          | S r   )r   backward)r  r   s    rR   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backwardt  s!    

rq   )r   r   )r  )r   r  s   ` rR   run_fw_bw_and_get_coder  s  s    
 -..rq   c                X   ddl m} g dfdd	fd}t        j                  j	                  |d|      5  t        j                  j	                  |d      5  t
        j                  j                           | |i |}ddd       ddd       S # 1 sw Y   xY w# 1 sw Y   S xY w)
zLGet the inductor-generated code, but skip any actual compilation or running.r3   r7   c                (    j                  |        y r   r  r  s    rR   r  z"get_code.<locals>.save_output_code  r  rq   c                     G d d      }| j                   r| j                         n| j                         \  }} |j                         |r |j                          |       S )Nc                       e Zd ZdZddZddZy)@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulec                     y r   r   r  s    rR   r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__  s    rq   c                     y r   r   r  s      rR   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call  s    rq   NrW  rz   r   r  r   r   r  )r   r   r   r   r  r  r   rq   rR   DummyModuler    s    Frq   r  )r  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coder  s       rR   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module  s]    	 	 04/?/?D))+T\\^ 	"k 	++,[../}rq   compile_to_moduler  Nr  )r  r8   r   r   r  )r   rz   r  r8   r  r   r  r  s         @@rR   get_coder  |  s    $ L", 	

.0I	
  	

-);=MN	  	          s#   "B'BBB	BB)c                |    t        | g|i |}dt        |      cxk  rdk  sn J dt        |              |d   S Nr3   rD  z%expected one or two code outputs got r   )r  rM   )r   rz   r  r  s       rR   get_triton_coder    sQ    B000LL!&Q& 
/L0A/BC& ?rq   c                    t        | g|i |\  }}dt        |      cxk  rdk  sn J dt        |              |d   S r  )r  rM   )r   rz   r  r   r  s        rR   run_and_get_triton_coder    sW     'r;D;F;OA|L!&Q& 
/L0A/BC& ?rq   c                    ddl m ddlm} |j                  g dfd}t
        j                  j                  |d|      5   | |i |}d d d        |fS # 1 sw Y   fS xY w)Nr   r7   r?   c                 ^     | i | | d   }t        |      sJ j                  |       y )NrD  )rs   r  )rz   r  rX  r8   graph_lowerings	real_inits      rR   	fake_initz-run_and_get_graph_lowering.<locals>.fake_init  s7    4"6"Q%///u%rq   r  r  )torch._inductor.graphr8   torch._inductor.output_coder@   r  r   r  r`  )	r   rz   r  r@   r  r  r8   r  r  s	         @@@rR   run_and_get_graph_loweringr    sq     4;((IO& 
		?J		B %T$V$% ?""% ?""s   	AA(c              #     K   ddl m} |j                  |    }	 t        j                  ||      |j                  | <   d ||j                  | <   y# ||j                  | <   w xY ww)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr  partial)aten_opoverride_fnr  orig_fns       rR   override_loweringr    s`      )  )G.&/&7&7W&M7#&-7#g7#s   A$'A  A$A!!A$c                     ddl m} |j                  d fd}t        j                  j
                  j                  |d|      S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerc                B     | |        | |      }r	 | |       |S r   r   )r  r  outr  post_fnpre_fns      rR   r  z(add_scheduler_init_hook.<locals>.wrapper  s+    y% i'Iu%
rq   r  )r  r   r  r   r   r   )torch._inductor.schedulerr  r  unittestr   r  r`  )r  r  r  r  r  s   ``  @rR   add_scheduler_init_hookr    s9     4  G ==%%iWEErq   c                z    t         j                  rt        j                  |        yt        j	                  |        y)z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rd   developer_warningsr   r  info)msgs    rR   developer_warningr    s$       Crq   c                    	 t         j                  j                  d      } | dz   t        t         j                        k  rTt        t         j                  | dz            dkD  r2t         j                  | dz      d   dk7  rt         j                  | dz      S t         j                  D ]#  }|j                  d      s|t        d      d c S  y# t        $ r Y Bw xY w)a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr3   r   rL  z--only=N)r  argvr   rM   
ValueErrorr  )r   r  s     rR   get_benchmark_namer  	  s    	hhnnX&!Gc#((m#CHHS1W%&*q!!$+88C!G$$ xx )>>)$s9~'(()   s   BC 	CCc                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    ywr3   Nr   r~  s     rR   r   zis_ones.<locals>.<genexpr>%	       %!qAv%   rw   r  s    rR   is_onesr  $	      %u%%%rq   c                &    t        d | D              S )Nc              3  &   K   | ]	  }|d k(    yw)r   Nr   r~  s     rR   r   zis_zeros.<locals>.<genexpr>)	  r  r  r  r  s    rR   is_zerosr  (	  r  rq   c                &    t        d | D              S )Nc              3     K   | ]@  }t        |t        j                        r$|j                  t        j                  d       k(   B yw)r   N)rs   rK   r  r   )r   r   s     rR   r   z is_cpu_device.<locals>.<genexpr>-	  s8      dELL) 	u||E**s   AAr  )inputss    rR   is_cpu_devicer  ,	  s       rq   c                    t        | t        j                        sJ d       | j                  rt        j
                  S t        j                  S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rs   rt   r  r   rK   r>  r6  )r=  s    rR   get_sympy_Expr_dtyper  4	  s=    c5::& B& ~~{{}}rq   c              /     K   | r-t        j                  j                  |i |5 }| d d d        y d  y # 1 sw Y   y xY wwr   )rK   r   r   )should_profilerz   r  r   s       rR   maybe_profiler  >	  sE     ^^##T4V4 	G	 	 		 	s   "A7AA Ac                 l    t         j                  j                  } | dk  rt        j                         } | S Nr3   )rd   rj  threadsrK   get_num_threads)r  s    rR   rp  rp  G	  s+    jj  G{'')Nrq   c                     ddl m}   |        }|j                  dt        j                  j
                  rd      S d      S )Nr3   )get_backend_options
num_stagesrD     )runtime.triton_helpersr  r  rK   r  r  )r  optionss     rR   get_backend_num_stagesr  N	  s2    ;!#G;;|%--*;*;QCCCCrq   c                   t        | t        j                  j                  j                  j
                        }||S ddlm}m} t        j                  j                         xr! t        j                  j                         dk\  }| t        j                  t        j                  t        j                  fv sJ t        j                  |      j                   j#                  d      rddlm}  |       }| t        j                  t        j                  fv r|r	 || |      S t        j                  j                  j                  j
                  r |t        j                  |      S  |t        j                  |      S | t        j                  t        j                  fv r
|r ||       S t        j                  j                  j                  j
                  r |t        j                        S  |t        j                        S )z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    )is_tf32r   )get_max_simd_tflopsget_max_tensorcore_tflops)rl   r   
clock_rate)max_clock_rate)r   rK   backendsrC   matmul
allow_tf32triton.testingr  r  rL   get_device_capabilityr   r2  r4  inspect	signature
parametersr  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clocks          rR   get_device_tflopsr  V	  sk    UENN,?,?,F,F,Q,QRGM**))+ 

0P0P0R W 1K
 U]]ENNEMMBBBB,-88<<\J8!#U]]ENN33,UH==>>%%00,U]]HEE&u}}h??U]]ENN33,U33>>%%00,U]];;&u}}55rq   c                     ddl m}   |        S )Nr   get_dram_gbps)r  r  r  s    rR   get_gpu_dram_gbpsr	  	  s    ,?rq   c                 x    ddl m}  | j                  j                  j	                  d      j                  dd      S )Nr   r  max_shared_mem)triton.runtimer  r  r  r  r  r  s    rR   get_gpu_shared_memoryr  	  s.    %==44Q7;;<LaPPrq   c                $    | j                  d      S )Nwelford)r  reduction_types    rR   is_welford_reductionr  	  s    $$Y//rq   c                (    t        |       ry| dk(  ryy)Nr  online_softmax_reducerD  r3   )r  r  s    rR   reduction_num_outputsr  	  s    N+	2	2rq   c                 0    t        j                         dk(  S )NLinux)platformsystemr   rq   rR   is_linuxr  	  s    ??''rq   c                 (    t         j                  dk(  S )Nrf   )r  r  r   rq   rR   r  r  	  s    <<7""rq   c                &    t        d | D              S )Nc              3  n   K   | ]-  }t        |t        j                        xr |j                    / y wr   )rs   rt   r  r  r~  s     rR   r   z#has_free_symbols.<locals>.<genexpr>	  s)     Jz!UZZ(<_<Js   35r  )itrs    rR   rm  rm  	  s    JcJJJrq   c            	     x   ddl m} | D ]  }t        ||j                  |j                  |j
                  |j                  |j                  f      r=t        |j                         xs d      st        |j                         xs d      s yt        ||j                        st        dt        |              y)Nr3   r  r   Tzunexpected type for is_dynamic F)r  r  rs   r  r  rn  r5  r9   rm  maybe_get_sizemaybe_get_strider;   	TypeErrorr   )rz   r  ts      rR   
is_dynamicr%  	  s     IbmmR[[":K:KRYYW
   0 0 2 8b9=M""$*> Aryy)=d1gYGHHI rq   c                      e Zd ZdZdZy)PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r(  r)  r   rq   rR   r'  r'  	  s      K *rq   r'  c                   ddl m} t        j                  ddd      5 }t	        j
                         }t	        j
                         } t        |t        |            j                  |  t        d|j                   |	       t        |j                  |	       t        j                         }t        ||      5   | |j                         d d d        t        j                         |z
  }	 ||j                         |j                  j                          |j                          t        d
|j                   |	       t        |j                  |	       |j!                         |j!                         k(  }
t"        j%                  d||j&                  |
|	       d d d        y # 1 sw Y   xY w# 1 sw Y   y xY w)Nr3   )stable_topological_sortr*  zutf-8F)modeencodingr  )r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr+  r  NamedTemporaryFileior   rY   rU   	propagater  rX  r
   nowrX   lint	recompiler  r   r  r   )rR  r  inpr  r+  r  	before_ioafter_io
start_timetime_elapsedr$  s              rR   pass_execution_and_saver<  	  sX    9		$	$
 
 
KKM	;;=C	R#3C#89CCSI	"(($1-bhhY'\\^
#B, 	N	||~
2)


#!,bhhX& H$5$5$77hFF	
-
 
	 	
 
s%   BF4<F(CF4(F1	-F44F=c                ~    ddl m} t        | |j                        xr  t        | j                  |j
                        S )zB
    Check if input buffer is a multi-outputs template buffer
    r3   r  )r  r  rs   CppTemplateBufferr  MultiOutputLayout	input_bufr  s     rR   is_multi_outputs_templaterB  	  s9     i!5!56 :"..< rq   c                    ddl m} t        | |j                        xr2 t	        | j
                        dk(  xr t        | j
                  d         S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r3   r  r   )r  r  rs   MultiOutputrM   r  rB  r@  s     rR   #is_output_of_multi_outputs_templaterE  	  sL      	9bnn- 	;	  !Q&	;%i&6&6q&9:rq   c                (   | yddl m} t        | |j                        xr- t        | |j                         xr |d u xs | j
                  |u xsB t        |       |j                  k(  xr' t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xs t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  xsa t        t        j                  j                  d      xr; | j
                  t        j                  j                  j                  j                  k(  S )NFr3   r  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  rs   _CollectiveKernel_WaitKernelop_overloadr   FallbackKernelr   rK   r   torchrecrG  defaultrH  rI  r  rg  r  s      rR   is_collectiverQ  
  sG    | 	4--. 	3400	34Z14++r1 	T
b''' 	

 		**,?@ U$$		(:(:(L(L(T(TT
 		**,DE E$$99%%<<DDE 		**,CD Y$$		(:(:(P(P(X(XX/rq   c                >    ddl m} t        |       |j                  k(  S Nr3   r  )r  r  r   rK  r  r  s     rR   is_waitrU  '
  s    :''rq   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wr   )contains_collectiver~  s     rR   r   z&contains_collective.<locals>.<genexpr>1
  s     @a&q)@r  )r  rX  rs   rc  snodesrQ  r  snoderX  s     rR   rZ  rZ  -
  s4    >%-.@5<<@@@$$rq   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   rW  c              3  2   K   | ]  }t        |        y wr   )contains_waitr~  s     rR   r   z contains_wait.<locals>.<genexpr>:
  s     :=#:r  )r  rX  rs   rc  r[  rU  r  r\  s     rR   r`  r`  6
  s4    >%-.:U\\:::uzz""rq   c                    ddl m} t        |t        j                  j
                        r|g}t        | |j                        xr | j                  |v S rS  )r  r  rs   rK   rt  ru  rM  rL  rP  s      rR   is_fallback_oprb  ?
  sE     "ejj++,TdB--.I43C3Cr3IIrq   c                B    |||    j                   j                            S r   )defining_opr  )buf_namename_to_bufname_to_fused_nodes      rR   buf_name_to_fused_snoderh  J
  s#     k(3??HHJKKrq   c                     yrj  r   r]  s    rR   rk  rk  U
  rl  rq   c                     ||       ry |j                  |        | j                  D ].  }t        |j                  ||      }||v rt	        |||||       0 y )Ncriteria_cb)r  unmet_dependenciesrh  r   find_recursive_deps_of_node)r]  collected_node_setrf  rg  rm  depdefining_op_for_deps          rR   ro  ro  P
  sn     55!'' 
5HHk#5
 "44##	

rq   c                     yrj  r   rj  s    rR   rk  rk  n
  rl  rq   c           	     z    ||       ry |j                  |        | j                         D ]  }|j                  D ]}  }|j                  J |j                  j	                         dk(  r/|j                  j	                         |vrL||j                  j	                            }||v rnt        |||||         y )NOUTPUTrl  )r  get_outputsrw  r  r  find_recursive_users_of_node)r]  rp  rf  rg  rm  or1  user_ops           rR   rw  rw  i
  s     55!  GG 	D99(((yy!!#x/yy!!#+==(););)=>G,,(""'	rq   c                b    t         j                  j                  j                  rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)rD  r   )rK   
_functorchrd   functionalize_rng_ops)dynamo_gm_num_inputsaot_fw_gm_num_inputsnum_rng_seed_offset_inputss      rR   num_fw_fixed_argumentsr  
  s6     $$::   "669SSSrq   c                    dd}d}g }| j                   j                  D ]0  }|j                  dk(  s ||      r|j                  |       |dz  }2 |t	        t        t        |                  k(  sJ t        |      S )z>
    Infers which inputs are static for a backwards graph
    c                ~    d| j                   vxr. d| j                   vxr d| j                   vxr d| j                   vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_staterm  re  s    rR   is_saved_tensorz'count_tangents.<locals>.is_saved_tensor
  sH    aff$ .!&&(.!/.  qvv-		
rq   r   r  r3   )rO   r2   r   r"  )rX  r  rg  r  rJ  r   rM   )fx_gr  	arg_countstatic_arg_idxsr  s        rR   count_tangentsr  
  s    

 IOZZ 44= q!&&y1NI	 d5_)=#>????rq   c                  2    e Zd ZU ded<   ddZedd       Zy)	BoxedBoolr"  r   c                    | j                   S r   )r   r  s    rR   r#  zBoxedBool.__bool__
  s    zzrq   c                6    t        | t              r	d| _        | S yrj  )rs   r  r   r  s    rR   disablezBoxedBool.disable
  s    c9%CIJrq   NrX  )r  r   r   zUnion[BoxedBool, bool])r   r   r   r   r#  r  r  r   rq   rR   r  r  
  s     K  rq   r  c              #      K   ddl m} |j                  	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fd}t        j                  j                  |d|      5  d  d d d        y # 1 sw Y   y xY ww)Nr3   r5   c                @    j                  |        | |||||      S r   r  )r  kernel_namer  r#  gpucpp_definitionkernel_listorig_define_kernels         rR   define_kernelz.collect_defined_kernels.<locals>.define_kernel
  s-     	;'!+{Hc>
 	
rq   r  )NTN)r  r6   r  r   r  r   r#  Optional[str]r  r"  r  r  r   r   )codegen.wrapperr6   r  r   r  r`  )r  r6   r  r  s   `  @rR   collect_defined_kernelsr  
  s     5-;; #'(,
"

 
  	

 
 &
 

 
		/-	P   s   AA*A	A*A'#A*c                    | dz   S )N__original__r   rm  s    rR    get_cloned_parameter_buffer_namer  
  s    .  rq   c                    | t         v S r   )rI   r#  s    rR   r  r  
  s    Yrq   c                &    | dk7  xr t        |       S )NrD   )r  r#  s    rR   device_need_guardr  
  s    U?-vf~-rq   c                d   t        j                         rc| t        j                  k(  rPt        j                  j                         r2t        j                  j                         dk\  rt         j                  ry| t        t        j                  t        j                  t        j                  g      v S )N)r  r   F)rd   r9  rK   r2  rC   rL   r  bfloat16_atomic_adds_enabledr!   r>  r"  r  s    rR   ,needs_fallback_due_to_atomic_add_limitationsr  
  sp    
 	U^^#JJ##%JJ,,.&8//
EKKU^^#LMMMrq   c                   | j                   t        j                  j                  j                  t        j                  j                  j
                  fv r|y| j                   t        j                  j                  j                  k(  rdnd}|d |fvxs |xr t        |      xr t        |      xs | j                   t        j                  j                  j                  k(  xrW |dk(  xrP |xrL |dk(  xrE t        j                  j                  xr) t        j                  j                  xs t               dk7  xs? ||k(  xr" |t        j                  t        j                  fv xs t        j                          S )NFr  r   r   r3   )overloadpacketrK   r   atenscatter_reduce_scatter_reducescatter_r  r  rd   rj  fallback_scatter_reduce_sumdynamic_threadsrp  r"  r>  r  )rL  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensor	reduce_tys          rR   use_scatter_fallbackr  
  sZ    	""IINN**EIINN,I,IJ	K" ++uyy~~/F/FFE 
 	tY// 	8 H'H<YG		8 &&%))..*H*HH L%'LL  5(L 

66	L
 ++J/C/E/J	8 i'SJ5::u{{:S,S	8 557!rq   c                   ddl m}m} ddlm} t        dt        |        d       t        |       D ]  \  }}t        d|dd       ||u rt        d	       '||u rt        d
       7t        ||      r|j                         }t        |rdnd d       |r:|j                  J t        d|j                  j                  j                          t        d       |j                  j                  D ]  }t        |        t        d       |j                  j                  D ]  }t        |        t!        dt#        |              y)z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r6  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  rM   r   rs   is_reductionr  r  reduction_hintr  r  r  r   r   )r  r  r  r  r   r  is_redrq  s           rR   dump_node_scheduler    s=   
 O7	M 236
:;}- H	T#al?"$%%%%&m,&&(FfU$/?@yy,,,01N1N0OPQ*''-- c
+''.. c
 !9$t*FGG'Hrq   c                z    ddl m}  || j                         t        | j                        z  t
        z  dk(        S )Nr   )r  )r  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  s     rR   tensor_is_alignedr  2  s:     L 				 >&,,#?	??RVWW rq   c                |    t        | j                  j                        syt        j                  xs t        |       S rj  )r  r   r   rd   assume_aligned_inputsr  )example_inputs    rR   should_assume_input_alignedr  @  s2     -&&++,''K+<]+KKrq   c                 6   t         j                  j                  j                         } | st	        j
                         S | j                  r| j                  j                  st	        j
                         S | j                  j                  }|j                         S r   )	rK   _guardsTracingContexttry_getr7  nullcontextr.  rZ  suppress_guards)tracing_contextrZ  s     rR   #maybe_get_suppress_shape_guards_ctxr  I  sv    
 mm22::<O%%'' $$O,E,E,O,O%%''))33I$$&&rq   c                   t         j                  j                  j                  t        dd      5  t
        j                  j                          dd l}dd l	} |j                         } |j                  |      }ddlm} |j                  |       |j                  }|j!                  |j"                          | |i |}	|j%                         }
|j!                  |       |j'                  |       d d d        |	|
fS # 1 sw Y   	
fS xY w)Nr   Tr   )output_code_log)r  r   r  r`  rd   rK   r~  r  r2  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr  removeHandler)r   rz   r  r2  r  log_capture_stringchr  
prev_levelr  r   s              rR   run_and_get_cpp_coder  Y  s     
			#	#FGT	: *(R[[]"W""#56=""2&$**
  /T$V$'')  ,%%b)*  19!*  19s   CC>>D
c                    t        |       }||j                  S | D ]4  }t        |t        j                        s|j
                  j                  c S  y r   )rU   rZ  rs   rK   r/   r  )r  r.  inputs      rR   shape_env_from_inputsr  r  sT     (I """  (eU\\*::'''(
 rq   c                <     t              dk(  r S d fd}|S )Nr   c                z    t        |       \  }} |       }t        |      rt        j                  ||       |S r   )copy_misaligned_inputsrM   rK   _foreach_copy_)
new_inputsold_tensorsnew_tensorsr  inputs_to_checkr  mutated_input_idxss       rR   r  z)align_inputs_from_check_idxs.<locals>.run  sE    #9);$
 [ J {  k:
rq   )r  list[InputType]r   r   )rM   )r  r  r  r  s   ``` rR   align_inputs_from_check_idxsr    s#    
 ?q  Jrq   c                T   d| j                         v rd}n;t        d t        | j                         | j                               D              dz   }t	        j
                  | |fd      j                         }t	        j
                  || j                         | j                               S )Nr   c              3  2   K   | ]  \  }}|d z
  |z    ywr  r   )r   r  r  s      rR   r   z)clone_preserve_strides.<locals>.<genexpr>  s     Tf$Tr  r3   r   )r  r   r   r  rK   
as_stridedclone)rO   needed_sizer  s      rR   clone_preserve_stridesr    s    AFFH} T#affh
:STTWXX 	 a+6<<>FFAFFHahhj99rq   c                2   g }g }|du}|D ]  }| |   }t        |t        j                        sJ dt        |              |j	                         t
        z  sMt        |      | |<   |s^||v sc|j                  |       |j                  | |           ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rs   rK   r  r   data_ptr	ALIGNMENTr  r  )r  check_inputs_idxsreturn_pair_idxsr  r  ret_pair_definedr   _inps           rR   r  r    s     ')K&(K (t3 
2!}$- 	
.tDzl;	
- ==?Y&248JqMA)9$9""4("":a=1
2 ##rq   c                    g }|D ]N  }| |   }t        |t        j                        s#|j                         t        z  dk(  s>|j                  |       P t        |      t        |      k7  r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rs   rK   r  r  r  r  rM   )r  static_input_idxsaligned_static_input_idxsr   r  s        rR   remove_unaligned_input_idxsr    st     !#  2seU\\*0@90LQR/R%,,S12 $%->)??((rq   c                   ddl m} t        j                  t        j                        j
                  }|j                  j                  j                  }|j                  j                  j                  j                  }|j                  j                  j                  | |k        ry|j                  r)|j                  j                  j                  | dk        ry ||       xr  ||       |k  S )Nr3   rT  Tg@xDF)rW  rU  rK   iinfor<  r   rX  rY  r  rZ  has_hintr  aot_compilation)r   rU  int_maxr  r  s        rR   expr_fits_within_32bitr    s    kk%++&**G  **Iww))22H 	ww--a7l; 	 7711!d(;  A;29Q<722rq   c                   t         j                  j                  j                         }||j                  t        |j                        dk(  sJ t        |       |j                  J |j                  D ]  }||j                  j                  d        !dt         j                  j                  j                         x}r|j                  dfd|j                  j                  t        fd|D                      y y y )Nr   Fc                f    t        |       S rj                  |       S j                  |       S r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callrZ  s    rR   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  s7     ("1v((<<Q??$55a88rq   c              3  .   K   | ]  } |        y wr   r   )r   r   r  s     rR   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s     5!(1+5s   )r   r   r   z,Union[float, int, SymInt, SymFloat, SymBool])
rK   r  r  r  output_stridesrM   r  r  r  r  )r  compiled_graphr  r  r  r  r  rZ  s        @@@rR   "set_tracing_context_output_stridesr    s     mm**224Gw55A7))*a///).9	,,888#22 	E}&&--d3$)!--66>>@@3@(+(=(=%9 &&--5u55		  Brq   c                    t         j                  t         j                  S t        j                         syt        j                  j                         ry	 ddlm}  | t        j                  j                  d      k\  S # t        $ r Y yw xY w)NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rd   fx_graph_remote_cacher9  rK   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rR    should_use_remote_fx_graph_cacher    s    ##/+++,,.H  5#8#8#M#M8$    s   A> >	B
	B
c                0    t        j                  dd|       S )Nz[^a-zA-Z0-9_]r   )r   subrm  s    rR   normalize_namer  -  s    66"C..rq   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2z^.*[.]c                l    t         j                  dt        |             }t        j	                  ||      S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r   _triton_type_mappingr  )r   triton_type_names     rR   triton_typer  C  s.    &**5#e*=##$46FGGrq   c                    t         j                  | |       }|j                  dd      }t        t        |      }t        |t        j                        sJ |S )Nr  r  )_torch_triton_mappingr  r~  rJ   rK   rs   r   )r   adjusted_type	type_namerY  s       rR   triton_type_to_torchr#  I  sL    )--eU;M%%eR0Iy)Ii---rq   c                   | j                    xr | j                         |j                         k(  xr | j                         |j                         k(  xr | j                  |j                  k(  xr{ | j                  |j                  k(  xr` | j                         j                         |j                         j                         k(  xr! | j                         |j                         k(  S r   )	is_mkldnnr  r  r   r   untyped_storager  r  r  r   s     rR   is_same_tensorr(  Q  s    NN 	<IIK5::<'	<KKMU\\^+	< JJ%++%	< KK5<<'		<
   "++-1F1F1H1Q1Q1SS	< !U%9%9%;;rq   c                v   | j                   xr | j                         |j                         k(  xr | j                  |j                  k(  xrn | j                  |j                  k(  xrS t        j
                  j                  j                  |       t        j
                  j                  j                  |      k(  S r   )r%  r  r   r   rK   r   mkldnnr  r'  s     rR   is_same_mkldnn_tensorr+  ]  s     	PIIK5::<'	PJJ%++%	P KK5<<'	P II%%d+uyy/?/?/H/H/OOrq   c                      y)N)rE  isnanlogical_notlogical_andsignbitand_leltgegteqner  xorr   r   rq   rR   boolean_opsr9  g  s    rq   c                  "    e Zd ZU ded<   ded<   y)OpDtypeRuler0   type_promotion_kindOptional[torch.dtype]override_return_dtypeNr   r   rq   rR   r;  r;  {  s    8800rq   r;  zdict[str, OpDtypeRule]op_dtype_propagation_rulesc                *    t        ||      t        | <   y r   )r;  r?  )r   r<  r>  s      rR   #register_op_dtype_propagation_rulesrA    s    
 (32(t$rq   zOrderedSet[str]op_requires_libdevice_fp64c                .    t         j                  |        y r   )rB  r  rm  s    rR   #register_op_requires_libdevice_fp64rD    s    ""4(rq   c                     ddl m}  | j                  j                         j                  }|dk(  rt
        j                  S |dk(  ryt
        j                  S )Nr   rT  r   rD   )r  rU  rX  get_current_device_or_throwr   rd   cpu_backendcuda_backend)rU  
device_strs     rR   get_current_backendrJ    sH    -446;;JU!!!	u	"""rq   c                    | t         j                  t         j                  fv r7t        j                  j
                  rt               dk(  rt         j                  S | S )z"Maybe upcast [b]float16 to float32r  )rK   r   r2  rd   r  codegen_upcast_to_fp32rJ  r4  r  s    rR   upcast_compute_typerM    s@     	%--00MM00!X-}}Lrq   KeyTypeValTypec                  Z    e Zd ZdZddZddZddZddZdddZddZ	dd	Z
dd
ZddZy)
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    c                     || _         i | _        y r   original_dict	new_items)r  rT  s     rR   r  zScopedDict.__init__  s    *13rq   c                Z    || j                   v r| j                   |   S | j                  |   S r   rU  rT  r  s     rR   r  zScopedDict.__getitem__  s.    $.. >>#&&!!#&&rq   c                "    || j                   |<   y r   )rU  )r  rK  r   s      rR   __setitem__zScopedDict.__setitem__  s    #srq   c                >    || j                   v xs || j                  v S r   rW  r  s     rR   __contains__zScopedDict.__contains__  s!    dnn$At/A/A(AArq   Nc                t    || j                   v r| j                   |   S | j                  j                  ||      S r   )rU  rT  r  )r  rK  rO  s      rR   r  zScopedDict.get  s6    $.. >>#&&!!%%c733rq   c                z    t        | j                        }| j                  D ]  }|| j                  vs|dz  } |S r  )rM   rT  rU  )r  r  rx  s      rR   rw  zScopedDict.__len__  sC    ""# 	A***Q	 rq   c              #     K   | j                   E d {    | j                  D ]  }|| j                   vs|  y 7 )wr   rS  )r  rx  s     rR   __iter__zScopedDict.__iter__  s@     %%%% 	A***	 	&s   ><!>>c                H    t        | j                  xs | j                        S r   )r"  rT  rU  r  s    rR   r#  zScopedDict.__bool__  s    D&&8$..99rq   c                    t         r   rm  r  s     rR   __delitem__zScopedDict.__delitem__  s    !!rq   )rT  Mapping[KeyType, ValType])rK  rN  r   rO  )rK  rN  r   rO  r   r  )rK  r`  r   r"  r   )rK  rN  rO  Optional[ValType]r   rd  rx  )r   zIterator[KeyType]rX  )rK  rN  r   r  )r   r   r   r   r  r  rY  r[  r  rw  r_  r#  rb  r   rq   rR   rQ  rQ    s5    4'
$B4
:"rq   rQ  )frozen_defaultc              (    dfd}| |S  ||       S )Nc                    t         j                  dk\  rt        j                  | d      S t        j                  |       S )N)r  r  T)kw_onlyr   r   )r  version_infodataclasses	dataclass)r   r   s    rR   wrapzir_dataclass.<locals>.wrap  s;    w&((d6JJ ((V<<rq   )r   rh   r   rh   r   )r   r   rl  s    ` rR   ir_dataclassrm    s    = {9rq   c                     t         j                  j                  j                         } | "| j                  r| j                  j
                  S y r   )rK   r  r  r  fw_metadatabw_donated_idxs)r  s    rR   get_donated_idxsrq    s=    mm22::<O"'B'B**:::rq   c                       e Zd ZdZdZdZdZdZy)TritonAttrsDescriptorVersionr   r3   rD  r  r  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTr   rq   rR   rs  rs    s     LKK	  Grq   rs  c                 P   t         j                  j                  d      t        j                  S dd l} dd l} t        | j                  j                  d      rt        j                  S t        | j                  j                  d      rt        j                  S t        j                  S )Nr  r   AttrsDescriptor)	importlibutil	find_specrs  rt  triton.backends.compilertriton.compiler.compilerr   r  compilerrv  ru  rx  )r  s    rR   #get_triton_attrs_descriptor_versionr     s{    ~~)1+888##v''):; ,777	))+<	=+777 ,333rq   c                 8    t               t        j                  k(  S r   )r  rs  rx  r   rq   rR   triton_version_uses_attrs_dictr    s    .04P4X4XXXrq   c                   ddl m} t        | |j                        syt        | j                  t
        j                  j                        r;t
        j                  j                  j                  | j                  j                  v ryy)zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r3   r  FT)r  r  rs   rM  rL  rK   rt  ru  r   rx  r  rz  rT  s     rR   is_cudagraph_unsafe_opr    s^    
 dB--. 	4##UZZ%:%:;HHLL))T-=-=-B-BBrq   c                    t         j                  j                  dd      } t        j                         rUddlm}  |       }|rFt         j                  j                  |dd      }| r!t         j                  j                  || g      n|} | S )NLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r  rd   r9  libfb.py.parutilr  r  r  pathsep)r  r  runtime_pathlib_paths       rR   get_ld_library_pathr  1  sg    ::>>+R0D5')ww||L)UCH8<2::??Hd#34(DKrq   c                F    ddl m} t        | |      xr | j                  d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  rs   partition_signatures)r  r  s     rR   #is_codegen_graph_partition_subgraphr  >  s*    L 	789 	5((4rq   c                     t         j                  j                  j                  j                  xs t
        j                  d uxr$ t         j                  j                  j                  S r   )rK   r  rd   r  
cudagraphs&_unstable_customized_partition_wrapperr  r  r   rq   rR   is_using_cudagraph_partitionr  G  sN    %%00 	F199E1 //
 
 
0
01rq   c                    ddl m} |j                  j                  j	                  | d      r6|j                  j                  j                  | d      rt        j                  S t        j                  S )Nr3   rT  l        i   )	rW  rU  rX  rY  statically_known_ltr  rK   r<  r>  )r  rU  s     rR   dtype_from_sizer  N  sP    ww++e
''


/
/h
?{{{{rq   )r   rE   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN BF16.
    r   rE   TF)rK   r   r*  _is_mkldnn_bf16_supportedr   s    rR   is_mkldnn_bf16_supportedr  \  3     eyy99;;	+	rq   c                h    | dk(  r(t         j                  j                  j                         S d| v ryy)z;
    Returns True if the device supports MKL-DNN FP16.
    r   rE   TF)rK   r   r*  _is_mkldnn_fp16_supportedr  s    rR   is_mkldnn_fp16_supportedr  h  r  rq   c           
     n   |D cg c]  }t        t        |             }}| D ]R  }t        |      t        |      k(  sJ t        |      D ])  \  }}t        ||   t        t        |                  ||<   + T g }|j	                  dj                  d t        ||      D                     t        |      t        |      dz  z   t        |      dz
  z   }|j	                  d|z         | D ]3  }|j	                  dj                  d t        ||      D                     5 dj                  |      S c c}w )N|c              3  6   K   | ]  \  }}d || dd   ywr  r  Nr   )r   hr*  s      rR   r   ztabulate_2d.<locals>.<genexpr>{  s$     H41aAa0tWA,H   rD  r3   rL  c              3  6   K   | ]  \  }}d || dd   ywr  r   )r   r   r*  s      rR   r   ztabulate_2d.<locals>.<genexpr>  s$     Htq!!QCp4lHr  r  )rM   r   r   r   r  r  r   r   )elementsheadersr   widthsrowr   r/  total_widths           rR   tabulate_2dr  t  s   #*+ac#a&k+F+ 43x3w<'''cN 	4DAqF1Is3q6{3F1I	44 E	LLH3w3GHHIf+Vq1S[1_EK	LL{"# JSXXHs37GHHIJ99U ,s   D2c              #     K   t        | j                               t        |j                               z  }|D ]3  }| j                  |      }|j                  |      }|||n|||n|f 5 yw)a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r!   r  r  )dict1dict2
d1_default
d2_defaultall_keysrK  value1value2s           rR   	zip_dictsr    sv     ( %**,'*UZZ\*BBH  	
33 (Fj(Fj
 	
	
s   A-A/c                   	 	 	 	 	 	 	 	 d	d}| j                  dt        j                  j                        }| j	                         } |rA || dd        || dd        || dt
        j                  j                           || dd       | S )
a1  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    c                    | j                  |t        t        |            }||| |<   y |s||k7  rt        d| d| d      y y )NzInvalid config: =z. when aot_inductor.compile_standalone is True.)r  rJ   rd   r   )config_patchesconfig_nameconfig_valuer   s       rR   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s]     "";0LM=*6N;'5L0";-q>lm  1rq   zaot_inductor.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_model)r  dict[str, Any]r  r   r  r   r   r  )r  rd   aot_inductorcompile_standalonecopyrK   r  r  )r  r  r  s      rR   maybe_aoti_standalone_configr    s    "	&	58	HK			 (++)6+>+>+Q+Q $((*N^%DdK^%GNAu}}GXGXCX	
 	I<	
 rq   c                     ddl m}  | j                  j                  }|yt	        |t
              st        d      |dk(  ryt        j                  d|      st        d      y)zL
    Validates if a model name is suitable for use in code generation.

    r   rc   Tz4Invalid AOTI model name: Model name must be a stringr  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rd   r  model_name_for_generated_filesrs   r   r  r   r   )rd   
model_names     rR   is_valid_aoti_model_namer    sh    
 '$$CCJj#&OPPR 88/<d
 	
 rq   c                2    |rt        |       S t        |       S r   )r'   r&   )rO   unbacked_onlys     rR   get_free_symbolsr    s    $Q''Arq   c                    t         j                  j                  sy| |  }|rE|j                  x}r7|j	                         x}r%|j
                  j                  dd      x}r| d| }t        j                  |       y)z
    Cudagraph partition may lead to extra memory overhead so we
    log partition reasons to help users understand the overhead.
    Nstack_tracez. Found from : 
 )	rd   r  r  r  get_origin_noder   r  perf_hint_logr  )r  r^  r  warning_msgir_noder[  r  s          rR   maybe_log_cudagraph_partitionr    s     ==##HSE"K 			!W!//11W1#LL,,]DAA[A$%7}E+&rq   c                    i t         j                  dt         j                  j                  dt         j                  j	                  t
        j                              i} t        j                         rt        j                  d      | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHr  
PYTHONHOME)r  r  r  r  r  r  r  rd   r9  	sysconfigget_path)envs    rR   python_subprocess_envr    sl    

** 	bjjnn%rzzsxx'@
	C  %..v6LJrq   c                  &    e Zd ZU dZded<   ded<   y)CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r   num_partitionspartition_indexNr   r   rq   rR   r  r  .  s      rq   r  .c                      e Zd ZU dZded<   y)CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r  )r   r   r   r  r   r   rq   rR   r  r  E  s    .2G+2rq   r  c                    | t         _        y r   )r  r  )r  s    rR   !set_customized_partition_wrappersr  W  s    5<*2rq   c                8   | j                   j                  }| j                   j                  g || j                   j                  | j                   j                        }| j                   j                  }t        j                  ||f      \  }}dd}|D cg c]7  } ||      r+t        j                  j                  j                  |d      n|9 }}dddfd}|D cg c]
  } ||       }}t        j                  ||      \  }}||fS c c}w c c}w )	Nc                    t        | t        j                  j                  j                        xr/ t        | t        j                  j                  j
                         S r   )rs   rK   r  r  r;   GeneratorStatere  s    rR   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ird  sH    !U__//667 

u!!00A
 =
 	
rq   F)guard_shapec                2    t        j                  | ||      S )Nr   )rK   r   )r  r   r   s      rR   _tensorz"snode_args_kwargs.<locals>._tensorp  s    {{4uV<<rq   c                    t        | t        j                        s| S  | j                         | j                  | j
                        }|S r   )rs   rK   r  r  r   r   )r   r  r  s     rR   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensors  s7    !U\\*Haffh2
rq   rX  )r   r  )r   r   r   r   )r  r  fill_non_provided_argsconstant_argsr  pytreer"   rK   r  r  ir_node_to_tensortree_unflatten)	r]  rz   r  	flat_argsflat_args_pytree_specr  r  r  r  s	           @rR   snode_args_kwargsr  [  s   ::D::,,*$*))*

D ZZF'-':':D&>'J$I$
 	   	,,QE,B	I = -66q"6I6((4IJLD&<%  7s   <D$DrV  )ro   r   r   r   )r}   r   r   r"  )   d   )r   zCallable[[], Any]r   r   r   r   r   rB  rX  )r   z"Union[Optional[torch.device], str]r   torch.device)r  zIterable[sympy.Expr]r   r   )r  Sequence[sympy.Expr]r  r  r   r   )r  zIterable[_T]r   zValuesView[_T])r  r  r  r  r   r  )rK  r=  r   r   )rQ  z"Iterable[Union[int, torch.SymInt]]r   zlist[sympy.Expr])r   r  r   zUnion[int, torch.SymInt])rQ  z Iterable[Union[int, sympy.Expr]]r   zlist[Union[int, torch.SymInt]])rg  torch._ops.OpOverloadr   r"  )r{  r2   rr  z'Callable[[torch._ops.OpOverload], bool]r   r"  )rs  r   rz   r|  r  r  r   z&tuple[GraphModule, list[torch.Tensor]])rC   )r   r   r   r  )r3   rC   )
r  Callable[..., Any]r  Sequence[Any]r   r   r   r   r   rB  )r   r  r  g      ?rC   )r  r  r  r  r   r   r  r   r  rB  r   r   r   rB  )r  r   r  r   r   r  )r  r   r  r   r   r  )r  r   r  r   r   r   )rO   zUnion[int, Sequence[int]]r  r   r   Sequence[int])rO   ztuple[_T, ...]r   zlist[_T])r   z!Callable[Concatenate[Any, P], RV]r   zCachedMethod[P, RV])r  0Union[Sequence[BaseSchedulerNode], ExternKernel]r   zOrderedSet[Node])r  Sequence[BaseSchedulerNode]r  z8Literal[True, 'torch', 'original_aten', 'inductor_node']r   r   )r  r  r  r6   r   ztuple[str, str]r   )r.  zIterable[torch.fx.Node]r/  zOptional[Callable[[Any], bool]]r   OrderedSet[torch.fx.Node])rz   zSequence[IRNode]r  zdict[str, IRNode]r   r  rT  )r   r   r   zValueRanges[Any])r^  r   r   r"  )r^  r`   r   r   r   ru  )rj  r"  r   r"  )r   r   r   ru  )rF  r   rw  zdict[sympy.Expr, Any]r   r   )r  r   r   z,TypeGuard[Union[torch.SymInt, torch.Tensor]])rz   r   r   r"  )r  torch.fx.GraphModuler   zOptional[torch.fx.Node])r  r   r   r2   )r  r   r   zOrderedSet[torch.device]rW  )r  r   r   r   )NNT)r  zOptional[dict[str, Any]]r  r  r  r"  r   r6  )r  r  r   	list[int])rZ  r)   r  z.Sequence[Union[int, torch.SymInt, sympy.Expr]]r   r  )r   torch.dtyper   r   r5  rU  )r  zUnion[int, torch.device]r   r"  rx  )r  r   r   r  r  Optional[int]r   r4   )r  r<   r  zlist[torch.dtype]r   r"  )r  r   r   r"  )
r  r<   r  r"  r  r"  r  r"  r   r"  )r  r;   r  r"  r   r"  )
r  r<   r  r   r  r   rx  r   r   r"  )r  r   r   r"  )r  r   r  r   rx  r   r   r"  )r  r   r  r   rx  r   r   r  )r   r   r   r   )r   zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]])r  r<   r   r"  )r  r<   rQ  zUnion[ReinterpretView, Buffer]rR  r;   r   r"  )FTFN)r  r<   rQ  r;   rR  r;   rZ  r"  rN  r"  rr  r"  rb  r  r   r"  )r   Callable[P, _T]rz   r  r  r  r   ztuple[_T, list[str]])r   r  r   ztuple[Any, list[str]])r   r  rz   r  r  r  r   r   )r   r  rz   r  r  r  r   r   )r   r  rz   r  r  r  r   ztuple[Any, list[GraphLowering]])r  r  r  r  r   r6  )r  r  r  zOptional[Callable[..., Any]]r   r   )r  r   r   r  )r   r  )r  r  r   r"  )r  zSequence[torch.Tensor]r   r"  )r=  r   r   r  )r  r"  rz   r   r  r   r   zIterator[Any])r   r  r   rB  )r  r   r   r"  )r  r   r   r   )r  zIterable[Any]r   r"  )
rR  r  r  r1   r7  r  r  r   r   r  )rA  z"Optional[Union[Buffer, Operation]]r   r"  )r  z Optional[Union[Node, Operation]]rg  z!Optional[torch._ops.OperatorBase]r   r"  )r  z"Optional[Union[IRNode, Operation]]r   r"  )r]  rA   r   r"  )r  zOptional[Operation]rg  z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]r   r"  )re  r   rf  r  rg  r  r   r   )r]  rA   rp  zMutableSet[BaseSchedulerNode]rf  zdict[str, SchedulerBuffer]rg  zdict[str, BaseSchedulerNode]rm  zCallable[[Any], bool]r   r  )r}  r   r~  r   r   r   )r  r   r   r   )r  r   r   r6  )r   r   r   r   )r   r  r   r"  )r   r   r   r"  )r   r  r   r"  )rL  r  r  r  r  r  r  r  r  r   r  r"  r   r"  )r  r  r   r  )r   r  r   r"  )r  r  r   r"  )r   rY  )r   r  rz   r  r  r  r   ztuple[_T, str])r  Sequence[InputType]r   zOptional[ShapeEnv])r  Callable[[list[InputType]], _T]r  r  r  zOrderedSet[int]r   r  )rO   r  r   r  )r  r  r  r  r  zOptional[OrderedSet[int]]r   z-tuple[list[torch.Tensor], list[torch.Tensor]])r  r  r  r  r   r  )r   r   r   r"  )r  r  r
  r@   r   r  )r   r  r   r   )r   r   r   r  )r  r  r   r  r   r"  )r   ztuple[str, ...])r   r   r<  r0   r>  r=  r   r  )r   r   r   r  )r   r  r   r  )r   zOptional[type[Any]]r   r"  r   r   )r   zOptional[list[int]])r   rs  )r  r=   r   r"  )r  r6   r   r"  )r  r   r   r  )r   r   r   r"  )r  zSequence[Sequence[T]]r  zSequence[T]r   r   )NN)
r  rc  r  rc  r  ValType | Noner  r  r   zEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None])r  r  r   r  )rO   r(   r  r"  r   zOrderedSet[sympy.Symbol])zcudagraph partition due to N)r  r   r^  r  r  zOptional[BaseSchedulerNode]r   r  )r   zdict[str, str])r  CUDAGraphWrapperTyper   r  )r]  rA   r   z tuple[list[Any], dict[str, Any]](p  
__future__r   r  r7  rj  enumr  r{  r  r2  r:  r  rD  r  r  r  r   r  r   r  r  r  rF  r  r  collections.abcr   r   r   r   r   r	   r
   r   typingr   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   r   r   rt   rK   torch.utils._pytreer  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr   torch.utils._dtype_abbrsr    torch.utils._ordered_setr!   r"   r#   OPTIMUS_EXCLUDE_POST_GRADr  r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   torch._prims_commonr0   torch.fxr1   torch.fx.noder2   r  r4   r  r6   rX  r8   r  r9   r:   r;   r<   r=   r>   output_coder@   r  rA   rB   rI   rG   r   rS   torch._dynamo.device_interfacerT   torch._dynamo.utilsrU   torch.autogradrV   torch.autograd.profiler_utilrW   (torch.fx.passes.graph_transform_observerrX   torch.fx.passes.shape_proprY   torch.utils._sympy.functionsrZ   r[   r\   r]   r^   torch.utils._sympy.symbolr_   r`   torch.utils._sympy.value_rangesra   rb   r  rd   runtime.runtime_utilsre   r  _IS_WINDOWS	getLoggerr   r   _logginggetArtifactLoggerr  rh   r  r  	VarRangesr  r   	InputTypeGPU_KERNEL_BIN_EXTSr  r  r  r  rn   rp   ry   Functionr{   rk  r   r   r   r   r  r  r  r  rN  rR  r\  r^  rh  rp  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r,  r2  rA  rS  r\  r_  rf  rk  rn  ry  r}  r  r  r  r  r  r  r  r   r  r  r8  r  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacher  r  	lru_cacher  r  r  r  r[  ri  r  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r	  r  r   r$  r4  r?  rD  rG  rJ  rL  rS  rO  rv  rx  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rp  r  r  r	  r  r  r  r  r  rm  r%  Enumr'  r<  rB  rE  rQ  rU  rZ  r`  rb  rh  ro  rw  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   compiler  r  r#  r(  r+  r9  r;  r?  rA  rB  rD  rJ  rM  rN  rO  rQ  rm  rq  rs  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTyper  r  r  r  r  )rx  r}   s   00rR   <module>r3     s   "        	     	  	   
                $ $ ? : 0 / ; ($ 
  >>//C$",5$TT,= +	CL
   D 0 % 2 K 0  8 D  = llg%g!00<H T]UZZ'(	U5<<ell:;<	'7 	 {Q'A-+2B XDX XB5
LENN  d#  $"GV 9<SS#&S25S
Sl  ;@
+*"*+A**#AL+	+++"/	)/#/G @OI	I<I 
I0 *8+0' 	!  	
 ( %'!  	
    )'#$  cNTT"E8WQU^ E:C*!).!)O!) 	!)HN2CN2!N2 N2f 48*0 (G
G$5GG:,^%	DU	>2-888v'& 
: !# I "	 .29+9	9 9 	9 9z !5 $ " A!!L!!H Q7 7*  , , ,
R' R'j
 
 @ @ @?' ?  8 J J ) )I #'   	(+<	  #  	
  
: 7< dN CH BJ CO,) ,  $  . 5( 5(p @ @ R R:+\H&

8
@F
	
" ""&"&==
= = 	=
  = =  = 
=@'C C"      	 $ &2:/(V &2:## &#2:#$#* ...@.. .$ IMFF)EFF*	B&&   D D %6 %6P  Q0(#K(*$)) *!

!
"-!
4A!
HK!
	!
H1	" -1#
*#)# 
#L(%#J
JGJ 
JLL .LDRLL *=

5
 ,
 5	

 '
 

< *=5 , 5	
 ' 
:T 2     ,!.N $&$!$ $ 	$
 $ $ 
$NH>L'  &2:2(*" ( %	0	: 37$$$$ 0$ 3	$<$ $ 3F!3B	:&/ '#)* $%
  +?*D*D*FG$!QAG  "**Y'H	  & 1 1 1
 68 2 7
8 1 
	 /9l O :)	# )

)
-" 01 -"` D)t  *499  4 42Y&
1 * 		& "&!%	 
$ 
$ 
  
 	 

 K 
F.b6 :(,'	'' &' 
	'26 d#  $ 38$./@ 3 3 *:); &= } Hs   <k8