
    irl                   R   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZmZ erd dlmZmZ d dlm Z  d dl!Z!d dl"Z"d dl#Z#d dl$Z#d dl%m&c m'Z( d dl)m*Z*m+Z+ d d	l,m-Z-m.Z. d d
l/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9m:Z:m;Z; d dl<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZD ddlEmFZF ddlGmHZHmIZImJZJ ddlKmLZLmMZM ddlBmNZNmOZOmPZPmQZQ ddlRmSZSmTZT ddlUmVZV ddlCmWZWmXZXmYZYmZZZm[Z[ ddl\m]Z] ddl^m_Z_m`Z` ddlambZbmcZc ddldmeZe ddl&mfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZv ddlwmxZx  ej                  ez      Z{e#j                  j                  ezd      Z~e#j                  j                  ezd       Ze#j                  j                  ezd!      Zed"   Zd#ed$<    ed%      Z ed&      Z e!j                         Zd'ed(<   	 	 	 	 	 	 dOd)Zej                   G d* d+             Zej                   G d, d-e             Z G d. d"      Zej                  dPd/       ZdQd0ZdRd1ZdSd2Z G d3 d4      ZdTd5Z G d6 d7      Z	 	 	 	 	 	 	 	 dUd8Z G d9 d:e      Z G d; d<e      Z G d= d>e      Z	 	 	 	 dVd?Z	 	 	 	 	 	 	 	 dWdAZ G dB dCe      Z G dD dEe      Z G dF dGe      Z	 dX	 	 	 	 	 	 	 dYdHZej                   G dI dJ             Z ej@                         ZdZdKZ G dL d@      Z G dM dN      Zy)[    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)	ParamSpec	TypeAlias)IteratorSequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols
OrderedSet)free_symbol_is_typesymbol_is_typeSymT)
has_triton   )commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)get_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitmaybe_log_cudagraph_partitionsympy_product)Vfusionloop_orderingcompute_dependenciesBaseSchedulerNoder   PartitionType_T_PzEweakref.WeakKeyDictionary[torch._ops.OpOverload, Callable[..., bool]]_custom_should_partition_fnsc                b    t        | t        j                  j                        sJ |t        | <   y)ax  Register a function that says if Inductor should partition the graph on this op.

    The function should be have the same signature as the operator.
    Inductor will invoke the function with FakeTensors when it needs to decide
    if the graph should be partitioned.

    `register_should_partition_rule` is currently private and experimental.
    Use at your own risk.
    N)
isinstancetorch_ops
OpOverloadrX   )opfuncs     S/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_inductor/scheduler.pyregister_should_partition_rulera   f   s)     b%**//000'+ $    c                      e Zd ZU ded<   ded<   ded<    ej
                  e      Zded	<    ej
                  e      Z	d
ed<   ddZ
ddZddZddZddZddZddZddZddZddZy)SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr:   
mpi_bufferc                B    | j                   }|J |j                         S N)ri   get_name)selfr^   s     r`   defining_op_namez SchedulerBuffer.defining_op_name   s#    ~~{{}rb   c                @    t        | j                  j                        S ro   )hashrg   namerq   s    r`   __hash__zSchedulerBuffer.__hash__   s    DIINN##rb   c                v   t               }| j                         }|j                  | dt        | j                        j
                          |j                  | d| j                  j                          | j                         r-|j                  | dt        | j                                       | j                         r-|j                  | dt        | j                                       t        | j                        dk  r0|j                  | d| j                          |j                         S |j                  | d       |j                  d      5  | j                  D ]  }|j                  | d        	 d d d        |j                  d	       |j                         S # 1 sw Y   *xY w)
N: z
.layout = z.aliases = z.mutations = r    z	.users = z
.users = [,])rG   rp   	writelinetyperg   __name__layoutget_aliasespformatget_mutationslenrl   indentgetrawvalue)rq   resultru   users       r`   	debug_strzSchedulerBuffer.debug_str   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! 1 JJ 1D$$vQZ011 S!!!##	1 1s   &F//F8c                6    | j                   j                         S ro   rg   rp   rv   s    r`   rp   zSchedulerBuffer.get_name       yy!!##rb   c                   | j                   J | j                   j                         sy | j                   j                         sL| j                   j                         s2t	        | j                   j                         t        j                        r4t        j                  j                  j                  | j                          y t        t        j                  d      r| j                         t        j                  j                  v rt        j                  j                  | j                            }|| j                   j"                  v r$| j                   j"                  |   j                   }n#| j                   j$                  |   j                   }t        j                  j                  j'                  || j                          y t        j                  j                  j                  | j                          y )Nargs)rg   should_allocateget_inputs_that_alias_outputget_mutation_namesrZ   get_output_specr%   CommBufferLayoutrP   graphwrapper_codecodegen_allocationhasattrkernelrp   inplace_update_buffersrf   name_to_donated_buffername_to_bufcodegen_inplace_reuse)rq   input_buffer_nameinput_buffers      r`   allocatezSchedulerBuffer.allocate   sV   yy$$$yy((* II224yy++-$))335r7J7JKGG  33DII> AHHf%188#B#BB !" ? ? P DNN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>rb   c                   | j                   J t        | j                   j                  t        j                        st        | j                         ry| j                  D ]  }t        |j                   t              s y yNFT)rg   rZ   r   r%   r8   rK   rl   
OutputNode)rq   uses     r`   can_freezSchedulerBuffer.can_free   sg    yy$$$dii&&6:SII;
 :: 	C#((J/	 rb   c                ,   i }|D ]o  }t        |j                        |v r>|j                  |t        |j                                 |t        |j                        <   X||t        |j                        <   q t        |j	                               | _        y ro   )idrg   mergelistvaluesrl   )rq   rl   r   r   s       r`   	set_userszSchedulerBuffer.set_users   st    &( 	+C#((|v%'*yy3881E'Fr#((|$'*r#((|$		+
 &--/*
rb   c                R    | j                   J | j                   j                         S ro   )rg   r   rv   s    r`   r   zSchedulerBuffer.get_aliases   s%    yy$$$yy5577rb   c                R    | j                   J | j                   j                         S ro   )rg   r   rv   s    r`   r   zSchedulerBuffer.get_mutations   %    yy$$$yy++--rb   c                R    | j                   j                         j                         S ro   )rg   r   
get_devicerv   s    r`   r   zSchedulerBuffer.get_device   s    yy((*5577rb   Nreturnstrr   intr   Noner   bool)rl   rk   r   r   r   zSequence[str]r   Optional[torch.device])r~   
__module____qualname____annotations__dataclassesfieldr   rl   r:   rm   rr   rw   r   rp   r   r   r   r   r   r    rb   r`   rd   rd   w   sz    
O,,-K--dCE>C.?k.?.?3/J+ 
$$($?B
+8.8rb   rd   c                      e Zd ZU dZded<   y)SchedulerDonatedBufferNrh   ri   )r~   r   r   ri   r   r   rb   r`   r   r      s    /3K,3rb   r   c                     e Zd ZU ded<   ded<   ded<   ded<   ded	<   d
ed<   dZded<   dCdZdDdZdEdZdEdZdEdZ	dFdZ
dEdZdGdZ	 	 	 	 	 	 dHdZdIdZdJdZdKdZdLdZ	 	 	 	 	 	 dMdZdGdZdNdZdNdZdGd ZdGd!Z	 	 	 	 dOd"ZdEd#ZdEd$ZedNd%       ZedNd&       ZedKd'       ZedKd(       ZdPd)Z dQd*Z!dRd+Z"dSd,Z#dKd-Z$dKd.Z%dKd/Z&dKd0Z'dKd1Z(dKd2Z)dKd3Z*dTd4Z+dKd5Z,dGd6Z-	 dU	 	 	 	 	 dVd7Z.edWd8       Z/edWd9       Z0edWd:       Z1	 	 	 	 	 	 dXd;Z2	 	 	 	 	 	 dYd<Z3edZd=       Z4d[d>Z5ed[d?       Z6d\d@Z7d]dAZ8e9	 	 	 	 d^dB       Z:y)_rT   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writesOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr;   mpi_nodeNOptional[float]override_estimated_runtimec                "    || _         d | _        y )Nc                     g S ro   r   )r   kwargss     r`   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>   s    B rb   )rf   debug_device_str)rq   rf   s     r`   __init__zBaseSchedulerNode.__init__   s    $-& 	rb   c                Z   || _         t               | _        t        t                  | _        d| _        |j                         D cg c]  }t        | j                  ||        c}| _	        | j                  D ci c]  }|j                         | c}| _        i | _        y c c}w c c}w )NF)rf   rg   ri   )rg   r   	ancestorsr   
last_usagewrittenget_outputsrd   rf   outputsrp   outputs_by_namemutation_renames)rq   rg   outputbufs       r`   _init_from_nodez!BaseSchedulerNode._init_from_node   s    ,0	*4,$
   **,/
  .. /
 ,0<<<
$'CLLNC<
 13#/
<
s   B#;B(c                T    t        |       j                   d| j                         dS )Nz(name=)r}   r~   rp   rv   s    r`   __repr__zBaseSchedulerNode.__repr__  s'    t*%%&fT]]_,?qAArb   c                H   | j                         }t               }|j                  | dt        |       j                   dt        t        | dd            j                   d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j                  |j                                # 	 ddd       |j                  d       	 |j                  | j                                |j'                         j)                         S # 1 sw Y   XxY w# t         $ r t"        j%                  dd       Y Lw xY w)#Longer form printout for trace logsry   (rg   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r{   Ignoring error in debug_str()Texc_info)rp   rG   splicer}   r~   getattrr   r   writesr   readsr   r   r   r|   debug_str_extra	Exceptionlogwarningr   rstrip)rq   ru   r   outs       r`   r   zBaseSchedulerNode.debug_str  s   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   %5E25E> 2E;> F! F!c                     y)N r   rv   s    r`   r   z!BaseSchedulerNode.debug_str_extra2      rb   c                $    | j                  |       S ro   )r   rv   s    r`   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_device5  s    $$T**rb   c                   t        | j                  dd       }d}t        |t        j                  j
                  j                        r'd|j                  |j                         gdd      z   }nct        |t        j                  j
                  j                        r5d|j                  |j                         |j                         gdd      z   }|  | S )Ndatar   z, F)shorten	multiline)r   rg   rZ   r[   	_inductorr%   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rq   
maybe_datadata_strs      r`   debug_str_shortz!BaseSchedulerNode.debug_str_short8  s    TYY5
j%//"4"4">">?j33$$&'% 4  H 
EOO$6$6$@$@Aj33..0*2O2O2QR 4  H
 z""rb   c                p    t         j                  d| | j                  | j                  j                         y )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   rv   s    r`   log_detailszBaseSchedulerNode.log_detailsG  s,    6####		
rb   c                     yNFr   )rq   self_dep	other_deps      r`   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pairO       rb   c                    d | j                   j                         D        D ci c]  }||v r|||    c}| _        | j                  | j                   j	                  | j                               y c c}w )Nc              3  4   K   | ]  }|j                     y wro   ru   .0deps     r`   	<genexpr>z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>W  s     QcQ   )r   reads_and_writesr   set_read_writesrename)rq   renamesru   s      r`   update_mutated_namesz&BaseSchedulerNode.update_mutated_namesT  sp     RT-=-=-N-N-PQ!
w '$-!

 	T--44T5J5JKL!
s   A2c                X    | j                  | j                  j                  |             y ro   )r  r   	with_readrq   r  s     r`   add_fake_depzBaseSchedulerNode.add_fake_dep\  s!    T--77<=rb   c                B    t        d | j                         D              S )Nc              3  `   K   | ]&  }|j                         xs |j                          ( y wro   )r   r   )r  r   s     r`   r  z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>`  s-      
9<COO4!2!2!44
s   ,.)anyr   rv   s    r`   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation_  s%     
@D@P@P@R
 
 	
rb   c                h    || _         | j                   j                  | _        | j                          y ro   )r   r   r   
prune_deps)rq   rws     r`   r  z!BaseSchedulerNode.set_read_writesd  s(    "&"2"2"8"8rb   c                b    | j                         }t        fd|D              }||z
  | _        y )Nc              3  B   K   | ]  }j                  ||        y wro   )get)r  kmutation_real_names     r`   r  z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>m  s     !U1"4"8"8A">!Us   )used_or_aliased_buffer_namesr   r   )rq   future_used_buffersr0  used_bufferss     ` r`   set_last_usagez BaseSchedulerNode.set_last_usagei  s0     88:!!U!UU&)<<rb   c                F    | j                   D ]  }|j                           y ro   )r   r   )rq   r   s     r`   mark_runzBaseSchedulerNode.mark_runp  s    << 	CLLN	rb   c                    t        d t        j                  | j                  j                  | j                  j
                        D              S )Nc              3  4   K   | ]  }|j                     y wro   r  r  s     r`   r  z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>u  s      
 HH
r  )r   	itertoolschainr   r   r   rv   s    r`   used_buffer_namesz#BaseSchedulerNode.used_buffer_namest  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
rb   c                $   t               t        j                  | j                  j                  | j                  j
                        D cg c]  }|j                   }}t        |      dkD  r|j                         }j                  |       t        j                  j                  j                  |      rC|j                  fdt        j                  j                  |   j                         D               t        |      dkD  rS c c}w )Nr   c              3  *   K   | ]
  }|vr|  y wro   r   )r  alias
used_namess     r`   r  zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s#       J.	 s   )r   r9  r:  r   r   r   ru   r   popaddrP   r   name_to_bufferr.  extendr   )rq   r  depsr?  s      @r`   r1  z.BaseSchedulerNode.used_or_aliased_buffer_namesz  s    &0l
 !t'7'7'='=t?O?O?V?VW
 HH
 
 $i!m((*CNN3ww%%))#. !"!7!7"224	 	 $i!m 
s   Dc                L     t         fd j                  D               _        y )Nc              3  f   K   | ](  }|j                   j                  j                  vr| * y wro   )ru   rf   available_buffer_namesr  r  rq   s     r`   r  z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>  s/      -
xxt~~DDD -
s   .1r   r   rv   s   `r`   r*  zBaseSchedulerNode.prune_deps  s#    ", -
..-
 #
rb   c                     d fdt        fd j                  j                  D              } j                   j                  j	                  |             y )Nc                    t        | t              syj                  j                  | j                     j                         }|t        j                  j                  v S r  )	rZ   r0   rf   r   ru   rr   rP   r   removed_operations)r  op_namerq   s     r`   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  sF    c7+nn00:KKMGagg8888rb   c              3  4   K   | ]  } |      s|  y wro   r   r  r  rN  s     r`   r  z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  s      
\#5FC
   r  r-   r   r   )r   r   r   r  remove_reads)rq   	to_removerN  s   ` @r`   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps  sN    	9  
++11
 
	 	T--::9EFrb   c                F    t        | || j                  j                         y ro   )_prune_redundant_depsrf   r   )rq   name_to_fused_nodes     r`   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps  s     	d$68R8RSrb   c                R    | j                   J | j                   j                         S ro   )rg   get_operation_namerv   s    r`   rp   zBaseSchedulerNode.get_name  r   rb   c                "    | j                         S ro   rp   rv   s    r`   get_first_namez BaseSchedulerNode.get_first_name  s    }}rb   c                B    t        d | j                         D              S )Nc              3  <   K   | ]  }|j                           y wro   r]  r  rg   s     r`   r  z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s     Gd$--/G   )r   	get_nodesrv   s    r`   get_operation_namesz%BaseSchedulerNode.get_operation_names  s    Gdnn6FGGGrb   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wro   r]  r  r   s     r`   r  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     AS#,,.Arb  )r   r   rv   s    r`   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s    ADLLAAArb   c                B    t        d | j                         D              S )Nc              3  Z   K   | ]#  }t        |t              xr t        |d        % yw)T)disallow_fp32_opsNrZ   SchedulerNoder'   r  ns     r`   r  zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s7      
  q-( G+AFG
s   )+allrc  rv   s    r`   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
rb   c                B    t        d | j                         D              S )Nc              3  V   K   | ]!  }t        |t              xr t        |       # y wro   rl  rn  s     r`   r  z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s-      
 q-(K-H-KK
s   ')rp  rv   s    r`   r'   z-BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
rb   c                    | gS ro   r   rv   s    r`   rc  zBaseSchedulerNode.get_nodes  s	    vrb   c                    | j                   S ro   )r   rv   s    r`   r   zBaseSchedulerNode.get_outputs  s    ||rb   c                     | j                   |   S ro   )r   )rq   buf_names     r`   
get_outputzBaseSchedulerNode.get_output  s    ##H--rb   c                R    | j                   J | j                   j                         S ro   )rg   r   rv   s    r`   r   zBaseSchedulerNode.get_device  s%    yy$$$yy##%%rb   c                L    | j                         }|d uxr |j                  dk(  S Ncpu)r   r}   rq   devices     r`   is_cpuzBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::rb   c                X    | j                         }|d uxr t        |j                        S ro   )r   rJ   r}   r~  s     r`   rJ   zBaseSchedulerNode.is_gpu  s'    "T!9fV[[&99rb   c                     yr  r   rv   s    r`   is_reductionzBaseSchedulerNode.is_reduction      rb   c                     yr  r   rv   s    r`   is_split_scanzBaseSchedulerNode.is_split_scan  r  rb   c                     yr  r   rv   s    r`   is_templatezBaseSchedulerNode.is_template  r  rb   c                     yr  r   rv   s    r`   	is_externzBaseSchedulerNode.is_extern  r  rb   c                     yr  r   rv   s    r`   
is_foreachzBaseSchedulerNode.is_foreach  r  rb   c                     yr  r   rq   read_deps     r`   can_inplacezBaseSchedulerNode.can_inplace  r  rb   c                     yr  r   rv   s    r`   has_side_effectsz"BaseSchedulerNode.has_side_effects  r  rb   c                \
    ddl m} t         t              rt        j
                  rt        j                  j                   j                         t        j                        r{t        t        j                  t        j                  j                  j                   j"                        rt%        t        j                  dd      t'        t        j                  d      sy j(                  t        j                  j*                  z   j,                  j.                  z  }d fd} j1                         D ]  }|j2                  }|J |j5                         rJ|j7                         s:|j9                         s*|j;                         t        j                  j<                  v ro j>                  j@                  D ]h  }|jB                   j,                  jD                  v r$ j,                  jD                  |jB                     }n/ j,                  jF                  jI                  |jB                        }|s|t        j                  jJ                  jM                  |       st        |jN                  tP              r|jR                  J |jR                  D cg c]   }|j2                  j;                         |vr|" }	}tU        |	      dk(  s|	d   jV                  s&|	d   j2                   u s9|j2                  Gt        |j2                  jY                         tZ        j\                  tZ        j^                  tZ        j`                  f      r|jN                  rft        |jN                  j2                  tZ        jb                  tZ        jd                  f      r(tU        |j2                  j7                               dkD  r ||j2                  |j2                        s+ ||      s5t        j                  jf                  ji                  |j;                         |j;                                t        t        j                  t        j                  j                  j                   j"                        rnt        j                  jj                  jm                  |j;                                t        j                  jj                  jm                  |j;                                |j;                         t        j                  jn                  |j;                         <      yc c}w )	z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r    )can_match_buffer_size	mutationsNr   c                   | j                   j                        }| j                         t               }| j                  D ]  }|j
                  }t        |t              s |j                         | j                   j                  vs| j                   j                  |      |urd|fd|j                  j                         D        z  }t        |      dkD  s y y)Nc              3  @   K   | ]  }|j                   k(  r|  y wro   r  )r  orx  s     r`   r  z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>"  s%      vv)    r    FT)rf   get_fused_noderp   r   rl   rg   rZ   rT   r^  rX  r   r  r   )buf_to_be_inplaced
fused_noderD  r   	user_noderx  rq   s        @r`   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node
  s    
 ,55DDTJJ)224H %/LD*00 ! II	!)->? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= '!* rb   r   )r  rd   r   r   )8codegen.wrapperr  rZ   rm  r"   inplace_buffersrP   r   has_featurer   r(   INPLACE_BUFFERSr   r[   r  codegensimd
SIMDKernelr   r   r   rL  rf   completed_operationsr   rg   r   r   r   rp   removed_buffersr   r   ru   r   r   r.  r   	can_reuseri   NopKernelSchedulerNoderl   r   r  r   r%   r8   r7   MutationLayoutSHOULDREMOVEFallbackKernelr6   r   make_inplacer  rA  r   )
rq   r  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         r`   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update  s   
 	; t]+&&##DOO$5~7U7UVqxx)@)@)E)E)P)PQ188[$7C &) NNgg(()nn112 	 	D ##% C	CxxH''',,.88:..0<<>QWW%<%<<((.. 899 E EE $ E Edii PI $ : : > >tyy II ,,66y$G&y'<'<>TU$??666 "+&66??,4II &N & N+q0*1-99*1-22d:%NN6 *%NN::< " " 4 4 " = =! &11 * ) 5 5 : :!#!2!2BNN C! !$INN$O$O$Q RUV V1)..#((K6yA
 2293E3E3GX%HHeoo&=&=&B&B&M&M HH..2293E3E3GHHH..223<<>B &..0 77G q8C	0&s   %T)c                R   t         j                  sy |r| j                  ry | j                  J | j                  j	                         }g }|D ]0  }|j
                  dk(  r|j                  d       |j                  d       d|j
                   d|j                   }d|j                  v r|d|j                  d    z   }|j                  |       d|j                  v s|j                  d    }|j                  d	d
      d   }|j                  d|j                  dd      j                  dd      j                  dd      j                  dd      z          |j                  d       |j                  d       3 t        |      dk(  ry |j                  |       d| _        y )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r    )maxsplit{z{{}z}}r   \z\\z#pragma CMT END ORIGINr   T)r"   comment_originr   rg   get_originsr^   appendtargetmetarsplitreplacer   
writelines)	rq   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            r`   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_infoq  s    $$yy$$$))'')	 	%AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(:(:3(:(KB(O%  "+33C>WS$'WT4(Wf	   !9:  $3	%6 y>Q 	)$rb   c                (    | j                  dd      S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrv   s    r`   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s    55t 6 
 	
rb   c                (    | j                  dd      S )NTFr  r  rv   s    r`   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizes  s    55u 6 
 	
rb   c                (    | j                  dd      S )NFTr  r  rv   s    r`   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizes  s    55 6 
 	
rb   c                Z    t        | j                  ||      j                         d      S )Nr  r   )start)sumget_read_write_buffer_accessesr   )rq   r  r  s      r`   r  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl  s3     //+N 0 fh	
 	
rb   c                    t         t              ri S t         t              rt         j                  t              ri S t         t              r`t         j                  t
        j                        r< j                  j                  t        j                  j                  j                  u ri S ddt         t              r@ t         j                         d         t         j                         d         z        nt        d      t!        j"                  t$              }|r9 j&                  j(                  D ]   }||j*                     j-                  |       " |r9 j&                  j.                  D ]   }||j*                     j-                  |       " |r&t1        d  j&                  j(                  D              n	t1               }|r&t1        d  j&                  j.                  D              n	t1               }d fdt         t2              rt1         fd|D              }||z
  }||z
  }i }||z  D ]  }	t5        fd	||	   D              |	t6        j8                  j:                  v rt6        j8                  j:                  |	   }
n;|	t6        j8                  j<                  v rt6        j8                  j<                  |	   }
n	 	 	 	 d fd
 |
      }|	|vr|||	<   ||	xx   |z  cc<    |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        c                X    t         j                  j                  j                  | d      S )Nr   fallback)rP   r   sizevars	size_hint)ss    r`   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<rb   r   r        eAc              3  4   K   | ]  }|j                     y wro   r  r  s     r`   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     BCsxxBr  c              3  4   K   | ]  }|j                     y wro   r  r  s     r`   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>   s     CCsxxCr  c                    j                   j                  |    j                  }t        d |D              }t	        |t        |      z
        dkD  S )Nc              3  4   K   | ]  }|j                     y wro   rg   )r  r   s     r`   r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s     !>$))!>r  r   )rf   r   rl   r   r   )r   snodesrl   buf_usesrq   s       r`   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sG    NN..s399E!!>!>>Hx*V"44599rb   c              3  J   K   | ]  } |j                         r|  y wro   r  )r  r  r  rq   s     r`   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s#      )_S$++-N)s   ##c              3  "   K   | ]  }  y wro   r   )r  r  
node_numels     r`   r  zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     $RCZ$Rs   c                   | syt        | t        j                        r| j                         S t        | j                  t
              rj                  j                  | j                            j                  }d}|D ]x  }t        |j                  t              sJ t        |j                  j                  t              r5|j                  j                         D ]  }| |j                        z  } x y |S t        | j                  t        j                        r"t        fd| j!                         D              S  	t#        | j%                                     }t'        | j)                               t+        |      z  S )Nr   c              3  h   K   | ])  } t         j                  j                  |             + y wro   )rP   r   
get_buffer)r  mut_nameget_buf_bytess     r`   r  zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>6  s-      $ &agg&8&8&BCs   /2)rZ   r%   TorchBindObjectr  r   r7   rf   r   rp   rl   rg   rT   r6   r   r8   r  r   rO   r  rD   	get_dtypemin)
r   rl   totr   	sched_buf	buf_elemsbuf_accessed_elemsr  rq   r  s
         r`   r  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  sC    c2#5#56,,..

,=> !NN66s||~FLLEC % 	%)$))5FGGG%diinnkB-1YY-B-B-D E	 #}Y^^'D DE $%	% J

BMM: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  rb   )r  z
sympy.Exprr   r   )r   r   r  Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )rZ   r  ExternKernelSchedulerNoderg   r6   r%   r  op_overloadr[   _prims	rng_primsgraphsafe_run_with_rng_staterm  rO   
get_rangesr   collectionsr   r   r   r   ru   r  r   r   FusedSchedulerNoder  rP   r   rB  graph_inputs)rq   r  r  buf_accessesr  r   r   r  buf_byte_accessesrx  r   	buf_bytesr  r  r  r  r  s   `           @@@@@r`   r  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d23Id56:II{<
 It67499b&7&78		%%||%%BBC I	= dM*&doo/23 1! 456J
 SJ"..t4''-- 3SXX&--c23 ''.. 3SXX&--c23
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d./( )%) O o-FO+E,. 1	9H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I00.7!(+!(+y8+c1	9f ! rb   c                    | j                   y | j                   j                         }|y t        |      }|y t        j                  j
                  j                  |d      }t        d   dxx   |z  cc<   |S )Nr   r  inductor
flop_count)rg   get_origin_noder3   rP   r   r  r  r   )rq   fx_nodeflopsresolved_flopss       r`   estimate_flopsz BaseSchedulerNode.estimate_flopsH  su    99))++-?w'=))33EA3F\*n<*rb   c                R    | j                   | j                   S | j                         S ro   )r   _get_estimated_runtimerv   s    r`   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtimeX  s)    **6222**,,rb   c                   | j                         d   j                         d   }|j                  j                         }t	        t        |            syt        | j                        rt        | j                  t        j                        sJ 	 t        j                  rst        |       }t               }|j                  |      }|t        |t              sJ |S t!        |       }|t#        | j                        }|j%                  ||       |S t#        | j                        S t/        | j                        ryt1        |       }||S |j                  j3                         }		 t5               }
t7        |	      dz  }|
dk  rt9        d|
       |dk  rt9        d|       	 | j=                         }|dk(  s|| j?                         |
z  }|dz  }|S d}| j?                         }|dn|}||z  |z  d	z  }||
z  }tA        ||      }|dz  }|S # t&        $ r}t(        j+                  |       Y d}~yd}~wt,        $ r}t(        j+                  |       Y d}~yd}~ww xY w# t:        $ r Y yw xY w)
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r  )!rc  r   rg   r   rJ   r4   rH   rZ   r%   IRNoder#   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupfloatr,   r+   	set_value
ValueErrorr   r  	TypeErrorrM    maybe_estimate_runtime_benchmarkmaybe_get_dtyperE   rC   AssertionErrorr   r  r  max)rq   r   r   	cache_keycache	cache_valmseretdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     r`   r  z(BaseSchedulerNode._get_estimated_runtime^  sw   
 nnq!--/2))+of-. #dii333LL I$ OI68E %Y 7I ,))U;;;((HNBz=diiHOOIRO8I7		BB TYY
 .t4?J((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.2247KKBcBI 99;*2*Y6#=%(<< }-#X	o    :  		sC   AH 6H H (>I$ 	I!H66I!II!$	I0/I0c                     y ro   r   rv   s    r`   get_template_nodez#BaseSchedulerNode.get_template_node      rb   c                .    | j                         }|J |S ro   r5  )rq   templates     r`   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throw  s!    ))+###rb   c                f    t        d t        |       D              }| d| }| |   }| |dz   d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]  \  }}|j                         s|  y wro   r  )r  iro  s      r`   r  zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s     PDAqaPs   ""Nr    )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        r`   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogue  sN     PIe,<PP.)n-!+-.00rb   )rf   re   r   r   )rg   ir.Operationr   r   r   )r   z	list[str]r   r  r.   r  r.   r   r   r  dict[str, str]r   r   )r  r-   r   r   r   )r+  r   r   r   r2  OrderedSet[str]r0  rJ  r   r   r   rL  rX  dict[str, BaseSchedulerNode]r   r   r   r  )r   zSequence[SchedulerBuffer])rx  r   r   rd   r   r  zdependencies.Depr   r   T)r  rG   r  r   r   r   r   )r  r   r  r   r   r   )r  r   r  r   r   zdict[str, int]r   z
int | None)r   r  r   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)rA  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]);r~   r   r   r   r   r   r   r   r   r   r   r
  r  r  r   r$  r(  r  r4  r6  r;  r1  r*  rU  rY  rp   r^  r@   rd  rh  rr  r'   rc  r   ry  r   r  rJ   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r5  r:  staticmethodrF  r   rb   r`   rT   rT      s}   BB(('' NN''266
34B*2+#
!.7	
M>


=#2=HV=	=
(
GT">T	T
. H H B B 
 
 
 
.&;:@F 9=-$-15-	-^ 
 

 
 

 
 


!
37
	
J!!J!37J!	J!X  - U Un
 1&1	S1 1rb   c                 R    t         j                  j                  j                         S ro   )r[   r  	codecache
LocalCacher   rb   r`   r  r    s    ??$$//11rb   c                   t        | j                  dd      }| j                  j                  }| j                  j                  g || j                  j                  | j                  j
                        }| j                  j
                  }t        j                  ||f      \  }}ddt        |ft        fd|D              z         }|S )Npython_kernel_namer   c                p    t        | t        j                        xr t        | t        j                         S ro   )rZ   r%   r  GeneratorStater  s    r`   _is_tensor_irz@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s(    !RYY'P
1b>O>O0P,PPrb   c              3  d   K   | ]'  } |      rt        |j                               nd  ) y wro   )tupler  )r  ar_  s     r`   r  z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>  s(     Ua}Q'7ajjl#TAUs   -0r   )
r   rg   inputsfill_non_provided_argsconstant_argsr   pytreetree_flattenr   ra  )snoder[  r   r   	flat_argsflat_args_pytree_specr%  r_  s          @r`   r  r    s     -A2F::D::,,*$*))*

D ZZF'-':':D&>'J$I$Q 	
U9U
U	VI rb   c                   t        | t              sy t        j                  j                  j
                  t        j                  j                  j                  t        j                  j                  j                  d}t        | j                  dd      }||vry t        | j                  t        j                        sy ||   S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr[  r   )rZ   r  r[   opsatenmmbmmaddmmr   rg   r%   ExternKernel)rh  mms_fnsr[  s      r`   _get_mm_like_fnrs    s    e67"YY^^..#iinn00 %		 4 4G
 !-A2F(ejj"//2%&&rb   c                L   	
 d 	d }t         j                  rt               }|y |	 fd}ny t               }t	               }|j                  |      }|t        |t              sJ |S ddlm	  |       \  
ddl
m}  |	
fd      }|j                  ||       |S )Nc                             S ro   r   )rh  snode_args_kwargss   r`   r   z2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    !25!9 rb   r    )rv  r   )do_benchc                       i S ro   r   )r   bench_fnr   s   r`   r   z2maybe_estimate_runtime_benchmark.<locals>.<lambda>  s    (D3F3 rb   r  )r"   !runtime_estimations_mms_benchmarkrs  r  r  r  rZ   r  utilsrv  triton.testingrw  r  )rh  args_kwargs_fnmm_fnr%  r&  r'  rw  r(  r   ry  r   rv  s   `       @@@@r`   r!  r!    s    HN//&=99%@I&(EY'I)U+++(!#LD&'	3	4B	OOIRO(Irb   c                  B    e Zd ZU g dZded<   ded<   d
dZddZddZy	)	WhyNoFusename1name2reasonr   r   r  ztuple[Any, ...]r   c                X    |j                         | _        |j                         | _        y ro   )rp   r  r  rq   node1node2s      r`   r   zWhyNoFuse.__init__  s    ^^%
^^%
rb   c                J    || _         || _        t        j                  |        y ro   )r  r   
fusion_logdebug)rq   r  r   s      r`   __call__zWhyNoFuse.__call__  s    	rb   c                p    d| j                    d| j                   d| j                  | j                  z  z   S )Nzcannot fuse z with ry   r  rv   s    r`   __str__zWhyNoFuse.__str__$  s6    djj\

|2>KK$))#
 	
rb   Nr  rT   r  rT   r   r   )r  r   r   r   r   r   r   )r~   r   r   	__slots__r   r   r  r  r   rb   r`   r  r    s#     5IK
&

rb   r  c                    t        | t        t        f      rt        | t              } t        j                  | d      }d|v rdt        j                  |d       S |S )Nkey   )r   r       )	rZ   r   setsortedr   pprintr   textwrapr   )objr   s     r`   r   r   *  sR    #
C()Sc"^^C*Fv~HOOFG4566Mrb   c                  0    e Zd ZddZddZddZd	dZeZy)
r   c                &    t        |g      | _        y ro   rI  r#  s     r`   r   zOutputNode.__init__5  s    ",cU"3rb   c                     yr  r   rv   s    r`   r  zOutputNode.is_reduction8  r  rb   c                     y)Nr   r   rv   s    r`   r   z'OutputNode.get_inputs_that_alias_output;  r   rb   c                     y)NOUTPUTr   rv   s    r`   rp   zOutputNode.get_name>  s    rb   N)r  r/   r   r   r   r   r   )r~   r   r   r   r  r   rp   r   r   rb   r`   r   r   4  s    4 Hrb   r   c                    t        j                          j                  D ]N  }t        |t              r|j
                     j                         }|   j                         xx   dz  cc<   P d fdt        fd j                  D              }|r? j                  |z
   _         j                   j                  j                  |             yy)am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r    c                    t        | t              rD| j                     j                         }|   j	                            dkD  }|   k(  }|xs |S y)Nr   F)rZ   r0   ru   rr   rp   )r  rM  is_redundantis_self_depr   name_to_dep_countrX  rg   s       r`   rN  z+_prune_redundant_deps.<locals>.should_pruneX  sb    c7#!#((+<<>G,-?-H-Q-Q-STWXXL -W5=K.;.rb   c              3  4   K   | ]  } |      s|  y wro   r   rP  s     r`   r  z(_prune_redundant_deps.<locals>.<genexpr>d  s      ,s2CrQ  NrR  )r  r   r   rZ   r0   ru   rr   rp   r   r  r   rS  )rg   rX  r   r  rM  deps_to_pruner  rN  s   ```   @@r`   rW  rW  D  s     '2&9&9&;&& K#w'!#((+<<>G09BBDEJEK

 
  .. M "&"9"9M"IT--::=IJ rb   c                  8     e Zd Zd fdZddZddZddZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y ro   superr   r   r  get_read_writesrq   rf   rg   	__class__s      r`   r   z"ExternKernelSchedulerNode.__init__n  5    #T"T1134rb   c                V    | j                          dt        | j                  dd        S )Nz.node.kernel = r[  )rp   r   rg   rv   s    r`   r   z)ExternKernelSchedulerNode.debug_str_extras  s*    --/"/'$))EY[_2`1abbrb   c                     yNTr   rv   s    r`   r  z#ExternKernelSchedulerNode.is_externv  r6  rb   c                    | j                   J t        | j                   d      xr | j                   j                         S )Nr  )rg   r   r  rv   s    r`   r  z*ExternKernelSchedulerNode.has_side_effectsy  s6    yy$$$tyy"45V$)):T:T:VVrb   rf   re   rg   rG  r   r   r   r   )r~   r   r   r   r   r  r  __classcell__r  s   @r`   r  r  m  s    5
cWrb   r  c                        e Zd Zd fdZ xZS )r  c                    t         |   |       | j                  |       | j                  |j	                                y ro   r  r  s      r`   r   zNopKernelSchedulerNode.__init__  r  rb   r  )r~   r   r   r   r  r  s   @r`   r  r  ~  s    5 5rb   r  c                  x    e Zd ZU dZded<   ded<   	 	 	 	 	 	 d fdZ	 	 d	 	 	 	 	 ddZ	 	 d	 	 	 	 	 d dZ	 	 	 	 	 	 d!d	Zd"d
Z		 	 	 	 	 	 d#dZ
d$dZ	 	 	 	 	 	 d%dZd&dZd'dZd(dZd(dZd(dZd)dZd*dZ	 	 	 	 d+dZd,dZ	 d-	 	 	 d.dZed/d       Zed/d       Zd0dZed1d       Zed( fd       Z xZS )2rm  zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr9   _bodyc                f    t         |   |       | j                  |       | j                          y ro   )r  r   r   _compute_attrsr  s      r`   r   zSchedulerNode.__init__  s,    
 	#T"rb   c                   t        | j                  t        j                  t        j                  f      sJ | j                  j                  ||      \  | _        }|| _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        t        j                   xs t        |j                          }t        | j                  t        j                        r,| j#                  | j                  j%                  |             y | j#                  t'        j$                  | j                  g| j                  d|i       y )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer  )rZ   rg   r%   ComputedBufferTemplateBuffersimplify_and_reorderr  r  get_device_or_errorrf   get_backendgroup_fnr   r"   loop_ordering_after_fusionrJ   r}   r  extract_read_writesr$   )rq   r  r  bodyr  r  should_normalizes          r`   r  zSchedulerNode._compute_attrs  s7   
 $))b&7&79J9J%KLLL II::'A&? ; 
T 
..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!23  		--8H-I   00JJ!%8Hrb   c                *    | j                  ||       y )Nr  )r  )rq   r  r  s      r`   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
rb   c                   t        d | j                  j                  D              }| j                  t	        j
                  | j                  g| j                  d|ij                  |      j                  | j                               | j                  j                  |        |r!ddlm} |j                  j!                          y y )Nc              3  N   K   | ]  }t        |t        t        f      s|  y wro   )rZ   r0   r/   r  s     r`   r  z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  s#      0
ZgwEW5XC0
s   %%r  r    SIMDScheduling)r   r   r   r  r$   r  r  r  r"  r  r   pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rq   r  need_clear_tiling_cache	fake_depsr  s        r`   refresh_dependenciesz"SchedulerNode.refresh_dependencies  s    
 &0 0
++110
 &
	 	,,

![[4= Yy!VD))*	
 	""..t4"4 ,,88: #rb   c                    | j                   j                  |      | _         | j                   j                  | _        | j	                  dd       y )NFTr  r  )r  reorder_iter_loopssizesr  r  )rq   	new_orders     r`   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order  sA    ZZ22

 jj&&!!E4!Prb   c                   t        | j                  t        j                  t        j                  f      sJ | j
                  j                  ||      | _        | j
                  j                  | _        | j                  j                         }| j                  j                  |      j                  }| || j                        f| _        | j                  dd       y )NTr  )rZ   rg   r%   r  r  r  #expand_dimension_for_pointwise_noder  r  r  rf   r  r  r   r  )rq   	dimension	new_ranger  r  s        r`   r  z1SchedulerNode.expand_dimension_for_pointwise_node  s     $))b&7&79J9J%KLLLZZCCy

 jj&&..0>>--f5>>ht{{34
 	!!D$!Orb   c                    | j                   j                         | _         | j                   j                  | _        | j	                  dd       y )NTFr  )r  merge_loopsr  r  r  rv   s    r`   r  zSchedulerNode.merge_loops  s<    ZZ++-
jj&& 	!!D%!Prb   c                   d }| j                   d   }t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|rPt        xj
                  dz  c_        t        j                  d| j                         |       | j                  |       yt        j                  d| j                                y)Nr   r    z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
r  r   num_varsdecide_loop_order_to_matchr&   num_loop_reorderingloop_ordering_logr  rp   r  )rq   r  r  r  
self_sizess        r`   r  z'SchedulerNode.reorder_loops_by_dep_pair  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##W rb   c                $   | j                         }| d| j                  d    | d| j                  d    | d| j                   g}| j                  j	                         D ]  }t        |t              r|j                  }t        j                  j                  |      }t        |t        j                        rZ|j                  | dt        |j                                 t        | j                   t"              rR|j                  d| d       |j                  t%        j&                  | j                   j)                         d	             | j*                  J |j-                  | j/                                d
j1                  |      S )Nz.group.device = r   z.group.iteration = r    z	.sizes = z
_layout = zclass z_loop_body:r  r   )rp   r   r  r   r  rZ   r0   ru   rP   r   r  r%   r  r  r   r   r  r9   r  r   r   rg   rC  r   join)rq   ru   linesr  rx  r   s         r`   r   zSchedulerNode.debug_str_extra  sK   }}f$TZZ]O4f'

17fIdkk]+

 ##446 	OCc7+88gg((2!#r'9'9:LLH:Z

8K7L!MN	O djj(+LL6${34LL)=)=)?HIyy$$$T//12yyrb   c                    | j                   S ro   )r  rv   s    r`   r  zSchedulerNode.get_ranges1      {{rb   c                    t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  j                               S Nztype(self.node)=)rZ   rg   r%   r  r  r}   r   r  rv   s    r`   r  zSchedulerNode.is_reduction4  s[    $))b&7&79J9J%KL 	
tDII !	
L DII00233rb   c                L   t        | j                  t        j                  t        j                  f      sJ dt        | j                               t        | j                  t        j                        xr. t        | j                  j                  t        j                        S r  )rZ   rg   r%   r  r  r}   r   	SplitScanrv   s    r`   r  zSchedulerNode.is_split_scan:  sy    $))b&7&79J9J%KL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
rb   c                J    t        | j                  t        j                        S ro   rZ   rg   r%   r  rv   s    r`   r  zSchedulerNode.is_templateB  s    $))R%6%677rb   c                f    t        | j                  t        j                        r| j                  S d S ro   r  rv   s    r`   r5  zSchedulerNode.get_template_nodeE  s$    &tyy"2C2CDtyyN$Nrb   c                f    | j                          | j                          | j                  |       y ro   )r  r6  r  )rq   
index_varss     r`   runzSchedulerNode.runH  s#    ""$Z rb   c                &   | j                   }t        t        t        |            t        t        t        |            k(  sJ t	        t        t        j                  j                  |      t        j                  j                  |                  }|S ro   )	r  r  mapr   dictzipr9  r:  from_iterable)rq   r  r  
var_rangess       r`   ranges_from_index_varsz$SchedulerNode.ranges_from_index_varsM  sp     3sE?#s3sJ+?'@@@@--j9--e4

 rb   c                   | j                  |      }	 t        j                  t        t        j                         |            5  t        j
                  j                  |       5   | j                  |  ddd       ddd       y# 1 sw Y   xY w# 1 sw Y   yxY w# t        $ r" t        j                  d| j                          w xY w)a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r  rP   set_ops_handlerr>   get_ops_handlerr   set_current_noder  r   r   fatalrg   )rq   r  r  s      r`   r  zSchedulerNode.codegenZ  s     00<
	!!"213D3D3F
"ST())$/( 

J'	( ( ( ( ( (
  	II/;	sA   1B  B$B4B<B B	
BBB B +Cc                    |r| j                   nt        | j                         \  }}t        j                  | j                  |t
        j                  j                  gt        |      z  g      S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	r  reversedr$   r  r  sympySZeror   )rq   	pointwise
keep_sizesignore_sizess       r`   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writesq  sT     3<4;;$++AV 
L//JJ
%'',,#lBS1S0T
 	
rb   c                &    | j                  d      S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  rv   s    r`   r  z#SchedulerNode.pointwise_read_writes|  s    
 666FFrb   c                &    | j                  d      S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  rv   s    r`   reduction_read_writesz#SchedulerNode.reduction_read_writes  s    
 666GGrb   c                   | j                         ryt        d | j                         D              ryt        | j                  j
                        dk(  rt        |t        j                        rt        t        | j                  j
                              }t        |t        j                        sJ dt        |             |j                  |j                  k(  xr |j                  |j                  k(  S y)NFc              3  <   K   | ]  }|j                           y wro   )r   rg  s     r`   r  z,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?Ss ?rb  r    ztype(write_dep)=)r  r'  r   r   r   r   rZ   r$   r.   r?  iterr}   indexsize)rq   r  	write_deps      r`   r  zSchedulerNode.can_inplace  s    ?D,<,<,>??t&&'1,l,,2
 T$"2"2"9"9:;Ii)?)?@WEUT)_DVBWW@>>Y__4X)..9XXrb   c                   t               }t        | j                  t              r| j                  j	                         D ]  }|j
                  dk(  s|j                  dk(  s#d|j                  v r|j                  d   dk(  s,t        |j                        dk(  s\|j                  d   dk(  so|j                  d|j                  v r|j                  d   n(t        |j                        dk\  r|j                  d	   nd
        |S )Ncall_methodstoremode
atomic_add   r  ru      r    r   )r   rZ   r  r9   rc  r^   r  r   r   r   rA  )rq   buffers_store_as_atomic_addrg   s      r`   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(+

,,. GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr +*rb   c                p    | j                   | j                   j                  d      ryt        |          S )Ndevice_assert_asyncT)r  has_opr  r  rq   r  s    r`   r  zSchedulerNode.has_side_effects  s2     ::!djj&7&78M&Nw'))rb   )rf   re   rg   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)r  *Optional[tuple[dict[Any, Any], list[Any]]]r  zOptional[Callable[_P, _T]]r   r   )r  r+  r  zOptional[Callable[..., Any]]r   r   )r  r   r  r   r   r   )r  Sequence[int]r   r   )r  r   r  r   r   r   r   rH  r   )r   Sequence[Sequence[sympy.Expr]]r   rT  )r  Sequence[sympy.Expr]r   r   )r  r-  r   zdict[sympy.Expr, sympy.Expr])r  r-  r   r   rR  )r  r   r   r   )r   r   rQ  rM  )r~   r   r   __doc__r   r   r  r  r  r  r  r  r  r   r  r  r  r  r5  r  r  r  r  r@   r  r  r  r%  r  r  r  s   @r`   rm  rm    s   
 -,O : 
	 RV@D$N $> 
	F RVBF
$N
 $@
 
	
;;8<;	;<QPP),P	P"
Q!.7	, ,4
8O!
8	%0 !%	
	
	 	
 G G H H + +& * *rb   rm  c           	     n     j                   } j                  t        j                  j	                  |D cg c]  }|j
                   c}             t         fdt        j                  |D cg c]  }|j                   c} D               j
                  j                  z
   _        y c c}w c c}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wro   ru   rh  )r  r  group_snodes     r`   r  z2refresh_group_node_dependencies.<locals>.<genexpr>  s.      
xx{;;== 
   (+)
r  r  r$   
ReadWrites
merge_listr   r   unionr   r   )r3  r  r  s   `  r`   refresh_group_node_dependenciesr8    s     F**6+JaAMM+JK
 	 
!'')O1!*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B-0B2re   c                   t        | t        t        f      sJ || _        || _        d | _        t        j                  |D cg c]  }|j                  |j                   c} | _        t        |        t        d | j                  D              | _        t        d | j                  D              | _        | j                         D ci c]  }|j                         | c}| _        y c c}w c c}w )Nc              3  4   K   | ]  }|j                     y wro   r   r  r  s     r`   r  z"init_group_node.<locals>.<genexpr>       HHr  c              3  4   K   | ]  }|j                     y wro   )r   r<  s     r`   r  z"init_group_node.<locals>.<genexpr>  r=  r  )rZ   r  GroupedSchedulerNoder  rf   rg   r   r7  r   r8  r  r   r$  r   r   rp   r   )r3  rf   r  r  r   s        r`   init_group_noder@    s    
 k$68L#MNNNK%KK&,,%	A!)@!++	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@# ##K 
B#s   C*C*	C/c                      e Zd ZU dZded<   e	 	 	 	 	 	 dd       Zedd       Z	 	 	 	 	 	 d dZ	d! fdZ
ed"d       Zd"d	Zed#d
       Zd$dZd"dZd"dZ	 	 	 	 	 	 d% fdZed#d       Zed#d       Zd&dZd"dZed'd       Zed'd       Zed'd       Zed(d       Zd)dZed'd       Zd*dZd+dZd,dZd"dZed' fd       Z  xZ!S )-r  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    rU  r  c           	        |j                   |j                   u sJ t        |t        t        f      sJ |j	                         rt        |t
              rt        |j                  t              sJ t        |j                  j                        dk(  sJ t        t        t        |j                  j                              t              sJ t        t        |j                  j                              j                  }|j                         D cg c]  }|j	                         s| }}t        |      dk(  sJ |d   }t        |j                  j                        dk(  sJ t        t        |j                  j                              }t        |t               sJ t#        t!        ||j$                  |j&                  |j(                  |j*                        g      |j                  _
        nt        |t        t        f      sJ t-        t/        j0                  |j                         |j                                     } | |j                   |      S c c}w )Nr    r   )rf   rZ   rm  r  r  r  rg   r6   r   r   r   r?  r  r/   ru   rc  r.   r   r  	var_namesr  r   r   r9  r:  )	clsr  r  ru   rg   template_nodesrD  writerA  s	            r`   fusezFusedSchedulerNode.fuse  s    %//111%-1C!DEEE:e5N#O ejj+666u((//0A555d4(9(9(@(@#ABGLLLU..5567<<D/4/@WtDDTDTDVdWNW~&!+++*1-M}00778A===m77>>?@EeY///'1ekk5??EJJ

(E$ em5G%HIIIY__U__%68IJK5??E**! Xs   I'Ic                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wro   r  r  r  ra  s     r`   r  z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  6      '')T^^-= '')   :<r   r   filterrc  r   r  rq   fpsr*  s      r`   r  z!FusedSchedulerNode.estimate_flops  K      $ 0	
 s8q=#h
rb   c                   | j                         ryd}| j                  D ]`  }t        |t              sJ |;t	        |      t	        |j
                  d         k7  rt        j                  d        y|j
                  d   }b d}|J t        |      |j                  cxk(  r|j                  k(  rn n|j                  |      }|s%t        j                  d| j                                yt        xj                  dz  c_        t        j                  d| j                         |       | j                  D ]%  }t        |t              sJ |j                  |       ' t        |        y)	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr    z-Reorder loops for fused node %s with order %sT)r  r  rZ   rm  ra  r  r  r  r   r  r  rp   r&   r  r  r8  )rq   r  r  r  rh  r  s         r`   r  z,FusedSchedulerNode.reorder_loops_by_dep_pair  sL    
[[ 	)Ee]333%%
*;uU\\RS_?U*U!''G aJ	) 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[ 	2Ee]333&&y1	2 	(-rb   c                    t         |   |       t        | ||       g | _        t	        |d       j
                  | _        y )Nc                4    t        | j                               S ro   )r   r  r^  s    r`   r   z-FusedSchedulerNode.__init__.<locals>.<lambda>E  s    s1>>3C/D rb   r  )r  r   r@  rl   r$  r   )rq   rf   r  r  s      r`   r   zFusedSchedulerNode.__init__A  s8    #i0%'
%DEKK
rb   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w N_r  r  rp   rq   r  s     r`   rp   zFusedSchedulerNode.get_nameG  )    xxt{{;!;<<;   8c                <    | j                   d   j                         S Nr   r  rp   rv   s    r`   r^  z!FusedSchedulerNode.get_first_nameK      {{1~&&((rb   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w ro   r   r7  r  rh  rY  s     r`   rh  z#FusedSchedulerNode.get_buffer_namesN  .    !L1!"4"4"6!LMM!L   9c                j    g }| j                   D ]!  }|j                  |j                                # |S ro   r  rC  r   rq   r   rg   s      r`   r   zFusedSchedulerNode.get_outputsR  4    (*KK 	.DMM$**,-	.rb   c           
     ~   t        | j                        D cg c]+  \  }}| j                          d| d|j                          - }}}| j                  d   j                  }||j                  | j                                t        j                  dj                  |      j                         d      S c c}}w )Nz.snodes[z] =
r   r   r  )r@  r  rp   r   rg   rC  r   r  r   r  r   )rq   r>  rg   r  s       r`   r   z"FusedSchedulerNode.debug_str_extraX  s     %T[[1
4 }}xs%0@/AB
 
 {{1~""LL3356tyy/668&AA
s   0B9c                h    | j                   D cg c]  }|j                          }}|  d| S c c}w )Nz
, snodes: )r  r
  )rq   rg   
snodes_strs      r`   r
  z"FusedSchedulerNode.debug_str_shortc  s9    9=Ed**,E
Ez*.. Fs   /c                    t         |   ||       t               }t        | j                        D ]/  }|j                  ||       |j                  |j                         1 y ro   )r  r4  r   r
  r  updater   )rq   r2  r0  rg   r  s       r`   r4  z!FusedSchedulerNode.set_last_usageg  s\    
 	24FG 0:|T[[) 	8D 35GH&&t7	8rb   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w ro   )r   r7  r  r;  rY  s     r`   r;  z$FusedSchedulerNode.used_buffer_namest  s.    !MA!"5"5"7!MNN!Mrc  c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w ro   )r   r7  r  r1  rY  s     r`   r1  z/FusedSchedulerNode.used_or_aliased_buffer_namesx  s3    8<D1a,,.D
 	
Drc  c                    | j                   S ro   r  rv   s    r`   rc  zFusedSchedulerNode.get_nodes~  r  rb   c                T    t        |       j                   d| j                          dS )Nz(nodes=r   r   rv   s    r`   r   zFusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@rb   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wro   )r  r<  s     r`   r  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     91>>#9rb  r'  r  rv   s    r`   r  zFusedSchedulerNode.is_reduction  s    9T[[999rb   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wro   )r  r<  s     r`   r  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :1??$:rb  rs  rv   s    r`   r  z FusedSchedulerNode.is_split_scan  s    :dkk:::rb   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wro   r=  r<  s     r`   r  z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8q1==?8rb  rs  rv   s    r`   r  zFusedSchedulerNode.is_template  s    8DKK888rb   c                j    | j                   D ]$  }|j                         s|j                         c S  y ro   )r  r  r5  rq   rg   s     r`   r5  z$FusedSchedulerNode.get_template_node  s5    KK 	0D!--//	0 rb   c                     | j                   d   S r]  )r   rv   s    r`   r   zFusedSchedulerNode.get_device  s    zz!}rb   c                :    t        d | j                  D              S )Nc              3  <   K   | ]  }|j                           y wro   )r(  r<  s     r`   r  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA1--/Erb  rs  rv   s    r`   r(  z+FusedSchedulerNode.has_aliasing_or_mutation  s    EEEErb   c                    t         ro   NotImplementedError)rq   r  s     r`   r   z'FusedSchedulerNode.update_mutated_names      !!rb   c                    t         ro   r~  )rq   ru   s     r`   r$  zFusedSchedulerNode.add_fake_dep  r  rb   c                    t         ro   r~  r  s     r`   r  zFusedSchedulerNode.can_inplace  r  rb   c                P   | j                         }dj                  d | j                  D              }t               }|j	                  | dt        |       j                   d| d| dt        | j                  j                         d| dt        | j                         d| d	t        | j                  j                  | j                  z
         d| d
       |j                         5  | j                         D ]!  }|j	                  |j                                # 	 ddd       |j                  d       	 |j	                  | j!                                |j)                         j+                         S # 1 sw Y   XxY w# t"        $ r t$        j'                  dd       Y Lw xY w)r   rz   c              3  F   K   | ]  }t        |      j                    y wro   )r}   r~   rn  s     r`   r  z/FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     FQQ 0 0Fs   !ry   r   r   r   r   r   r   z.outputs = [
            Nr{   r   Tr   )rp   r  r  rG   r   r}   r~   r   r   r   r   r   r   r   r   r|   r   r   r   r   r   r   )rq   ru   node_typestrr   r   s        r`   r   zFusedSchedulerNode.debug_str  s   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\ 	,'') ,

3==?+,	, 	c	HJJt++-.  ''))	, 	,  	HKK7$KG	Hs   )5E69F 6E? F%$F%c                p    | j                   t        d | j                   D              S t        |          S )Nc              3  <   K   | ]  }|j                           y wro   )r  ra  s     r`   r  z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>  s     G4t,,.Grb  )r  r'  r  r  r)  s    r`   r  z#FusedSchedulerNode.has_side_effects  s0    ;;"G4;;GGGw'))rb   r  rT   r  rT   r   r  rS  rH  )rf   re   r  rU  r   r   r   rM  r   zlist[SchedulerBuffer]rK  rP  r   rT  )r   torch.devicerI  )ru   r-   r   r   rQ  )"r~   r   r   r/  r   classmethodrG  r@   r  r  r   rp   r^  rh  r   r   r
  r4  r;  r1  rc  r   r  r  r  r5  r   r(  r   r$  r  r   r  r  r  s   @r`   r  r    s    $#+%+.?+	+ +B  "'!'.7'	'RL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""*4 * *rb   r  c                  N    e Zd ZU dZ	 	 	 	 ddZ	 	 	 	 ddZedd       Ze	 	 	 	 	 	 dd       Z	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	e	 	 	 	 dd       Z
e	 	 	 	 dd       ZeZd	ed
<   e	 	 	 	 dd       Ze	 	 	 	 dd       ZddZddZddZddZd dZd!dZ	 	 	 	 d"dZ xZS )#ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    c                    |j                         D ]=  }|j                         | j                  v s | j                  |j                            c S  y ro   )r   rp   read_to_node)rq   producerr   s      r`   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  sL     '') 	9C||~!2!22((88	9 rb   c                   t        t                  }|j                  j                  D ]  }|j                  | j
                  j                  vr&| j
                  j                  |j                     j                         }|| j                  v sf|j                  | j                  |           t        |      dk(  rt        t        |            S y Nr    )r   rT   r   r   ru   rf   r   rr   name_to_noderA  r   r?  r  )rq   consumer	producersrd	node_names        r`   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,, 	<Bwwdnn88822277;LLNID---d//	:;	< y>QY((rb   c                   t        |      }j                         r|j                         rt        j                  t              t        j                  t        |      }t        j                        t        |j                        k(  }|s |d       |xr2 t        fdt        j                  |j                        D              S |j                         rkj                         r	 |d       yt        j                  t        |      }|j                        }||j                  j                  |      S  |d       yj                         rk|j                         r	 |d       yt        j                  t              j                  |      }|j                  j                  ||      S  |d       yt        d      )	Nzforeach do not have same lengthc              3  \   K   | ]#  \  }}j                   j                  ||       % y wro   )rf   can_fuse)r  lrr  s      r`   r  z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s0      )Aq ""++Aq1)s   ),zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  typingcastr  r   r  rq  r  r  r  rf   r  r  r#  )rD  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     r`   r  z#ForeachKernelSchedulerNode.can_fuse  s   (+ X%8%8%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    "$$&n {{#=xHH'@@J+))228=MNNGH  "$$&n {{#=xHH'@@J+))223CXNNGHf
 	
rb   c                
   |j                         s|j                         sJ |j                         r3t        j                  t        |      }|j                  }|j
                  }n2t        j                  t        |      }|j                  }|j
                  }d }d }|j                         r|j                         r|t        j                  t        |      }t        j                  t        |      }t        |j                  |j                        D cg c]  \  }}t        j                  ||       }	}}n/|j                         rt        j                  t        |      }|j                  |      }
g }	|}d }|j                  D ]A  }||
u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C n|j                         rt        j                  t        |      }|j                  |      }g }	|}d }|j                  D ]A  }||u r*t        j                  ||      }|}|	j                  |       1|	j                  |       C nt        d       | |j                  |	||||      S c c}}w )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r  r  r  r  r  rG  r  r  r  r#  rf   )rD  r  r  r  r  r  r  r  r  fused_nodesr  rg   new_noder  s                 r`   rG  zForeachKernelSchedulerNode.fuse  sZ    ""$(;(;(=== {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O X%8%8%:{{#=xHH{{#=xHH  AAq #''1-K    "{{#=xHH'@@JK"KK  -++166tXFH"*K&&x0&&t,-   "{{#=xHH'@@JK"KK  -++166xFH"*K&&x0&&t,- !f  &?##+
 	
Ks    I?c                    i  _         i  _        ||qt           ||       |D ]Z  }|j                  j
                  D ]  }| j                   |j                  <    |j                         D ]  }	| j                  |	<    \ n| _        | _	        d  _
        g  _         j                  t        j                  j                  |j                  |j                  g             t!         fdt!        j"                  |j$                  |j$                        D               j                  j&                  z
   _        t)        |j*                  |j*                  g       _        t-        |j.                  |j.                  g       _        |j1                         rt3        |t4              sJ ||}}
nt3        |t4              sJ ||}}
|
j6                   _         j6                  j9                  |j6                         |
j                   _        |j                         D ]  }	| j                  |	<     j                  D ci c]'  }|j:                  j=                         D ]  \  }}||
 ) c}}} _        | _        |d   jA                         }|sJ |tC        jD                  d      fff _#        t!        tH        jJ                  jL                             _'        | _(        y c c}}}w )Nc              3  Z   K   | ]"  }|j                   j                         vr| $ y wro   r2  rH  s     r`   r  z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>~  s0       xxt'<'<'>>	 r4  r   combo_kernel))r  r  r  r   r   r   ru   rd  rf   r  rg   rl   r  r$   r5  r6  r   r7  r   r   r  r   r$  r   r  rZ   r  r   rl  r   itemsr  r   r  Exprr   r[   fxNoder  r  )rq   rf   r  r  r  r  r  rg   r  ru   foreach_node
other_noderh  r/  vr  r  s   `               r`   r   z#ForeachKernelSchedulerNode.__init__\  s    +"5GY/ 3 ,,22 8D37D%%dii08 !446 3D.2D%%d+3	3 'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%'!+/IJJJ+6j!+/IJJJ+6j)33DNNN!!*"6"67 , 9 9D"668 5*4!!$'5 #'++@ @%:O:O:U:U:W@26!Q1@@D  *C&%%'v

> :<>?
!%((--02.@s   ,K&c           	        |D cg c]  }t        |t              s| }}|rSt        j                  dt	        |      |D cg c])  }|j
                  |j
                  j                         + c}       |D cg c]  }t        |t        t        f      s| }}|D cg c]  }t        |t              s| }}|rt        j                  dt	        |             |D cg c]  }t        |t              r| }}|D cg c]  }|j                         s| }}|r t        j                  dt	        |      |       |D cg c]	  }||vs| }}|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %s)
rZ   r  r   r  r   rg   r  r  r  r  )rD  rA  r  externrg   filtered_nodesforeach_nodesrE  s           r`   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes  st    #Oj4M&N!OOIIAF5;UTtyy?T&&(U 
a"8:S!TU 
 
 &
A7Q)RA
 
 IICSEWX%
Z;U-VA
 
 &4Gq}}!GGIIBN#
 &4Oq7N!OO9 P
 V




 H PsL   EEEE:EE#5E# E(6E( E-E-	E2E2c           
         | j                         }g }d}|D ];  }|j                  t        dt        |      |      D cg c]
  }||||z     c}       = |S c c}w )zS
        Returns a list of lists of nodes that are to be grouped together.
           r   )_topological_sort_nodesrC  ranger   )rf   sorted_nodesgrouped_nodesmax_num_nodesrA  r>  s         r`   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  sw     !88:! 	E   #1c%j-@ !a-/0	 s   A
4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    | t         _        y ro   r  r  )custom_group_algorithms    r`   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#Drb   c                ,    t         j                  |       S ro   r  rf   s    r`   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVrb   c                    t         ro   r~  rv   s    r`   r6  z#ForeachKernelSchedulerNode.mark_run  r  rb   c                    t         ro   r~  rv   s    r`   r  z"ForeachKernelSchedulerNode.codegen  r  rb   c                     yr  r   rv   s    r`   r  z%ForeachKernelSchedulerNode.is_foreach  r6  rb   c                ,    t        | j                        S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  rv   s    r`   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DKK  rb   c                t    t        t        j                  j                  d | j                  D                    S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  <   K   | ]  }|j                           y wro   )rc  r<  s     r`   r  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s     1UA!++-1Urb  )r   r9  r:  r   r  rv   s    r`   rc  z$ForeachKernelSchedulerNode.get_nodes  s(     IOO111U1UUVVrb   c                <    | j                   d   j                         S r]  )r  r^  rv   s    r`   r^  z)ForeachKernelSchedulerNode.get_first_name  s    {{1~,,..rb   c                    t        | || j                  j                         | j                  D ]  }|j	                  |        y ro   )rW  rf   r   r  rY  )rq   rX  rg   s      r`   rY  z/ForeachKernelSchedulerNode.prune_redundant_deps  s=     	d$68R8RSKK 	:D%%&89	:rb   )r  rT   r   rh   )r  rT   r   rh   r  rT   r  rT   r   r   )r  rT   r  rT   r   r  )NNF)rf   re   r  rU  r  r   r  rh   r  rh   r  r   r   r   rA  rU  r   rU  )rf   re   r   list[list[BaseSchedulerNode]])r  r  r   r   r   r   r   rU  rP  r   rN  )r~   r   r   r/  r  r  r  r  rG  r   r  rV  r  r  r   r  r  r6  r  r  r  rc  r^  rY  r  r  s   @r`   r  r    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/P +	  B 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	:rb   r  c                       e Zd ZU dZded<   edd       Z	 d	 	 	 	 	 	 	 d fdZddZddZ	e
dd       Zdd	Ze
dd
       ZddZe
dd       ZddZedd       Z xZS )r?  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    rU  r  c                    |d   j                   t        fd|D              sJ  | |      }|D ]  }|j                  |j                         <   ! |j                  |j                         <   |S )Nr   c              3  :   K   | ]  }|j                   u   y wro   r  )r  rg   rf   s     r`   r  z.GroupedSchedulerNode.create.<locals>.<genexpr>  s     B44>>Y.B   )rf   rq  rX  rp   )rD  r  grouped_snoderh  rf   s       @r`   createzGroupedSchedulerNode.create  sy    1I''	B6BBBBIv. 	KE=JI(()9:	KAN	$$]%;%;%=>rb   c                L    t         |   |       t        | ||       || _        y ro   )r  r   r@  temp_grouping)rq   rf   r  r  r  s       r`   r   zGroupedSchedulerNode.__init__!  s(     	#i0 +rb   c                6   | j                   r| j                  S | j                  D ])  }|| j                  j                  |j	                         <   + | j                  j                  | j	                         = | j                  j                  | j                        S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r  rf   rX  rp   
fuse_nodes)rq   rh  s     r`   unpackzGroupedSchedulerNode.unpack0  sx    
 ;;[[ 	HEBGDNN--enn.>?	HNN--dmmo>~~((55rb   c                    | j                  | j                  j                  |             | j                  j	                  |       y ro   )r  r   r"  r   rA  )rq   fake_deps     r`   r$  z!GroupedSchedulerNode.add_fake_dep=  s5    T--77AB##H-rb   c                z    dj                  | j                  D cg c]  }|j                          c}      S c c}w rV  rX  rY  s     r`   rp   zGroupedSchedulerNode.get_nameA  rZ  r[  c                <    | j                   d   j                         S r]  r^  rv   s    r`   r^  z#GroupedSchedulerNode.get_first_nameE  r_  rb   c                |    t        j                  | j                  D cg c]  }|j                          c} S c c}w ro   ra  rY  s     r`   rh  z%GroupedSchedulerNode.get_buffer_namesH  rb  rc  c                j    g }| j                   D ]!  }|j                  |j                                # |S ro   re  rf  s      r`   r   z GroupedSchedulerNode.get_outputsL  rg  rb   c                    t        t        d d | j                         D                    }t        |      dk(  ry t	        |      }|S )Nc              3  |   K   | ]4  }|j                         s|j                         r|j                          6 y wro   rJ  ra  s     r`   r  z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>X  rK  rL  r   rM  rO  s      r`   r  z#GroupedSchedulerNode.estimate_flopsR  rQ  rb   c                    | j                   S ro   r  rv   s    r`   rc  zGroupedSchedulerNode.get_nodesd  r  rb   c                     yr  r   )rD  r  r  s      r`   r  zGroupedSchedulerNode.can_fuseg  r  rb   )r  rU  r   r?  F)rf   re   r  rU  r  r   r   r   r  )r  r-   r   r   r   rM  r  rS  rP  r  )r~   r   r   r/  r   r  r  r   r  r$  r@   rp   r^  rh  r   r  rc  r  r  r  s   @r`   r?  r?    s     $#  $	++ (+ 	+
 
+6. = =) N N  "  rb   r?  c           
          t         j                  d fd       }t        t        t	        t         d                           }t        |      dkD  r|D cg c]  } |   	 c} t        j                  r|j                  |       |S c c}w )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    c                t   |    dk(  s|   dk(  rt        |    dk(  |   dk(        S D cg c]  }t        ||           }}D cg c]  }t        ||          }}t        d t        ||      D              }t        d t        ||      D              }||kD  ry||kD  ryt        ||       S c c}w c c}w )Nr    c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr   Nr   r  sl_asl_bs      r`   r  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  )      
)3tDAI$$
   c              3  :   K   | ]  \  }}|d k(  xs ||k    ywr  r   r  s      r`   r  z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  r  r  r  )rA   absr  r  )	rb  bslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          r`   	index_cmpz"pick_loop_order.<locals>.index_cmpw  s    8q=E!HMuQx1}eAh!m44 .<<rBqE
<<-;<rBqE
<<  
7:<7V
 
  
7:<7V
 
 WW 1ay# =<s   B0	B5r   r  )rb  r   r  r   r   r   )		functools
cmp_to_keyr   r
  r  r   r"   pick_loop_orderssort)r   r  priority_idxr  orderpis   ``    r`   pick_loop_orderr	  m  s      4 %N1$5 6789E
<17CD.,D

y
!L Es   Bc                  T    e Zd ZU ded<   dZded<   dZded<   ddZddZdd	Zdd
Z	y)NodeUser$Union[BaseSchedulerNode, OutputNode]rg   Fr   r  is_weakc                v    t        | j                  j                         | j                  | j                  f      S ro   )rt   rg   rp   r  r  rv   s    r`   rw   zNodeUser.__hash__  s+    TYY'')4+;+;T\\JKKrb   c                    t        |t              xrW | j                         |j                         k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S ro   )rZ   r  rp   r  r  rq   others     r`   __eq__zNodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
rb   c                6    | j                   j                         S ro   r   rv   s    r`   rp   zNodeUser.get_name  r   rb   c                    | j                   |j                   u sJ t        | j                   | j                  xr |j                  | j                  xr |j                        S ro   )rg   r  r  r  r  s     r`   r   zNodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
rb   Nr   )r  objectr   r   r   )r  r  r   r  )
r~   r   r   r   r  r  rw   r  rp   r   r   rb   r`   r  r    s3    
..K GTL
$
rb   r  c                 "    t         j                  S ro   )r"   rz  r   rb   r`   *used_non_deterministic_runtime_estimationsr    s    333rb   c                      e Zd ZdZdMdZdM fdZdNdZedOd       Zej                  dPd       ZdQdZ
dRdZdSd	ZdQd
ZdQdZdQdZdQdZ	 	 	 	 dTdZdUdZdVdZdQdZdQdZdTdZdQdZ	 	 	 	 dWdZ	 dX	 	 	 	 	 	 	 dYdZ	 	 	 	 	 	 dZdZdQdZd[dZ	 	 	 	 	 	 d\dZd]dZ	 	 	 	 dTdZdXd^dZ d_dZ!	 	 	 	 d`dZ"	 	 	 	 	 	 dad Z#	 	 	 	 	 	 dad!Z$	 	 	 	 	 	 	 	 dbd"Z%	 	 	 	 	 	 dad#Z&	 	 	 	 	 	 	 	 dcd$Z'	 	 	 	 	 	 ddd%Z(ded&Z)	 	 	 	 	 	 	 	 dfd'Z*	 	 	 	 	 	 dgd(Z+dad)Z,	 	 	 	 	 	 dad*Z-	 	 	 	 	 	 	 	 dhd+Z.did,Z/djd-Z0	 	 	 	 	 	 ddd.Z1	 	 	 	 dkd/Z2	 	 	 	 dld0Z3dQd1Z4dQd2Z5dQd3Z6dmd4Z7dnd5Z8dod6Z9dpd7Z:	 	 	 	 	 	 dqd8Z;	 dr	 	 	 	 	 dsd9Z<	 	 dtd:Z=	 	 	 	 dud;Z>	 	 	 	 	 	 dvd<Z?	 	 	 	 	 	 dwd=Z@	 	 	 	 dxd>ZA	 	 	 	 dTd?ZB	 	 	 	 dTd@ZC	 	 	 	 dTdAZD	 	 dydBZEdQdCZF	 	 	 	 	 	 dzdDZG	 	 	 	 	 	 d{dEZH	 	 	 	 	 	 d|dFZIdQdGZJd_dHZK	 	 	 	 d}dIZLd~dJZMddKZNdQdLZO xZPS )re   z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    c                f    t        d      5  | j                  |       d d d        y # 1 sw Y   y xY w)NzScheduler.__init__)r   _initrq   rA  s     r`   r   zScheduler.__init__  s,    ./ 	JJu	 	 	s   '0c           
         t                     t        j                  _        i  _        t        t               _        t        j                          _        t                _        t        g t        j                  j                  j                         t        j                  j                   j                         t        j                  j"                  j                                _        |D cg c]  } j'                  |       c} _        d  _         j-                           j$                  j/                  t        j                  j                   j                                 j(                  D ]  }|j1                           d  _         j5                          _         j(                  D ci c]  }|j9                         | c} _         j(                  D ci c](  }|j=                         D ]  }|j9                         | * c}} _         j:                  jA                          _!        i  _"        i  _#        tI        jJ                   j(                   j>                   jB                         _         jM                           jO                   j(                         _         jQ                           j(                  D ci c]  }|j9                         | c} _!         jS                          tT        xjV                  tY         j(                        z  c_+        ddl-m.}m/}  | j(                         tY         j(                         _0         jc                           jO                   j(                         _        t        td        tf        tf        f              _4        tj        jl                  $tk        jl                   j(                         _         jo                   j(                         _        tj        jp                  $tk        jp                   j(                         _         js                           ju                          tj        jv                  r)ty        ddd      5   j{                  d        d d d        tj        j|                  rddl?m>}  | j(                   j>                   jB                  t        t        j                  j                  j                               t        t        j                  j                                      _        tj        j                  rtj        j|                  s#dd	l?mB}  | j(                   j>                         t               r(t        j                  rdd
l$mF}	  |	 j(                         ddlGmH}
  |
dd  fd       tI        j                   j(                         _         j                          t        j                  jj                  j                  rnt        j                  jj                  j                  j                  r@ j                   j(                         _         j                   j(                         _         j                          t        j                  jj                  j                  j                  r j                           | j(                         t        j                  j                   j(                          j                          t                _Y        i  _Z        t        d      j                   fd       y c c}w c c}w c c}}w c c}w # 1 sw Y   xY w)Nr   )log_ir_post_fusionlog_ir_pre_fusionz#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodesr    )reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffers)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     dddS )N#scheduler_nodes_before_comm_overlapstring)ru   encodingr   r   rb   r`   r   z!Scheduler._init.<locals>.<lambda>L	  s    A (% rb   c            
         dj                  t        j                        D  cg c]0  \  } }d|  d|j                         z   d|j	                          z   2 c}}       S c c}} w )Nz

zsnode[r{   z buffer_names:)r  r@  rA  r   rh  )r>  ro  rq   s     r`   r   z!Scheduler._init.<locals>.<lambda>P	  sl    6;;
 %.djj$9	 !Aq !1++-(*1+=+=+?*@AB$ s   5A"
)metadata_fn
payload_fngraph_statsc                 ^     j                    j                  t         j                        dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   rA  rv   s   r`   r   z!Scheduler._init.<locals>.<lambda>t	  s'     33+/+>+>*-djj/ rb   )]r  r   rP   r   rf   backendsr?  _post_grad_graph_counterr4  r9  count_graph_partition_counterr   r  r  keys	constantstorchbind_constantsrG  create_scheduler_noderA  current_nodeupdate_zero_dim_cpu_tensorrl  r*  default_device_contextget_donated_buffersr   rp   r  r   r   copyrX  r0  r   r!   decide_global_ordering_of_commsrS   topological_sort_scheduledead_node_eliminationcompute_ancestorsr&   ir_nodes_pre_fusionr   torch._inductor.debugr  r  r5  create_foreach_nodesra  r   logged_slow_fusionr"   _pre_fusion_custom_passr  _post_fusion_custom_passr  finalize_multi_template_bufferscombo_kernelsr   create_combo_kernel_nodesr#  memoryget_output_names reorder_for_compute_comm_overlapr$  r  r#   6runtime_estimations_align_across_all_distributed_ranksr%  torch._loggingr&  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesr[   r  graph_partitiontriton
cudagraphs&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)rq   rA  ro  rg   r   r  r  r#  r$  r%  r&  r  s   `          r`   r  zScheduler._init  s    <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCd003C
9='')##**177+<+<+A+A+CDJJ 	DOO	 ?C# $$& 	# &*ZZ;
 !AJJL!O;
 -1JJ8
$($BRBRBT8
;>CLLNC8
8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"Gq1::<?"G ##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ__TZZ0
**688DDJ,,.5&* $ B
 ..D.AB ))70

  ''177//44671773356DJ 2211UAJJ 0 0
 ;< WW GtzzR7 CCDJJODJ""$ OO""22&&--88DDTZZPDJJJ4::VDJ!??!!..EE**,4::&	djj) 6@\! :<'//	
o D;
8
B #H*B Bs$   5]8]'-]];]!!]+c                   i }t         j                  j                  D ]d  }t        t         j                  j                  |   t        j
                        s9t        | t         j                  j                  |   d       ||<   f |S )N)ri   )rP   r   graph_inputs_originalrZ   r%   DonatedBufferr   )rq   name_to_donated_bufru   s      r`   rA  zScheduler.get_donated_buffers{	  sp     GG11 	D!''77=r?O?OP,BGG11$7 $-#D)	 #"rb   c                6    t         j                  j                  S ro   rP   r   current_devicerv   s    r`   rk  zScheduler.current_device	  s    ww%%%rb   c                .    |t         j                  _        y ro   rj  r~  s     r`   rk  zScheduler.current_device	  s    !'rb   c                    t         j                  j                  dd      dk(  rddlm}  || j
                  d       yy)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r    )draw_buffersT)print_graph)osenvironr.  r  rp  rA  )rq   rp  s     r`   ra  zScheduler.debug_draw_graph	  s1    ::>>:DASH+6 Irb   c                    t         j                  t        j                        r8t         j	                  d|       | j
                  D ]  }|j                           y y )Nz%s:)r   isEnabledForloggingINFOr  rA  r  )rq   labelrg   s      r`   debug_print_nodeszScheduler.debug_print_nodes	  sF    GLL)HHUE"

 #  "# *rb   c                6   |j                         J d       |j                         rt        | |      S t        |t        j
                  t        j                  f      rt        | |      S t        |t        j                        rt        | |      S t        |      )Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  rZ   r%   r  r  rm  rq  r  r  ry  s     r`   r=  zScheduler.create_scheduler_node	  s    !- 	
@	
- ==?)$55r00"2C2CDE t,,boo.,T488%d++rb   c                   t               }g }| j                  j                         }t        j                  j
                  j                         D ]  }|D cg c]%  }||v rt        | j                  |   t              s|' }}|s6|j                  |       |D cg c]  }| j                  |    }}t        j                  dkD  }t        | |d|      }|j                  |       |D ]  }|| j                  |<     | j                  D 	cg c]  }	|	j!                         |vs|	 c}	t#        |      z   | _        y c c}w c c}w c c}	w )Nr    Fr  r  )r   rX  r:  rP   r   listsr   rZ   r  r  rl  r"   combo_kernels_autotuner  r  rA  rp   r   )
rq   removed_node_namesfe_nodeskept_node_namesnamesru   r  r  fe_noderg   s
             r`   rI  zScheduler.create_foreach_nodes	  sN   .8l11668WW]]))+ 	8E "?*"4#4#4T#:<RS E  %%e,:?@$d''-@F@$;;a?O0*/ /	G OOG$ 807''-81	88 "ZZ
4==?BT+TD
N
5 A
s   *D<EE#Ec                   $%&  G $fddt         t                 $t        j                  $      % j                  D ]  }|j                         D ]  }|j                         }t        |j                  j                  t        j                        rt        |j                               dkD  r^|j                         D ]X  }|%v r=|%v r9%|   }%|   }||z   }%j                         D ]  }%|   |u s%|   |u s|%|<    D|%v r	%|   %|<   Q%|   %|<   Z   d& fd&	 	 d	 	 	 	 	 	 	 	 	 d%&fd}	i }
t        j                   j"                  j%                         D ]  \  }}t        |t&        j(                        r|j*                  D ]  }d|
|<   	 7t        |t        j,                        sR|j/                         D cg c]  }t        |t&        j(                        s|! }}|D ]  }|j*                  D ]  }d|
|<   	   d} j                  D ]s  }|j                  J t1        |j                  j3                         d 	      }|D ]8  }t        |t&        j4                        sJ d
}||
vs&|j                         |
|<   : u  j                  D ]_  }t6        j9                  d|j                         |r|j                  J t1        |j                  j;                  d
      d 	      }|D ]d  }||
v sJ | d|
        |
|   x} j<                  |   j                         D ]*  }|j?                  tA        |j                                      , f t        |jB                  jD                        dk(  rGtG        tI        |jB                  jD                              x}rt        |tJ              r|jL                  }nd}|j                         D ]  }t        |jO                               dk  sJ |jO                         D ]  } &|      } |	||       |j?                  tA        ||             %|   j$                  D ]  }|j                         |j                         k(  r%t        |j                  tP              sJ |j                  jS                         D ]?  } &|      }|j?                  tU        ||j                                       |	||d
       A    |jB                  jV                  D ]6  }t        |tT              r |	|jX                  ||j[                  |             8 |j]                   j^                         |j                         D ]  }|jO                         D ]y  }|j                          j^                   &|      <   |j                          j^                  |<    j`                  jc                  ||       j`                  |j                         <   {  b t        j                   je                         D ]3  }t6        j9                  d|        |	|tg        tA        |                   5 |rt        j                   jh                  D ]  }|j;                  d
      D ]|  }||
v sJ | d|
j                                 |
|   x}s) j<                  |   jS                         D ]4  }t6        j9                  d||        |	|tg        tA        |                   6 ~   j^                  D ]  }|t        j                   j"                  v rE |	|tg        tA        |                   t        j                   jj                  jm                  |       d|t        j                   jn                  v s |	|tg        tA        |                    tq        t        j                   j"                  j                               D ci c]  \  }}||
 }}}t        j                   jj                  D cg c]  }||   	 c}t        j                   _9         j                  D ]C  }|j                         D ].  }|ju                  %|j                            j$                         0 E  jv                  D ]-  } jv                  |   ju                  %|   j$                         / ty               }|j{                  d       %j%                         D ]]  \  }} |j}                         5  | j$                  D !cg c]  }!|!j                          }"}!|j{                  d| d|" d       ddd       _ |j{                  d       |j                         j                         }#t        j9                  d       t        j9                  d|#       yc c}w c c}}w c c}w c c}!w # 1 sw Y   xY w)zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                  >    e Zd ZdZ	 	 d	 	 	 	 	 ddZddZd	 fdZy)
1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nc                @    |xs g | _         |xs
 t               | _        y ro   )r  r   
membership)rq   r  r  s      r`   r   z:Scheduler.compute_dependencies.<locals>.DedupList.__init__	  s    
 #[b
","<
rb   c                    || j                   v ry | j                  j                  |       | j                   j                  |       y ro   )r  r  r  rA  )rq   	node_users     r`   r  z8Scheduler.compute_dependencies.<locals>.DedupList.append	  s5    /

!!),##I.rb   c                    t        j                  | j                  |j                        }| j                  |j                  D cg c]  }|| j                  vs| c}z   } ||      S c c}w ro   )r   r7  r  r  )rq   r  new_membershipr  	new_items	DedupLists        r`   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__	  sc    !+!1!1$//5CSCS!T JJ${{*at.FA* 	 !N;;*s   A+A+r*  )r  zOptional[list[_T]]r  zOptional[OrderedSet[_T]]r   r   )r  rV   r   r   )r  DedupList[_T]r   r  )r~   r   r   r/  r   r  r  )r  s   r`   r  r  	  s;     -17;=)= 5= 	=/<rb   r  r    c                N    | j                   v r j                   |          S | S ro   )r   )ro  r  rq   s    r`   r  z.Scheduler.compute_dependencies.<locals>.rename
  s,    D)))d33A677Hrb   Fc                P     |          j                  t        |||             y ro   )r  r  )used_by_namer  r  r  name_to_usersr  s       r`   add_userz0Scheduler.compute_dependencies.<locals>.add_user
  s)     &./66K9rb   Nc                    | j                   S ro   r  r^  s    r`   r   z0Scheduler.compute_dependencies.<locals>.<lambda>;
  s
    AFF rb   r  Tzscheduling %s)unbacked_onlyc                    | j                   S ro   r  r^  s    r`   r   z0Scheduler.compute_dependencies.<locals>.<lambda>N
  s
    !&& rb   z not in )r   )mutating_buf)r  zscheduling output %sz+scheduling output %s for unbacked symint %sr  'z': rz   r  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)ro  r   r   r   )FF)
r  r   r  r  r  r   r  r   r   r   )Br   rV   r  r   rA  r   rp   rZ   rg   r   r%   r8   r   r   r:  rP   r   r  r  r  r  r   	TensorBoxr  r  get_unbacked_symbol_defsSymbolr   r  get_free_symbol_usesr  r$  r/   r   r   r?  r  r.   r   r   rT   rh  r0   r   ru   r  r   r   r0  r.  rQ  r   graph_outputsmutated_inputsrA  r;  r@  mutated_input_idxsr   r   rG   r   r   r   r   compute_dependencies_log)'rq   rg   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_noderu   valfsr  sym_sizehas_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesr  r   r  	node_modealt_namer   
other_namer  rx  r   r  	inp_nameslogbufr  r  rl   r   r  r  r  s'   `                                   @@@r`   rS   zScheduler.compute_dependencies	  s	   	< 	<> @K?V?V@
 JJ 	LD((* L MMO	 tyy//?D,,./!3!%!1!1!3 LI M1i=6P -i 8 -i 8#(5=#0#5#5#7 >C -c 2e ;#0#5#>5=c 2> #m33@3Ki03@3Ki0LL	L:	 !&!			;	 	 		
 	 MO&
 --335 
	BID##uzz*** >B9=226>C. (+||~S!Auzz9RASS! BAnn B=A6r:BB
	B ',#JJ 	HD99((( $*		224:J$  * H!!U\\222 /3+::8<215H	H  JJ @	DIIotyy1*yy,,,'-II222F(($
 . GA >> #X&D%EF> <A>>K#'#4#4Q#7#C#C#E GC --gclln.EFGG D$$++,1 d&6&6&=&=!>??S?sI.HH	 	 '') E3,,./1444 # 1 1 3 EH%h/HXt,%%ghY&GH -h 7 = = E==?dmmo=$)$))5FGGG*.))*D*D*F EJ)/
);J -- '
 P %ZtDEEEE, ((.. F!$0TYYd.>.>t.DEF %%d&;&;< '')  # 1 1 3 H>AllnD))&*:;69llnD))(3//33HhG ++CLLN;u@	F 002 	>HII,h7Xz'(*;<=	>
 'ww,, N111E NA >> #X&D&I&I&K%LM> ;1==q=(,(9(9!(<(M(M(O NHII M ( !
 %Xz'(:K/LMNNN )) 	:Dqww+++z'$-89&&**40***z'$-89	: ,5QWW5I5I5N5N5P+Q
'E4D%K
	 
 )*(>(>&
 $IdO&
"
 JJ 	CD'') CmCLLN;AABC	C // 	SD''-77d8K8Q8QR	S  !c'--/ 	4JC 4/4{{;!;;#c%234 4	4 	c  "))+ &&';< &&'I3O_ Tl
&
" <4 4s6   f$&f$"f)f/?f9f4%f94f99g	c           
         ddl m}m}m}m} t        t        j                  j                  j                               } | j                  |      }t        j                  j                  j                  s | j                   j                         t        t        j                  j!                               } | j                  ||      \  }}	}	t#        t%         j                              D 	cg c]  }	g g f c}	|D ]}  }
|
j&                  dk(  r|
j(                  dk(  r"|
j*                  j-                         }|
j.                     d   j1                  |       |
j2                     d   j1                  |        ddlm}  |        	 	 	 	 	 	 d fd}g }t9         j                        D ]H  \  }}|j1                  |       |j1                   |||t%         j                        dz
  k(               J | _
        y c c}	w )Nr    )r$  compute_memory_timelineFreeableInputBufferget_freeable_input_bufr   )register_check_mem_opc                X   |    d   }|    d   }|||g}t        j                  t        t        j                  d            t        j
                  j                  j                  j                  g |d       }dj                  |    j                          |_        t        |      S )Nr   r    r}  )r  c                $    | |d   |d   |d   dfS )Nr   r    r#  )alivedeadis_final_stepr   )tensor_argsre  s     r`   r   zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>
  s*    !.q!1 -a 0)6q)9C rb   )r   r   r  nontensor_argsunflatten_args
mem_check_)r%   MemoryCheckKernelr8   r[   r  rl  _inductor_debugcheck_memory_stepdefaultrA  rp   operation_namer  )step_idxr  expected_newly_aliveexpected_newly_deadr  rg   rq   step_allocs_deallocss         r`   construct_mem_check_nodezEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node
  s     $8#A!#D "6x"@"C24GWN''!e)<=yy00BBJJ- D %/tzz(/C/L/L/N.O"PD,T488rb   )r  )r  r   r  r   r   r  )rP  r$  r  r  r  r   rP   r   r  r:  rA  r[   r  r"   r#  r   rQ  r  r   
size_alloc	size_freer  rp   
start_stepr  end_step#torch._inductor.runtime.debug_utilsr  r@  )rq   r$  r  r  r  r  name_to_freeable_input_bufr  buf_info_listrW  buf_inforx  r  r  	new_nodesr>  rg   r  s   `                @r`   r_  z#Scheduler.insert_memory_check_nodes
  s   	
 	
 )31773G3G3L3L3N(O"4::|< 	# %%===

D,, *4AGG4L4L4N)O5JJ&
q! $C

O4C
RHC
 & 	HH""a'H,>,>!,C//1H !4!45a8??I !2!23A6==hG	H 	N	9	9*.	9&	92 	 , 	GAtT"(1DJJRS@S;SU	 
eC
s   3Hc                Z  	 g }t        | j                        D ]  }dd	d}|j                         D ]  }t        	fd|j                  D              }|r\t
        j                  d|j                                t        j                  j                  j                  |j                                d} |j                          xr | }|s|j                  |       t
        j                  d|j                                t        j                  j                  j                  |j                                |j                  j                   D ]  }|j"                  | j$                  v s| j$                  |j"                     j                  }|D cg c]0  }|j&                  j                         |j                         k7  s/|2 c}| j$                  |j"                     _          t)        t        |            | _        | j                  D ]  }|j+                           yc c}w )	z0
        Remove any nodes without users
        c                r    | j                   xs* | j                         t        j                  j                  v S ro   )r  rp   rP   r   rL  )r   s    r`   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s&    ||Tt}}!'':T:T'TTrb   Fc              3  .   K   | ]  } |        y wro   r   )r  ur  s     r`   r  z2Scheduler.dead_node_elimination.<locals>.<genexpr>  s     #Ma$6q$9#M   zremoved dead buffer: %sTzremoved dead operation: %sN)r   r  r   r   )r
  rA  r   rq  rl   r   r  rp   rP   r   r  rA  r  r  rL  r   r   ru   r   rg   r   rU  )
rq   updated_nodesrg   active_buffersr   can_eliminater  rl   r  r  s
            @r`   rE  zScheduler.dead_node_elimination  s    TZZ( 	DU #N'') * ##M399#M M II7HGG++//?%)N* !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22 DyyD$4$44 $ 0 0 ; A A',="#0AT]]_0TA=((39-	8 (=12
 JJ 	#D  "	#=s   0H(H(c                    t        t                  t               g dfd|D ]  }|j                         D ]  }||<   	  |D ]
  } |        S )z?
        Ensure nodes is in topologically sorted order
        c                    | vrdj                  |        t        | j                  d       D ]&  }|j                  vr |j                            ( j	                  |        y y )Nc                    | j                   S ro   r  )ds    r`   r   zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>E  s
    aff rb   r  )rA  r  r   ru   r  )ro  r  r  r   seenvisits     r`   r  z2Scheduler.topological_sort_schedule.<locals>.visitB  se    }!!"6"6<LM 2Cxx|3 ,sxx01	2
 a  rb   )ro  rT   r   r   )r   rT   r  rh  )rq   rA  rg   ru   r  r   r  r  s       @@@@r`   rD  z#Scheduler.topological_sort_schedule8  sy     +,.59V*,	! 	!  	*D--/ *%)T"*	*  	D$K	rb   c                2    t               }t        |t        t        t        t
        f      r-|j                  D ]  }|j                  |j                          nt        dt        |       d       fd|D        }t        t         fd|D                    S )Nz+get_unmet_dep_nodes is not implemented for .c              3  X   K   | ]!  }j                   |   j                          # y wro   )r   rr   rH  s     r`   r  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>d  s%     Xc))#.??AXs   '*c              3  <   K   | ]  }j                   |     y wro   rX  )r  ro  rq   s     r`   r  z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>e  s     Qat66q9Qs   )r   rZ   rm  r  r  r  r   rA  ru   RuntimeErrorr}   r   )rq   rh  
unmet_depsr  unmet_dep_opss   `    r`   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodesS  s    &0l
)&"	
 // )sxx() =d5k]!L  YZXJQ=QQRRrb   c                z   g }t         j                  | j                  d      }i }| j                  D ]P  }| j                  |      }t	        |      ||<   |D ]*  }|j                  |g       }|j                  |       |||<   , R |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|j                  |
       |
D ]7  }|j                  |g       D ]  }||xx   dz  cc<    |j                  |       9 |j                         D 	cg c]  \  }}	|	dk(  s| }
}}	|
rx|rJ d       |S c c}	}w c c}	}w )zU
        Sort nodes by their topological order, return a list of node lists.
        r   r    zTopological sort failed!)	r  fromkeysrA  r  r   r.  r  r  r@  )rq   r  rA  childrenrg   rD  r  cro  r  zero_deg_nodesr   s               r`   r  z!Scheduler._topological_sort_nodesg  sF    djj!,#%JJ 	"D,,T2Dd)E$K "LLb) !"	" ).@1a!@@LL(# $LLB/ %D$K1$K%		! -2KKMDDAqQ!VaDND  444y A Es   D1%D1D7D7c                j   i }| j                   D ]w  }t               }|j                  D ]B  }| j                  |j                     j                         }|j                  |       |||   z  }D |||j                         <   ||_        y t        | j                         D ]  \  }}||_
        ||_         y)z.
        Populate each node.ancestors
        N)rA  r   r   r   ru   rr   rA  rp   r   r@  r   r   )rq   name_to_ancestorsrg   r   r  dep_node_namer  s          r`   rF  zScheduler.compute_ancestors  s    
 9;JJ 	'D)3I.. > $ 0 0 : K K Mm,.}==	> 2;dmmo.&DN	' %TZZ0 	#KE4"DN"DN	#rb   c                H   t         j                  sy | j                  D ]  }t        |t        t
        f      r#|j                         st         j                  dk7  r=|j                         D ]3  }t        |t              r|j                         r$|j                          5  y )Nhalide)r"   r  rA  rZ   rm  r  rJ   cpu_backendrc  r  r  )rq   rg   rh  s      r`   r  zScheduler.merge_loops  s    00JJ 	$D d]4F$GHKKMf&8&8H&D) $!%75;L;L;N!!#$	$rb   c                p   t        ddd      5  t        d      D ]  }t        |      }t        j	                  d|dz   |       | j                  |      }t        |      }t        j	                  d|dz   ||       ||k(  s|dk(  sjt        j	                  d|dz           n |cd	d	d	       S # 1 sw Y   y	xY w)
zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTr  
   z/===== attempting fusion (%d/10): %d nodes =====r    z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r  r   r  r  fuse_nodes_once)rq   rA  r>  old_lennew_lens        r`   r  zScheduler.fuse_nodes  s     #4QU
 	 2Y e*  EE
 ,,U3e*  TE	 g%A$$Eq1u '( /	 	 	s   A5B,B,,B5c                    g }| j                   D ]4  }|j                  t        |t              r|j	                         n|g       6 || _         y)zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)rA  rC  rZ   r?  r  )rq   r  rg   s      r`   rV  zScheduler.process_grouped_nodes  sJ     .0	JJ 	D!+D2F!GdV	 
rb   c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        ddd      5  |j                  |      cddd       S # 1 sw Y   yxY w)
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)r   dynamo_compile_column_usN)r   r   rk  r  r   r  )rq   rA  r  backends       r`   r  zScheduler.benchmark_fused_nodes  st     5zA~~q$$&$""6*#"&%D
 	8
 007	8 	8 	8s   
A%%A.c                    t        |      dkD  sJ |d   j                         }|| _        | j                  |      }t	        d      5  |j                  |||      cddd       S # 1 sw Y   yxY w)r  r   r  hint_overrideN)r   r   rk  r  r   generate_kernel_code_from_nodes)rq   rA  benchmark_kernelr  r  r  s         r`   r	  z)Scheduler.generate_kernel_code_from_nodes  sw     5zA~~q$$&$""6*12 	::'} ; 	 	 	s   A%%A.c                    || _         | j                  |      }t        d      5  |j                  |      cddd       S # 1 sw Y   yxY w)r  r  N)rk  r  r   benchmark_codegened_module)rq   moduler  r  s       r`   r  z$Scheduler.benchmark_codegened_module  sH     %""6*12 	>55f=	> 	> 	>s	   ?Ac           
        	 	 	 	 	 	 d	d}t        | j                        D ]  \  }}t        |t              st        |j                  t
        j                        s=|j                  }t        j                  j                  s|j                         \  }}n t        d |j                         D              }t        |t        j                  j
                  j                        rt        j                   ri }||d<   t        j                   D ]k  }|j                  |      }	|	j#                         D 
ci c]  \  }
}t        |
t              r|
| }}
}t%        |j#                         d       d   }|||<   m |j                  j'                  |       n|j                  j)                  |       |j+                         }|j,                  }t        |t
        j.                        sJ |j,                  }t        |t
        j0                        sJ |j2                  |_         |||       | j5                  |      }|| j                  |<   || j6                  |j9                         <   || j:                  |j9                         <   i t=        j>                  |j@                  jB                  |jD                        D ]:  }| jF                  jI                  |jJ                  d      x}s,|jJ                  |<   < d
fd} ||jD                        |_"         ||j@                  jB                        |j@                  _!        tM        |jO                         |jO                               D ]3  \  }}|| jP                  |j9                         <   |jR                  |_)        5 |jT                  |_*        |jV                  |_+        |jX                  |_,         yc c}}
w )a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c                   |j                         }| j                         }t        |t              rt        |t              sJ |j                         }| j                         }t        |t              rt        |t              sJ t        j
                  j                  |= ||_        t        j
                  j                  |= ||_	        t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   t        j
                  j                  j                  |       }t        j
                  j                  j                  |       |t        j
                  j                  |<   |t        j
                  j                  |<   y ro   )rp   rZ   r   r[  rP   r   rB  ru   
name_to_opr  buffersr  remove
operations)	orig_noder  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          r`   replace_operation_bufferzKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer  sW    !) 1 1 3%..0MmS1jARTW6XXX'::<$779LlC0Z@PRU5VVV&&'89)HM""#34&2H#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,rb   c              3  |   K   | ]4  }t        |t        j                  j                  j                        r| 6 y wro   )rZ   r[   r  select_algorithmExternKernelCaller)r  timings     r`   r  z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>6  s6       &) & % @ @ S S  #rL  Nr  c                    | d   S r  r   r^  s    r`   r   z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>O  s    qQRt rb   r  r   c                ,    t        fd| D              S )Nc              3  @   K   | ]  }|j                          y wro   )r  )r  r  r   s     r`   r  zQScheduler.finalize_multi_template_buffers.<locals>.rename_deps.<locals>.<genexpr>n  s     %Sscjj1A&B%Sr  r   )rD  r   s    r`   rename_depsz>Scheduler.finalize_multi_template_buffers.<locals>.rename_depsm  s    %%Sd%SSSrb   )r  zir.MultiTemplateBufferr  zir.OperationBufferr   r   )rD  r   r   r   )-r@  rA  rZ   rm  rg   r%   MultiTemplateBufferr"   r]  %force_extern_kernel_in_multi_templateget_min_choicer?  choice_timingsr[   r  r   multi_kernel_hintsr  r  finalize_as_triton_callersfinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferr   r=  r  rp   rX  r9  r:  r   r   r   r0  r.  ru   r  r   r   rl   r   r   r   )rq   r  r>  rg   
multi_nodemin_node_unfusedrW  callershinttimingsr/  r  triton_timingschoiceout_tensorboxout_storage
out_buffernew_scheduler_noder  	real_namer"  new_outold_outr   s                          @r`   rM  z)Scheduler.finalize_multi_template_buffers  s   	8-	89K	8	86 !, R	@GAt$.:		2114 "YY
**PP*4*C*C*E'$a'+*4*C*C*E	($ $OO&&?? 00QS(8$*$=$= 3D&0&?&?d&?&SG -4MMO.$(Aq#-a1I#J !"1.N .
 &))=)=)?^%TUV%WF,2GDM3 		<<WE		;;<LM 0 < < >+00!+r}}===(--
!*b.@.@AAA$.$5$5
!(Z@%)%?%?
%K" 2

15G!!$--/2;M''8 $& $??$$**D,C,C ?C %)$;$;$?$?$$OOyO69hh(3	?T 9D&999"5 8C&22888"..4 ),&224d6F6F6H) 2$GW <CD$$W%5%5%78$+MMGM	2 04~~",/3~~",04"-eR	@:.s   5O
c                &    t        d |D              S )Nc              3     K   | ]q  }t        |j                  d       xrU |j                  duxrE t        |j                  j                  d      xr# |j                  j                  j                  dk(   s yw)r   Nscatter_moder!  )r   rg   r   r=  rn  s     r`   r  z,Scheduler._any_atomic_add.<locals>.<genexpr>  so      

 	 AFFF# 9d"9^49 ((L89
s   A7A9)r'  rq   	node_lists     r`   _any_atomic_addzScheduler._any_atomic_add  s     

 
 
 	
rb   c           
     N
    !"#$% t        d fD              }t        j                  s|syj                         r(t	        j                         t        j                        r j                         sj                         ryj                         }|d   j                         sJ j                  dk(  ryj                         }t        t        j                  ||            } j                  |      ryddlm t%              %|d   j                         J dfd!t&        j(                  j*                  j-                         	 d	 	 	 	 	 d fd}|rt        d	 fD              rj                         durj                         nj                         $t	        $t        j.                        sJ i  g t        j0                  D ]8  }$j3                  |      }	t5        |	j7                         d
       D ]u  \  }
}t	        |
t&        j(                  j8                  j:                        s5$j=                  |
      5  j?                  |
g |||
j@                               ddd       w tC        d      }d}i }D ]V  \  }
}}	 ||jE                          $j=                  |
      5   jU                  |      \  }}|||
<   ||k  r|}|
}ddd       X |$jV                  |<   t	        |tX              sJ | |<   ; $j3                         }	$j[                         \  }"r j]                  |      n j]                  |      \  #}g d}t5        |	j7                         t_        j`                  d            D ]  \  }
}t	        |
t&        j(                  j                  jX                        s5s&tc        |
d      r|
jd                  $jd                  k7  r]|"#z   k\  r nQ|dz  }|t        jf                  kD  r n7$j=                  |
      5  j?                  |
g ||             ddd        ti              dk(  ryd !"#$ f	d}|S  ||       ||       ||      d! %fd}|S # 1 sw Y   xY w# tF        $ rR}tH        jK                  tL        jN                        r$tH        jQ                  dsdndtS        |             Y d}~_d}~ww xY w# 1 sw Y   qxY w# 1 sw Y   xY w)
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]>  }|j                         xr( t        |j                         t        j                         @ y wro   )r  rZ   r5  r%   r#  rn  s     r`   r  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sE       
  MMO J1..0"2H2HIJ 
s   AATr   r}  CompilationErrorNc           
     t   t         j                  t        j                        r| ||z   k  rFt         j	                  dj                         j                         t        ||z   | z  d             y t         j	                  dj                         j                         t        | ||z   z  d             y y )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  ru  rv  DEBUGr  rh  r<   r=   )ms_fusedms1ms2r  r  s      r`   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}}5cCi'$$S..0..0"sSyH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6rb   c                    j                  | d|      }t        j                  |      }j                         sd }||fS j	                  d|      }t        |t              sJ ||fS )NT)r
  r  triton_)kernel_namesource_code)r	  r   loaduse_process_poolrX  rZ   r   )rA  r  src_codemodfutasync_compilerq   s        r`   compile_kernelz3Scheduler.speedup_by_fusion.<locals>.compile_kernel  s     ;;M < H ""8,C 113
 : $**yh*W!#|444:rb   c              3  @   K   | ]  }|j                         d u  y wro   r8  rn  s     r`   r  z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  s#      %
23A!-%
s   c                    | d   S r  r   r^  s    r`   r   z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s
    !A$ rb   r  r  infException in compiling %s: %srC  rE  r    allowed_prologue_inpsFc            	     h  	 t        d      } d }i }D ]V  \  }}}	 ||j                          j                  |      5  j                  |	      \  }}|||<   || k  r|} |}d d d        X  |        | z   k  rJ|Ht        j                  r|d <   j                         nj                  |       |j                  d <   yy# t        $ rR}t        j	                  t
        j                        r$t        j                  d
sdndt        |             Y d }~d }~ww xY w# 1 sw Y   xY w)NrZ  r[  rC  rE  TF)r  r   r   r  ru  rv  rH  r  r   swap_as_triton_callerr  r"   r'  r(  r)  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr3  future	mod_fusedr)  rI  pathr  epilogue_fusionfuture_choices hint_override_best_fusion_choicerL  rJ  rK  r-  rq   s            r`   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyE  sS   $U|"& 1? 5-FFI!!-"MMO $99&A 5)-)H)H%v*$ /7F+#l2+3L.4O5 550 <c239-/2M00AP8>"==< #<<_M7BJ..t4 ? % !%227==A&,, ?2A
z #A
 !!5 5s#   C	$D'		D$ADD$'D1	c                    ddl m}  	 d   d   d   fD ]  }||j                           j                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       yj                  d   
      \  t	        j
                        r	 d       y        t        d      rWz   k\  rOfj                  vr?j                  j                  f       t        d      j                  fd	       z   k  S # | $ r Y y	$ r}d
t        |      v rY d }~y d }~ww xY w)Nr   )NoTritonConfigsErrorr    z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )rJ  rK  rI  path1path2
path_fuseds   r`   r   zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s(    053605365?8@3;sSy3I% rb   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsrk  r   r  mathisinfr   rJ  rA  r   rd  r   )rk  rU  r)  rJ  rK  rI  ru  rv  rw  rE  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2rL  rq   r  s      @@@@@@r`   ri  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready{  s   ; *!,)!,/2  )
 ?JJL) "&!@!@)!,f"JC zz#CD$!%!@!@)!,f"JC zz#DE$+/+J+J/2F,(Hj zz(+CD$xc2 0>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E AE +5E !5E A3E E.E.E)(E))E.)rI  r  rJ  r  rK  r  r   r   ro   )rA  r  r  Optional[int]r   z)tuple[Optional[LambdaFuture], ModuleType]r   )5r'  r"   benchmark_fusionr  rZ   r5  r%   TritonTemplateBufferr  rc  r   r}   r   r9  r:  r@  triton.compiler.errorsrE  r  r[   r  rV  AsyncCompiler#  r'  r&  r  r  r  TritonTemplateCallerr^  r  r  r  r   r   r  ru  rv  rH  r  r   r  r_  r   r%  r  operator
itemgetterr   r\   max_epilogue_benchmarked_choicesr   )&rq   r  r  is_multi_templatenode_list_1node_list_2node_list_fusedrW  r  r&  r3  unfused_timer`  ra  rb  rc  rd  r)  rI  re  rW  rv  triton_choicesri  rE  rV  r  rf  r|  r}  r~  rg  rh  rL  rJ  rK  r-  r  s&   ```                     @@@@@@@@@@@@@@r`   speedup_by_fusionzScheduler.speedup_by_fusion  s]       
 U^ 
 

 &&/@ u668":Q:QR!! oo'Q**,v ;;%oo'y{KHI
 0;u% #..0!!!	" 55BBD PT	.	?L	6	  %
8=u~%
 "
 $557tCO # ''),,. 
 j"*@*@AAA  - TVN!'!:!: ,R!+!:!:=!I,2"((*- (FL & @ @ U U !#99&A &-- &!/$36CWCW"" "  %U|FJ 1? 5-FFI
!!-"MMO $99&A 5)-)H)H%v*$ /7F+#l2+3L.4O5 55( =H
**=9!/3KLLLBQ0?Y,R^ (668N..0FAs # **;7//< C TVNN(.$$&H,?,?,B) V$ "&%//*<*<*U*UV ((?@44
8X8XX39,!#!F$K$KK55f= V"))6*TN?4S*TUV V3V8 >"a',! ,!\ (' !/{ ; .{ ;&4_&E#@ @D ('Q " % !%227==A&,, ?2A
z #A
 !!5 5bV Vs=   #(R".R/$T6T"R,/	T
8ATT
TT$	c                <    | j                   |j                            S )z0Look up the node in Scheduler name_to_fused_node)rX  r^  ry  s     r`   r  zScheduler.get_fused_node  s    &&t':':'<==rb   c                    t        |      t        j                  t        j                        r@t        j                  d       D ]&  }t        j                  d|j                                ( i 	 	 	 	 	 	 d fd	 	 	 	 	 	 d fd} j                  |      D ]  \  }} |||        j                  |      } j                  |      } j                  ||      sD j                  ||      rW j                  ||      }t        |      r|||f|<   |||f|<   |s ||        t               }j                         D ]j  \  }}	}
||v r|j                  |        j                  |	      |	u sJ  j                  |
      |
u sJ  |       sO j                  |	|
      rb |	|
       l t        d       } j!                  |      } j#                  |       |S )	a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                   t         j                  d| j                         |j                                | j                         }|j                         |k(  sJ j	                  |      j                  | |      }j                  |        j                  |       j                  |       j                  j                  |j                         D ci c]  }|j                         | c}       |S c c}w )Nzfusing %s with %s)r  r  rp   r   r  rG  r  rA  rX  rl  rc  )r  r  r  node3ro  r  rq   s        r`   fuse_two_nodesz1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@u$@ L As   C6c                   j                  |       v sj                  |      v rj                  j                  |       j                  j                  |      d             }|J |\  }}}j                  |d        j                  |d        j                  |      |u sJ j                  |      |u sJ  |       rj                  | |      r ||       j                  |       v rωj                  |      v ry y ro   )r  r.  r@  will_fusion_create_cycle)	r  r  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsrq   s	         r`   resolve_pending_fusionsz:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
Iy##It4##It4**95BBB**95BBB!|t'D'DUE'Ry)4' ##E*o=&&u-@rb   c                    | j                   S ro   r;  r^  s    r`   r   z+Scheduler.fuse_nodes_once.<locals>.<lambda>(  s
    !++ rb   r  )r  rT   r  rT   r   rT   r  )r   r  ru  rv  rH  r  r
  get_possible_fusionsr  r  r  r  callabler   rA  r  rD  rY  )rq   rA  rg   r  r  r  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  r  r  s   `          @@@r`   r  zScheduler.fuse_nodes_once  s'    !'""7==1;<# A  )=)=)?@A  	
	$	->		 	5$	5->	5	52 !55e< 	-LE5 $E51''.E''.E}}UE*43P3Pu4 00>G$.5ue-DOE*.5ue-DOE*ue,)	-, @J|3B3I3I3K 	5/M9i 44 $$]3&&y1Y>>>&&y1Y>>>t'D'D9( y)4	5 {(=>..u5!!%(rb   c                   t        | j                        }d}t        | j                        }t        j	                  d|       t        t        j                  |             D ]  \  }}t        j                  |      }t        |      dk  r+|||kD  r n| j                  |      st        j	                  d|       \|dz  }t        j                  dkD  }t        |d   j                  |d|      }t        j                  d	t        |      |       |D ]  }	|j                  |	        |j                  |       | j                   j#                  |j%                         D 
ci c]  }
|
j'                         | c}
       ! t)        |d
       | _        | j+                  | j                        | _        t        j                  d||t        | j                               | j-                  | j                         yc c}
w )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r#  Nz)ComboKernels: Not speeding up %d-th groupr    Tr}  z0ComboKernels: Combining %d nodes for %d-th groupc                    | j                   S ro   r;  r^  s    r`   r   z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>S  s
    q{{ rb   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   rA  r   r   r  r@  r  r  r  speedup_by_combo_kernelr"   r  rf   r  r  rA  rX  rl  rc  rp   r  rD  rY  )rq   r"  r  r8  num_nodes_orignumr?  r  r3  rg   ro  s              r`   rO  z#Scheduler.create_combo_kernel_nodes-  s    !,TZZ		FU'&DDTJ
 	NC 3CCINI9~!'EL,@//	:		EsKQJE$;;a?O4!&&*. /	K HHBI
 " )""4()OOK(##**4?4I4I4KLq{*L7	< K-BC
33DJJ?
R

O		
 	!!$**- Ms   !G=
c                H    |D ]  }|j                  | j                          y ro   )rY  rX  )rq   rA  rg   s      r`   rY  zScheduler.prune_redundant_deps]  s%     	?D%%d&=&=>	?rb   c                   	
 g 	t        t        t        t        f             
d	
 fd}t        j                  t
              }|D ]=  } j                  |      r|j                         D ]  }||   j                  |        ? |j                         D ]
  } ||        t        j                  rat        j                  t
              }|D ]&  }t        |dd      }|s||   j                  |       ( |j                         D ]
  } ||         j                  	      		j                   j                  d       t         j#                  dt%        	             	S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        c                x   t        |       D ]  \  }}| |dz   |dz   t        j                  z    D ]  }||f}|v rj                  |       j	                  ||      rj                  |       A|j                         s|j                         sbj	                  ||      suj                  ||f         y r  )r@  r"   )max_fusion_buffer_group_pairwise_attemptsrA  r  r  r  r  )rA  node1_indexr  r  r  possible_fusionsr  rq   s        r`   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsj  s    &/&6 @"U"!Ok'FF'G @E
 !%.Cd{ HHSM}}UE2(//4++-1A1A1CuJ )//?!@@rb   r   NT)r  reversezfound %d possible fusionsrA  rU  r   r   )r   ra  rT   r  r   r   unfusable_noder;  r  r   r"   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  r   )rq   rA  r  buffer_names_groupingrg   r   node_groupinggroup_groupingr   r  r  s   `        @@r`   r  zScheduler.get_possible_fusionsa  sh    % 13D DEFH	@( !, 7 7 = 	8D""4(--/ 8%c*11$78	8
 399; 	+MM*	+ ##(44T:N 7gt4"5)0067 "0!6!6!8 /./  JJ
 	$"7"7F4c:J6KLrb   c                    t        t                  d fd|j                         j                  j	                         |j                         j                  j	                         z  |j
                  j                  j	                         |j
                  j                  j	                         z  z
  t         fdD              }|r t        ||      d       |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        c                   t        | t              rq| vrmj                  |        | j                         j	                        ryt        | j                  z        xs" t        fd| j                  z
  D              S y)NFc              3  H   K   | ]  } j                   |           y wro   r  r  ro  
found_pathrq   s     r`   r  zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s+      H #4#:#:1#=>H   ")rZ   r  rA  rd  issubsetr   r   r'  )rg   combined_ancestorscombined_namesr  rq   visiteds    r`   r  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 23G8KD!++-667IJ !   ?@ C H!%2D!DH E  rb   c              3  H   K   | ]  } j                   |           y wro   r  r  s     r`   r  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s!     WqJt66q9:Wr  zwill create cyclerg   rT   r   r   )r   r  rd  _dictr:  r   r'  r  )rq   r  r  cycler  r  r  r  s   `   @@@@r`   r  z"Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWW#IeU#$78rb   c                    ddl m 	 	 	 	 d fd} ||      } ||      }t        fd|D              }t        fd|D              }|j                  |      }d}	|D ]  }
	 |	t	        |
d         z  }	  j                  ||      }t        j                  j                  j                  |	d	|z        ry
y# t
        $ r Y  yw xY w)a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r    )buffer_reuse_keyc                0   g }| j                   j                  D ]y  }j                  j                  |j                        }|s+t        |j                        dk(  sD|j                  j                         s_|j                  |j                         { |S r  )
r   r   r   r.  ru   r   rl   rg   has_tensor_outputr  )rg   r   r  r   rq   s       r`   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,, ,&&**27733syy>Q.3883M3M3OMM#((+, Mrb   c              3  .   K   | ]  } |        y wro   r   r  r   r  s     r`   r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #Sc$4S$9#Sr  c              3  .   K   | ]  } |        y wro   r   r  s     r`   r  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  r  r  r   r#  F    T)rg   rT   r   zlist[ir.Buffer])r  r  r   intersectionr   r  score_fusion_memoryrP   r   r  statically_known_gt)rq   r  r  r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @r`   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$ 	C3s1v;.	 ,,UE:	 77//iP  s   $B88	CCc                     |j                   j                  |j                   j                  z  |j                   j                  |j                   j                  z  z
  }t         fd|D              |kD  S )Nc              3  @   K   | ]  }j                  |        y wro   dep_size_hintrH  s     r`   r  z:Scheduler.fusion_accumulate_large_reads.<locals>.<genexpr>  s     @s4%%c*@r  )r   r   r   r  )rq   r  r  	threshold	all_readss   `    r`   fusion_accumulate_large_readsz'Scheduler.fusion_accumulate_large_reads  sd     &&,,u/@/@/F/FF$$u'8'8'?'??
	 @i@@9LLrb   c                    t        t        |j                  |j                  z
        t        |j                  |j                  z
              }|dkD  S )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r$  r  r   r   )rq   r  r  proximity_scores       r`   are_long_distant_nodesz Scheduler.are_long_distant_nodes  sE    * %//12%//12
 ##rb   c                   i }|j                   j                         D ci c]  }|j                  | }}|j                   j                         D ci c]  }|j                  | }}|D ]}  }t        j                  j                  |      }	||   }
||   }t        |
t              rt        |t              sdt        |
       dt        |       ||<   k|
j                         |j                         k7  r(d|
j                          d|j                          ||<   t        |
j                        t        |j                        k7  rd||<   |
j                         }|j                         }||k7  rd| d| ||<   |
j                         |j                         k(  rd|
 d| ||<   Ed}t        |	t        j                        sd|	j                    }d	|
 d| d
| ||<    t#        |      S c c}w c c}w )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r  ru   rP   r   r  rZ   r.   r}   	get_numelrO   r  
get_offsetnormalize_with_stride_orderr%   r  r   r   )rq   r  r  common_buf_namesreasonsr  node1_name2depnode2_name2deprx  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  r`   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reason!  s    383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX( ,	H''$$X.C$X.G$X.Ggy1GY9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G'! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#56'

|4
"7)6'"ZLI HU,	\ 7|c YXs   G5G:c                   t         j                  rt        d ||fD              ry|j                  j	                         }|j                  j	                         }||z  }|sy|j                  j                         D ci c]  }|j                  | }}|j                  j                         D ci c]  }|j                  | }}g }	|D ]y  }
||
   }||
   }|j                         |j                         k(  s/|	j                  t        j                  j                  j                  |j                         d      ||f       { t        |	      dk(  ryt        |	t!        j"                  d            \  }}}t%        |t&              rt%        |t&              sy|j(                  |j(                  k7  r3|j+                         |j+                         k(  r| j-                  |      S yd}|j/                         s|j1                  ||      }nV|j/                         s|j1                  ||      }n3t2        j5                  d|j7                         |j7                                |r| j9                  ||      S dS c c}w c c}w )a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c              3  <   K   | ]  }|j                           y wro   )r  rn  s     r`   r  z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>o  s      8
AHHJ8
rb  r  r   r  r  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s)r"   r  r'  r   buffer_namesr  ru   r  r  rP   r   r  r  r  r   r$  r  r  rZ   r.   r  r  r  r  r  r  r  rp   r  )rq   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr  r  r  
candidatesbuffer_namer  r  _numel	reordereds                  r`   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loop_  s[     00C 8
!&8
 5
 "..;;="..;;=03EE"383D3D3U3U3WXC#((C-XX383D3D3U3U3WXC#((C-XX 
. 	K$[1G$[1G3356689 !!((2273D3D3FQR2S	 z?a $'zx7J7J17M#N '9-Z5Sw///
   "g&7&7&99))'22	!!#77II##%77II##Q   :Ct''u5JJg YXs   >I70I<c                    t        |t        t        f      xr) |j                          xr t	        |j
                         S )z>
        Is this node unfusable under any conditions.
        )rZ   r  r  r  rL   rg   ry  s     r`   r  zScheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
rb   c                   |j                         t        j                  j                  k  ry|j	                         }|j                         }d}|||z  kD  r	 |d       yt        d |j                         D              }|t        j                  j                  j                  j                  fk(  r	 |d       yd	d} ||j                         j                        r|j                         s	 |d       yy)
zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]J  }|j                   <|j                   j                         D ]  }|j                  dk(  r|j                   ! L y w)Ncall_function)rg   r  r^   r  )r  ro  r)  s      r`   r  zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sT      
vv!VV'')	
 tt&	 HH

s   AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                <    | j                   dk  xr | j                  S )Nr#  )itemsizeis_floating_point)r+  s    r`   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBrb   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r+  ztorch.dtyper   r   )rd  rP   r   invoke_quant_opsr  r  ra  rc  r[   rl  rm  constant_pad_ndr  r:  r+  rr  )	rq   prologue_noderD  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  r  s	            r`   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHI!>>@h rb   c                <    t        |t              rt        |t              syt        |j                  t        j                        r$t        |j                  t        j                        sy|j                         s|j                         ryt        j                  dk(  ry|j                  |j                  }}|\  }}|\  }}|j                         s,|j                         s||k7  st        |      t        |      k7  ryt        |j                  j                        dkD  s"t        |j                  j                        dkD  ry j                  t        t        |j                  j                                    }	 j                  t        t        |j                  j                                    }
t!        |	|
      t        j"                  kD  ryd fd} ||      s ||      ryg }t%        t'        ||            D ]  \  }\  }}||k7  s|j)                  |       ! t        |      dk7  ry|d   }||   ||   }}t*        j,                  j.                  j1                  ||      r|||fS t*        j,                  j.                  j1                  ||      r|||fS y)ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        Nr  r    c                ~   | j                   j                  D ]  }|j                  j                  v rj                  |j                     }n%j                  j                  |j                        }|s]t        j                  j                  j                  ||       st        |j                  t              r y y)NTF)r   r   ru   r   r   r.  rP   r   r   r  rZ   ri   r  )rg   r  r  rq   s      r`   has_reusable_bufferzIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer/  s    ((..  99 ; ;; $ ; ;DII FI $ 0 0 4 4TYY ?I ,,66y$G&y'<'<>TU  rb   r   r  )rZ   rm  rg   r%   r  r(  r"   r  r  r  r   r   r   r  r?  r  r$  small_memory_access_thresholdr@  r  r  rP   r   r  statically_known_lt)rq   r  r  n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  r`   "get_expand_dim_for_pointwise_nodesz,Scheduler.get_expand_dim_for_pointwise_nodes  s]    %/z%7W uzz2#4#455::r'8'89 ))+u/M/M/O ) #\\5<<()1&)1& !!#/1=!S%77 u  ''(1,E4E4E4L4L0MPQ0Q "//T%:K:K:R:R5S0TU!//T%:K:K:R:R5S0TU"$67223 	  u%)<U)C !'0]M1R'S 	0#C#'7'!#**3/	0 "#q(*1-,',' ' 77//O66WW11..Q66rb   c                   ||u ryt        ||      }|j                         r0| j                  |j                               j	                  ||      ryt        |t              st        |t              r	 |d       yt        |t        t        f      r|j                         s	 |d       yt        |t        t        f      r|j                         s	 |d       y|j                         |j                  z  r	 |d       y|j                         r!t        j                  s	 |d       y|j                         s|j                         r	 |d       y|j                         }t        |t        j                         s	 |d	       y|j#                         }t%        d
 |j&                  D              |z
  }|j)                         |z  r	 |d       y|j+                         s|j+                         r	 |d       y|j-                         dd D ]B  }|j/                         }|D ]+  }	t1        fd|	j2                  D              r" |d         y D t        |t4              s|gn*|j6                  D 
cg c]  }
|
j                         s|
 c}
}t9        |      dk(  sJ |d   }t9        d   j:                        dk(  rSt9        d   j:                  d   j2                        dk(  r+d   j:                  d   j2                  d   j<                  |u s	 |d       y| j?                  |||      sy|j                         r9|j+                         s |j                         st        j@                  s	 |d       y|j)                         tB        jD                  jF                  z  s+|j)                         tB        jD                  jF                  z  r	 |d       y|j                         }|j                         }||k7  r |d||       y~| jI                  ||      }|t        jJ                  k  r)t        jL                  r| jO                  ||      }|dk\  r|}t        jP                  r>| jS                  ||      x}r*|\  }}}|jU                  ||       | jI                  ||      }tV        jY                  tZ        j\                        r4tV        j_                  d|ja                         |ja                         |       tB        jb                  je                  | |||      sy|j                         |j                  z  rY| jg                  ||      xrE tB        jb                  jg                  | |||      xr! | j                  |      jg                  ||      S tB        jb                  ji                  | |||      xr! | j                  |      ji                  ||      S c c}
w )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  <   K   | ]  }|j                           y wro   r]  )r  inps     r`   r  z%Scheduler.can_fuse.<locals>.<genexpr>  s     Ec3<<>Erb  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  :   K   | ]  }|j                   v   y wro   r  )r  r   prologue_nodess     r`   r  z%Scheduler.can_fuse.<locals>.<genexpr>  s     QttyyN:Qr  z7template prologue can only fuse nodes with a single user    r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)5r  r  r  r   can_fuse_multi_outputs_templaterZ   r?  r  r  rd  r   r"   prologue_fusionr  r:  r%   r  get_allowed_prologue_inpsr   rc  rh  r(  rc  r   rq  rl   r  r  r   r   rg   r  rf  rP   r   no_fuse_buffer_namesr  score_fusion_memory_thresholdr  r  $expand_dimension_for_pointwise_nodesr#  r  r  ru  rv  rH  r  rp   choicesr  can_fuse_verticalcan_fuse_horizontal)rq   r  r  r  r9  r\  unsupported_prologue_argsrg   	node_outsr   ro  template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizer(  s                        @r`   r  zScheduler.can_fuseW  si   
 E>u%4#3#3$

)
)%
7$8 e12j'7
 ABu8:PQR%%'()u8:PQR%%'()$$&8,-))01!!#u'8'8':HI779Hh(?(?@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--/53Q3Q3SPQ"__.N&s+ % ,,.	$ %CQsyyQQUV$%% "%);< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sS**,!!#))12""$qww'C'CC""$qww'C'CC56!!#""$W,fg> 44UEB D DD11$($J$J5RW$X!$)$9!66#FFueTTOT6E3Z{<<ZU $ 8 8 F))'--8##.  !	 yy!!$u6GH$$&8 &&ue4 MII//eUDUVM$$V,>>ueL 9900eU$5 M""6*>>ueLMS Bs   5WWc                   |j                         }t        ||      }t        t              }|j                  D ]j  }| j
                  j                  |j                  |j                        }t        |t              r| j                  |||      rW||   j                  |       l |j                  j                  D ]  }t        |t              s|j                  | j
                  j                  |j                  |j                              }	|	sV|	D ]&  }
| j                  |
|      s|	j!                  |
       (  t#        d t$        j&                  j)                  |j+                               D              }||z  r	 |d       y|j-                         }|D ]E  }| j.                  |   j1                         }|| j2                  |   j4                  z  s= |d        y y)a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  4   K   | ]  }|j                     y wro   r  r  s     r`   r  z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
 HH$
r  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rh  r  r   r   r   r   r.  ru   rZ   r0   fusable_weak_depr  r   r   r.   fusable_read_and_writer  r   r9  r:  r   r   rd  r   rr   rX  r   )rq   r  r  node1_buf_namesr  remaining_deps_by_namer  ru   cd	remainingr  remaining_depsnode1_op_namesrM  s                 r`   r0  zScheduler.can_fuse_vertical  s     002u%7B47H++ 	5C((,,SXXsxx@D#w'D,A,A#ue,T"4(//4		5 ##** 		-Bb),.22%%))"''277;I # -B222r:!((,-		- $ $
 445K5R5R5TU$
 

 O+
 +,224" 	D&&t,==?G 7 7 @ J JJ>?		 rb   c                    |j                   |j                         vry|j                  j                  D cg c]  }|j                   |j                  k(  r| }}t        |      dk7  ry|d   t        t              sJ t        j                  t        j                        ry| j                  |j                     }|j                  j                  D cg c]  }|j                   |k(  s| }}t        fd|D              S c c}w c c}w )NFr    r   c              3     K   | ]q  }t        |t              xr[ t        |j                  t        j
                         xr4 |j                  j                  k(  xr |j                  j                  k(   s y wro   )rZ   r.   r   r  r   TMPr  )r  r  rF  s     r`   r  z-Scheduler.fusable_weak_dep.<locals>.<genexpr>?  sm      

 	 tY' ('

DHH==(

ekk)( 		UZZ'(
s   A7A:)ru   rh  r   r   r  r   rZ   r.   r   r  r   rI  r0  r   rq  )	rq   weak_depr  r  rF  mutating_writesr8  r  relevant_readss	       `    r`   r?  zScheduler.fusable_weak_dep&  s
    == 6 6 88 **11
zzX222 
 

 1$"%+++u{{DHH5++H,A,AB	"..44
		Y8ND
 
  

 '
 
 	
#

s   "DD,Dc                   t        |t              rH| j                  j                  |j                  |j                        }||j                  k7  sHt        |j                  t        j                        s$t        |j                  t        j                        ryt        j                  r9|j                  |j                  k7  r |j                         }|j                         }|j                  |j                  k(  xr\ t        |j                        t        |j                        k\  xr/ |j                  d t        |j                         |j                  k(  S t        |t              r| j                  j                  |j                  |j                        }| j                  j                  |j                  |j                        }|j                   |j                   k(  r|j                   ||k(  ryyr   )rZ   r.   r   r.  ru   r   r  r   rI  r"   r  r  r  r   r  r/   r   )rq   r  rF  	read_name
write_names        r`   r@  z Scheduler.fusable_read_and_writeK  s`   dI&--11$))TYYGI UZZ'&tzz488<&u{{DHH=00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+rb   c                @    t         j                  j                  |      S ro   )rP   r   get_dep_size_hintr#  s     r`   r  zScheduler.dep_size_hintm  s    ww((--rb   c                2    t        |j                  j                        t        |j                  j                        z   }t        |j                  j                        t        |j                  j                        z   }t	        ||      dz  t        ||      k  r||kD  r|}|}|}|j                  j                  |j                  j                  z  D cg c]4  }||j                  j                  v s||j                  j                  v r|6 }}t         fd|D              S |j                  j                  |j                  j                  z  |j                  j                  |j                  j                  z  z  }t         fd|D              S c c}w )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        r  c              3  @   K   | ]  }j                  |        y wro   r  rH  s     r`   r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     ?3t))#.?r  c              3  @   K   | ]  }j                  |        y wro   r  rH  s     r`   r  z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     Is4%%c*Ir  )r   r   r   r   r  r$  r  )	rq   r  r  node1_dep_lennode2_dep_lentmpr  rD  common_memory_depss	   `        r`   r  zScheduler.score_fusion_memoryp  sh    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT }m,q03}m3TT}, !,,22U5F5F5M5MM%++111SE<M<M<T<T5T D  ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   9Fc                   t        |      dk(  r|S i }|D ]  \  }}|j                         |j                         k(  sJ |j                         }t        | j                  |      j	                  ||            }||vr	||fg||<   p||   j                  ||f        t        |j                         t        j                  d            d   }t        |      dkD  sJ |S )Nr   r  r    )
r   r   r   r  get_fusion_pair_priorityr  r  r  r  r  )rq   r  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           r`   r  z4Scheduler.get_possible_fusions_with_highest_priority  s   
  A%##  	+ - 	LE5##%)9)9);;;;%%'F#&  (AA%O$  $+MMENL23GH 33GHOOEN	 25.446H<O<OPQ<R2

2. 9:Q>>>55rb   c                B    t        j                  j                  | g| S )z-
        Shim for list.sort(key=...)
        )rP   r/  score_fusionr  s     r`   r  zScheduler.score_fusion_key  s     yy%%d3U33rb   c                    t        t        j                  j                               }t	        | j
                        D ]9  }|j                  || j                         |j                  |j                         ; y)zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rP   r   rQ  r
  rA  r4  r0  rl  r   )rq   r2  rg   s      r`   r\  zScheduler.compute_last_usage  s]    
 ))A)A)CDTZZ( 	8D 3T5L5LM&&t7	8rb   c                   t        | j                  t        j                  j                  z
  t        j                  j
                  j                  z
        D ]i  }|| j                  v rT| j                  |   }|j                         s2t        j                  j
                  j                  |j                         f|t        j                  j                  v st        j                  j                  |   }t        |t        j                        r*t        j                  j
                  j                  |       t        |t        j                        r|j                   }t        |t        j"                        r|j%                         sJ t        j                  j
                  j                  |j                          l | j                  j'                          y)z*Free any buffers that are no longer neededN)r  rb  rP   r   r  r   freedr   r   codegen_freerg   r  rZ   r%   r  r]  r   r+  is_input_bufferclear)rq   ru   r   r&  storages        r`   free_bufferszScheduler.free_buffers  sK   %%gg%%&gg""(()
 	DD
 t'''&&t,<<>GG((55chh?---gg**40c2#5#56GG((55c:R%6%67!hhG"7BMM:w?V?V?XXGG((55gllC)	D, 	!!'')rb   c                    | j                   j                         D ]  }|j                           | j                          y ro   )r6  r   flushrg  )rq   r  s     r`   ri  zScheduler.flush  s3    }}++- 	GMMO	rb   c                   t        |t              sJ t        d   dxx   dz  cc<   t        j                  t        d            5  |j                          |j                          d d d        |j                  }t        |t        j                        sJ dt        |             |j                  t        j                  j                         | j                          y # 1 sw Y   |xY w)Nr  extern_callsr    F)increase_kernel_countztype(node)=)rZ   r  r   rP   set_kernel_handlerr*   r  r6  rg   r%   rq  r}   r  r   r   rg  )rq   scheduler_noderg   s      r`   codegen_extern_callzScheduler.codegen_extern_call  s    .*CDDD
 	^,1,!!&u"EF 	&002##%	& ""$0B[T$ZM2BB0QWW))*	& 	&s   !C""C+c                P   t        |j                        r|j                  
J | d       t        j                  j                  |       t        |j                        }|t        d|j                         t               s|j                  dk(  rLt        j                  j                  |      x}j                  dk  rt        |t        j                               t        |j                        r,|j                  dk(  st!        t        j                                ||       S )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)rJ   r}   r  rP   r   add_device_infor)   r  r   r[   rq  get_device_propertiesmajorr1   inspectcurrentframer2   )rq   r  device_schedulingdevice_propss       r`   create_backendzScheduler.create_backend  s    &++&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII|v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$V[[E-A#G$8$8$:;; &&rb   c                    |J || j                   vr| j                  |      | j                   |<   | j                   |   S ro   )r6  r{  r~  s     r`   r  zScheduler.get_backend  sB    !!!&$($7$7$?DMM&!}}V$$rb   c                    d fd}|j                         D ci c]8  }|j                  *|j                  j                         D ]  } ||      |fd  : }}}t        |j	                               }|rMt        |t        j                  d            \  }}t        j                  j                  j                  |       y y c c}}w )Nc                    | j                   vrLj                   j                  t        | j                  j                        D  ci c]  \  }} | |
 c} }       j                       S c c} }w ro   )rc  rl  r@  r   rA  )ro  r>  rq   s     r`   	get_orderz*Scheduler.enter_context.<locals>.get_order  s\    ,,,$$++i>V,WdaQT,WX''** -Xs   A+
r   r  )ro  ztorch.fx.Noder   r   )rc  rg   r  r   r:  r$  r  r  rP   r   r   enter_context)rq   rg   r  ro  r)  r  rW  lasts   `       r`   r  zScheduler.enter_context
  s    	+ ^^%
vv!VV'')	
  q\1t#

 
 w||~&'x':':1'=>GAtGG  ..t4 
s   =Cc                    	 | j                   |   j                  }t        fd|D              xr || j                  vxr || j
                  vS # t        $ r Y yw xY w)NFc              3  ^   K   | ]$  }|j                   xs |j                         v  & y wro   )r  rp   )r  r   fused_node_namess     r`   r  zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>$  s)     VC3C CCVs   *-)r   rl   KeyErrorrq  r   r0  )rq   ru   r  rl   s     ` r`   $can_buffer_be_removed_through_fusionz.Scheduler.can_buffer_be_removed_through_fusion  sn    	$$T*00E VPUVV 4D1114D333	
  		s   A 	AAc                4    |j                   }t        |t        j                  j                  j
                        r|j                  }||t        v rt        |t        j                  j                        sJ t        |   }|j                         }|J t        j                  j                  j                  |      \  }}}	|sJ d        ||i |	}
|
S t        j                  j                  j                  j                  st         j"                  ydd}|rt$        n|}t        |t&              rt)         fd|j*                  D              S |j                   J |j-                         s |d|       yt        |j                   t        j.                        r |d|       yt        |j                   t        j0                        r |d	|       yt3        |j                   d
d      r |d|       yt5        |j                         r |d|       yy)zBReturn True if we should partition the inductor graph on this nodeNzOIf this op came from a custom inductor pass, make sure to run FakeTensorUpdatorTc                     y ro   r   )msgrg   s     r`   noop_logz,Scheduler.should_partition.<locals>.noop_logL  s    rb   c              3  @   K   | ]  }j                  |        y wro   )should_partition)r  rh  rq   s     r`   r  z-Scheduler.should_partition.<locals>.<genexpr>R  s     Mt,,U3Mr  znon gpu opsr  zDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opsF)r  r   rg   rh   r   r   )rg   rZ   r[   r  r%   r  r  rX   r\   r]   r  fx_utilsget_fake_args_kwargsr"   rX  rY  r?   wrapperrN   r  r'  r  rJ   
DeviceCopyConditionalr   rI   )rq   rg   
should_logir_noder  should_partition_fnr  success	fake_argsfake_kwargsr  r  log_partition_reasons   `            r`   r  zScheduler.should_partition)  s    ))gu11@@A**H#4P(P!(EJJ,A,ABBB&B8&L#!113***OO,,AA'J 0K  ew $7	#Q[#Q '' &&--886>>F	 AK<PXd./MMMMyy$$${{} T:dii/ !1=dii0 !2>499148 !7dC!$)), !>TJrb   c                    i }|j                  t        j                  j                         | j                  D ]3  }|j
                  j                         D ]  \  }}|j                  ||<    5 |S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )rl  rP   r   r  rA  r   r  rg   )rq   r  rg   ru   scheduler_buffers        r`   get_name_to_nodeszScheduler.get_name_to_nodesm  sr     UWAGG001JJ 	;D*.*>*>*D*D*F ;&&%5%:%:T";	; rb   c           	        t        t        j                  j                        D ci c]  \  }}||
 }}}t        t        j                  j	                               D ci c]  \  }}||
 }}}g t        j                  _        t        |      D ]  \  }}|j                  rg }|j                  D ]"  }|j                  |j                  |             $ g }	|j                  D ]0  }
|	j                  |j                  |
j                                      2 t        j                  j
                  j                  t        |||	|j                                yc c}}w c c}}w )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        N)r@  rP   r   r  rQ  partition_mapsskip_cudagraphinput_nodesr  r.  output_nodesrp   rF   constant_names)rq   
signaturesr  ru   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingrg   s              r`   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_maps}  sT    (11E1E'F%
##tD#I%
! %
 (11I1I1K'L&
##tD#I&
" &
 "$'0'< 	#L)''
 M!-- J$$%>%B%B4%HIJ  N!.. W%%&@&D&DT]]_&UVW GG""))! !",,	!	%
&
s   E!E c                  	
 d		fd		 	 	 	 d
	
fd
	 	 	 	 d	fd	 	 	 	 dd} t               j                  
fd|D         } |j                  fd|j                         D           ||      }t               }|D ]F  }t        j                  j
                  j                  |      }|j                  |j                         H t        t        |t        j                  d                  S )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        c                   t               }| j                         }t        |t        j                        r|j                  t        |j                        t        |j                        z  t        |j                        z         t        |t        j                        r!|j                   |j                               |S |
J d|        |S )Nz*Expect layout to be None but found layout=)r   maybe_get_layoutrZ   r%   Layoutrl  r   r  strideoffsetr  r  )rg   free_symbol_usesr   get_layout_symintss      r`   r  zGScheduler.get_graph_partition_symbol_inputs.<locals>.get_layout_symints  s    9C**,F&")), '' -"6==12"6==12
 fb&C&CD$++,>v}},MN
 $# ~ @I~ $#rb   c                ,   t        | t              r* t               j                  fd| j                  D         S | j
                  J | j
                  j                         } |j                  fd| j
                  j                         D          |S )z4
            Gets symbols used in node.
            c              3  .   K   | ]  } |        y wro   r   )r  rh  get_scheduler_node_symbol_usess     r`   r  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>  s     U4U;Ur  c              3  .   K   | ]  } |        y wro   r   )r  r  r  s     r`   r  zfScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>  s     U'$W-Ur  )	rZ   r  r   r7  r  rg   r  rl  r   )rg   r  r  r  s     r`   r  zSScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses  s     $ 23)z|))UU  99(((#yy==?###UTYY=R=R=TU $#rb   c                    t        | t        j                        r
t               S t        | t        j                        r |       S t        dt        |              )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )rZ   r%   r  r   r  r  r}   )rg   r  s    r`   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sO     $ 2 23!|#D")),)$// *,I$t**VWWrb   c                &    t        d | D              S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]N  }t        |t        j                  t        j                  t        j                  t        j
                  f      r| P y wro   )r   r   SIZEFLOATUNBACKED_INTUNBACKED_FLOAT)r  r  s     r`   r  zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sH      !		

))++	 s   AAr   )symbolss    r`   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         rb   c              3  .   K   | ]  } |        y wro   r   )r  rg   r  s     r`   r  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     It,T2Ir  c              3  4   K   | ]  \  }} |        y wro   r   )r  rW  rg   r  s      r`   r  z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>   s     Nwq$$T*Ns   ru   r  )rg   z	ir.IRNoder   OrderedSet[sympy.Symbol])rg   rT   r   r  )rg   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   r  )r  r  r   r  )r   r7  r  rP   r   r  simplifyrl  r   r  r  
attrgetter)rq   	partitionr  r  candidate_symbolsresr  symplified_sr  r  r  s           @@@r`   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputs  s    	$$	$#	$%	$"	XB	X%	X 	-	%	, 7Ijl6H6HIyI7
 	 N+:K:K:MN	
 ++<=(2" 	2A77++44Q7LJJ|001	2
 &(*=*=f*EFGGrb   c           	     B    g }t        t        j                  j                               } j	                         }d fdt        t        |      t        |            D ]  \  }}t               }|D ]+  }	|j                  |	j                  j                                - |j                  |      }
t        j                  j                  |D 	cg c]  }	|	j                   c}	      }t        |j                  |j                   z  D cg c]   } |j"                        s|j"                  " c}      |z
  }t         fd|D              }t               }|D ]  }	|j                  |	j$                          |D ci c]  }||v r|||    }}|D ci c]  }||v r	|||v rdnd }}|D cg c]  }||v r||vr| }}|
j                  |       t         fd|
D              }
|
D cg c]  } |      s||    }}|D cg c]!  }|t        j                  j&                  v s |# }} j)                  ||      }t+        ||||||      }|j-                  |       |j/                  ||
z
        } |ddd   S c c}	w c c}w c c}w c c}w c c}w c c}w c c}w )	z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        c                "   j                   j                  | d      }|yt        |j                  j                  t
              rKt        |j                  t        j                        r&j                  j                  | d      x}r |      S yy)z
            Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
            so graph partition should not take it as inputs or outputs.
            NFT)	r   r.  rZ   rg   r   r8   r%   MutationOutputr0  )rx  r   r8  is_none_layoutrq   s      r`   r  z?Scheduler.get_graph_partition_signature.<locals>.is_none_layout  sz    
 ""&&x6C{#((//:6chh(9(9:!%!8!8!<!<Xt!LLIL))44rb   c              3  V   K   | ]   }j                   j                  ||       " y wro   r0  r.  r  ru   rq   s     r`   r  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>J  ,      / ''++D$7/   &)TFc              3  V   K   | ]   }j                   j                  ||       " y wro   r  r  s     r`   r  z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>j  r  r  Nr  )rx  r   r   r   )r   rP   r   rQ  r  r  r
  rl  r   r:  r  r$   r5  r6  r   r   r   ru   r   r;  r  r5   r  r7  )rq   
partitionsskip_cudagraphsr  unmet_output_namesr  r  r  output_namesrg   returned_output_namesr   r  partition_input_namesrb  ru   r  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturer  s   `                      @r`   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature  s!    
'(@(@(BC--/	( *-Z (?";*
 ]	%I~ -7LL! A##D$8$8$=$=$?@A %1$=$=>P$Q! '11<<.78d!!8K  "-!2!2[5G5G!G-aff5   " %/ /1/ %!
 5?L ! =$++DOO<=
 2<' l4((K  2"<' d&::dE" " 2"<'D8L,L " " "(();<$. /1/ %! 2%d+ T"L  "7$!''BSBS:SN  !BB;M #:"# 12!6!<!<"%::"w]	~ $B$e 9$
""s0   I>

%J
=JJ.J*J!J&Jc                   |j                   j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j
                  j                         D ci c]$  \  }}|t        j                  j                  vr||& }}}|j                  D cg c].  }|j                         t        j                  j                  vr|0 }}|j                  D cg c]   }|t        j                  j                  vr|" }	}t        |j                  ||||j                  |	      S c c}}w c c}}w c c}w c c}w )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        )r  r  rP   r   r  r  r  maybe_get_namer  r5   r  r  )
rq   r  ru   r  r  r  r  rg   r  r  s
             r`   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures  sK    !* 5 5 ; ; =
f177222 &L
 
 '99??A
c177222 #I
 
 "..
""$AGG,C,CC 
 
 "00
177222 
 

 '##$$
 	
)






s   )D/')D5!3D;$%E c                p   	
 ddl 	t               g g t        |      D ci c]  \  }}||
 c}}d	 fd
d
fd}|D ]5  }t        |j                  j
                        |<   |   dk(  s. 
|       7 g }d}|t        |      k  rsr}r0	j                        \  }}|j                  |        ||       r0r0	j                        \  }}|j                  |        ||       r0|dz  }|t        |      k  rrzr}|t        |      kD  rt        d      |S c c}}w )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    |    | f}j                  |       rj                  |       y j                  |       y ro   )r  heappush)rg   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrq   s     r`   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  s>    ,T2D9O$$T*6H2ODrb   c                    | j                   j                  D ]*  }|   dkD  sJ |xx   dz  cc<   |   dk(  s# |       , y )Nr   r    )r   
succ_nodes)rg   	succ_noder  node_to_indegrees     r`   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  sT    !]]55 4	'	2Q666 +q0+#I.!3(3	4rb   r    z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                rg   rT   r   r   )	r  r  r@  r   r   
pred_nodesheappopr  r  )rq   rA  r  rg   r  schedule	num_itersrW  r  r  r  r  r  r  s   `       @@@@@@r`    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partition  sU    	9=CEGI4=e4DEysDsE	E 	E	4  	+D%()A)A%BT"%*$T*	+
 -/	#e*$#':)--(?@4%% *
 &--(;<4%% &
 NI #e*$#': s5z!  ] Fs   D2c           	     X   ddl m}m} t        t        j
                  j                               } ||| j                  | j                  t        t        j
                  j                  j                               |      \  }}| j                  |      } ||||      \  }}	||dz  k  r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r    )estimate_peak_memoryprepare_planning_infor  )rP  r  r  r   rP   r   rQ  r   rX  r  r:  r  )
rq   rA  r  r  r  default_peak_memoryr  reordered_nodesreorder_peak_memoryrW  s
             r`   rZ  z0Scheduler.maybe_reorder_for_minimizing_partition  s     	H"177#;#;#=>:O##qww++0023;
77 ??F!57"
Q
 !4s!::""rb   c                   g }g }g }dd}|D ]l  }| j                  |      }|r*t        |j                        dk(  r|j                  |       @|r ||      r|j                  |       \|j                  |       n ||z   |z   S )a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        c                    | j                         D ]0  }|j                  D ]  }t        |j                  t              r  y 2 yr   )r   rl   rZ   rg   r   )rg   r   r   s      r`   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user   sC    '') %99 %C%chh
;$%% rb   r   r  )r  r   r   r  )rq   rA  frontmiddlebackr  rg   r  s           r`   r[  z6Scheduler.reorder_for_partition_with_simple_dependency  s     *,*,(*	  	$D#44T:C(?(?$@A$ET"!&6t&<D!d#	$ v~$$rb   c                n   g }d}g }g }| j                   D ]S  }| j                  |d      }|r)||k7  r$|j                  |       |j                  |       g }|}|j                  |       U |r"|j                  |       |j                  |       | j                  ||      }| j	                  |       ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)r  )r  r  )rA  r  r  r  r  )rq   r  r  cur_partitionr  rg   r  r  s           r`   rW  zScheduler.graph_partition2  s     +-
')JJ 	'D#44Td4K3C!C!!-0&&~6 "-N  &	' m,"">277!? 8 

 	))*5:%%rb   c                    t        d      5  t        j                  j                  j                  r| j                         n| j                  | j                        	 cd d d        S # 1 sw Y   y xY w)NzScheduler.codegen)r   r[   r  r"   rW  _codegen_partitions_codegenrA  rv   s    r`   r  zScheduler.codegenR  sX    -. 	 ??))99 ((*]]4::.	 	 	s   AA&&A/c                <   ddl m} t        j                  j                  }t        | j                        }t        j                  j                         5  t        j                  j                  dd| ||       | j                  |       t        t        j                  j                  |      sJ | j                  |      }|t        j                  j                  _        t        j                  j                  j                          t        j                  j                  j                  t        j                  j                        \  }}ddd       t        j                  j                  j!                  j"                         t        j                  j                  j%                  ||       t        j                  j                  j&                  j)                  |j*                  D cg c]  }|j-                          c}       y# 1 sw Y   xY wc c}w )z,Codegen a partition given its inputs/outputsr    )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  r	  rP   r   r   r?  r9  set_current_wrapper_codeinit_wrapper_coder  rZ   r  r  write_prefixgenerateis_inferencedefine_subgraph_launcher_fnr  codegen_partition_call	allocatedrl  r  rp   )	rq   r  r  r	  r  graph_partition_idpartition_coderW  rg   s	            r`   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapperZ  s    	Bgg22!$"?"?@WW--/ 	TGG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQKKIVI8AAGG  5GG  --/ ! 4 4 = =agg>R>R SNA-	T0 	
889M9MN	334F	R	&&--)2)?)?@T]]_@	
7	T 	T8 As   C:H.HHc                L     t         j                  d fd       } |       S )Nc               3    K   j                          j                  ryt        j                  j                        rZj                  j                  J d       t
        j                  j                  j                  j                  j                         	 d  j                  rGt        j                  j                        r(t
        j                  j                  j                          d _        y # j                  rGt        j                  j                        r(t
        j                  j                  j                          d _        w xY ww)Ndevice should have an index)
%update_graph_partition_default_devicer@  rB   r}   r  rP   r   r   codegen_device_guard_entercodegen_device_guard_exit)r  rq   r  s   r`   ctxz1Scheduler.use_default_device_context.<locals>.ctx  s    66z:N**/@++000 2288D 1D $$??//553..3D//444 GG((BBD.2+	 ..3D//444 GG((BBD.2+s    BEC;  AE;AEE)r   zIterator[None])
contextlibcontextmanager)rq   r  r  r   s   ``` r`   use_default_device_contextz$Scheduler.use_default_device_context  s&     
	"	"	3 
#	3* urb   c                    t        |      dk(  r|d   j                  sy dd}	 	 	 	 	 	 dd}d }t        ||      D ]  \  }}|j                  r ||      } n |y t        ||      D ]  \  }}|j                  s |||      r y  || _        y )Nr    r   c                4    | d   j                         }|J |S r]  r   )r  partition_devices     r`   get_cudagraph_partition_devicezWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s'    (|668#///##rb   c                @    | D ]  }|j                         }||k7  s y yr   r&  )r  target_devicerg   r  s       r`   all_on_target_devicezMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s/     " !*]* ! rb   )r  rU   r   r  )r  rU   r*  r  r   r   )r   r  r  r@  )rq   r  r  r(  r+  cudagraph_partition_devicer  r  s           r`   r  z/Scheduler.update_graph_partition_default_device  s     z?a
1(D(D 	$
	$	5A		 &*"$'
J$? 	 Iy++-KI-V*	 &-$'
J$? 	 Iy''0D51 		 'A#rb   c                    | j                         \  }}t        |      dkD  rdt        |       d}t        |d       | j                  ||      5  t	        ||      D ]V  \  }}t        |      dk\  sJ dt        |              |j
                  r| j                  |       E| j                  ||       X 	 ddd       t        | j                        }t        j                  j                  j                  |       |dkD  rqt        j                  j                  J |t        t        j                  j                        k(  s.J d	| d
t        t        j                  j                                yy# 1 sw Y   xY w)z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r    zcudagraph partition into z partitionsr   )r  prefixz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )rW  r   rN   r#  r  r  r  r  r?  r9  rP   r   r   set_all_partition_namesr  )rq   r  r  r  r  r  num_partitionss          r`   r  zScheduler._codegen_partitions  sk    "&!5!5!7
Jz?Q-c*o->kJC)c"=,,ZD 		J(+J
(C J$	99~* KCPYNK[\* ++MM),33IyIJ		J d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@ 		J 		Js   A&E44E=c                   t         j                  rdd l}t        j                         }t               }t        |      D ]  }|j                  dk(  r/|j                  |j                  j                  j                  k(  r nQ|j                  |j                  f}||vs"J d|j                   d|j                   d       |j                  |        | j                  | _        | j                  rBt         j                   j"                  r(t$        j&                  j(                  j+                          |D ]  }t,        j/                  t0        j2                        r4	 t,        j5                  d|j7                         |j9                                | j=                  |       |j?                         x}r|| j                  k7  s |jA                         s|jC                         r| jE                          || j                  k7  r| j                  rGtG        | j                  jH                        r(t$        j&                  j(                  jK                          || _        tG        |jH                        rF|jL                  J d       t$        j&                  j(                  jO                  |jL                         || _(        | jR                  jU                  |jV                         |jC                         rP|jY                  t[        |j]                                     \  }	}
}| j_                  |      ja                  |
||	       n|jA                         r,tc        jd                  tf        |      }| ji                  |       n|jk                         rqtc        jd                  tl        |      }| j_                  |      }d	d
l7m8} d	dl9m:} tw        |||f      r|}nty        dtI        |             |j{                  |       nYtw        |t|        t~        f      r!| j_                  |      j                  |       n"tw        |t              sJ |j                          t         j                   j                  r| j_                  |      j                          | j                  jU                  |j                                | j                  jU                  |j                                tw        |t              r|j?                         }||jH                  dk7  s| j_                  |      j                         s| jE                           | j                  | j                  k7  rU| j                  J tG        | j                  jH                        r(t$        j&                  j(                  jK                          | jE                          y # t:        $ r( t,        j5                  d|j7                                Y nw xY w)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r  r    )CUDACombinedSchedulingr  ztype(self)=r  )Jr"   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r
  ru   filename_dynamoconvert_frame__file__linenorA  r@  rk  rX  autotune_at_compile_timerP   r   r   write_get_raw_stream_headerr   ru  rv  rH  r  rp   r  r   r  r   r  r  ri  rB   r}   r  r  r  r>  rb  rl  r   rF  r   rc  r  codegen_templater  r  r  ro  r  r   codegen.cuda_combined_schedulingr4  r  r  rZ   r#  codegen_combo_kernelr  rm  codegen_noder  r6  debug_sync_kernelcodegen_syncrG  rh  r  rd  ready_to_flush)rq   rA  r[   stackr  framer  rg   r  rC  rD  rE  backend_r4  r  r  s                   r`   r  zScheduler._codegen  s   44.++-E7A|D!%  JJ"22%--*E*E*N*NN~~u||4$ ,U^^,<Aell^ LJ J
  #99&&6==+Q+QGG  <<> L	!D.
IIO224 t$**v*d111~~''')JJLT000**/@++000 ,,FFH*0D'(5%||7V9VV7,,GGU $D%%,,T__=!484W4W)*51-   (99!8X !{{#<dC((."{{#=tD++F3T8h9O(PQ&G(KDJ=)9::,,T2D#5}"EF  (55d;!$(>???}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;*&v-((0??AJJLYL	!\ $"="== &&222 !4!4!9!9: $$>>@

a ! IIPs   3W

-W;:W;c                    |d   j                         }| t        j                  _        || _        |J | j                  |      }|j                  |      S )r  r   )r   rP   r   rf   rk  r  benchmark_combo_kernel)rq   r?  r  r  s       r`   rK  z Scheduler.benchmark_combo_kernelc  sW     1((* $!!!""6*--i88rb   c                   t         j                  sy|}|d   j                         }||j                  dk(  ryddlm} dg }}t        |      D ]  \  }}|j                         }	| j                  |	      rt        j                  d       	 | j                  |	      \  }
}t        j                  |
      rt        j                  d|        y		 ||
z  }|j                  |        	 | j                  |      \  }}}||z
  dk  xs |dk  }t        j!                  t"        j$                        rP||kD  s|r%t        j                  dt'        ||z  d             n$t        j                  dt)        ||z  d             ||z
  |k  xs |S # |$ r.}d
t        |      v rt        j                  d       Y d}~ y d}~ww xY w# |$ r-}d
t        |      v rt        j                  d       Y d}~y d}~ww xY w)rB  Tr   Nr}  rD  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFrx  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speeduprG  z3cannot fuse (benchmark): fusing causes %sx slowdown)r"   rK  r   r}   r  rE  r@  rc  r@  r  r  r  rz  r{  r   r  ru  rv  rH  r<   r=   )rq   rA  subkernel_nodesr  rE  rJ  
path1_listr>  rh  r?  r(  re  r)  rK  	ms2_clone_path2_listsmall_kernels                    r`   r  z!Scheduler.speedup_by_combo_kernelq  s   
 ,, #..0 >V[[E1;rZ!/2 	$HAu)I ##I.  R55i@D::b>$$U ! " 2ICd#7	$:
	*.*E*Eo*V'CK Y,9c	""7==1SyL  E#)C2
   Ic	#0
 Y$44M $ *c!f4$$]     	&#a&0  Y 	s<   ?F	&F? 	F<"F76F77F<?G1"G,+G,,G1c                p    | j                   |   }|j                  J |j                  j                         S ro   )r   rg   
get_layout)rq   rx  r   s      r`   get_buffer_layoutzScheduler.get_buffer_layout  s5    x(xx###xx""$$rb   c                   | j                   D ]  }|j                         s|j                  j                  D ]  }t        j
                  j                  j                  |j                        }|s9t        |      dk(  sHt        |j                  t        t        f      ri|j                         g k(  s}t        j
                  j                  j!                  |j                           y r|  )rA  rJ   r   r   rP   r   rB  r.  ru   r4   rZ   r   r8   r7   r  zero_dim_cpu_tensor_listrA  )rq   rg   r  r  s       r`   r?  z$Scheduler.update_zero_dim_cpu_tensor  s    JJ 	HD{{} ,,22 
HDWW3377		BF+F3u< *"MMJ8I+J! #OO-388<<TYYG
H	Hrb   )rA  zlist[ir.Operation]r   r   )r   z!dict[str, SchedulerDonatedBuffer]r   )r  r   r   r   r   )rx  r   r   r   )rg   rG  r   rT   r  )rh  rT   r   rU  )r   r  rA  r  r   tuple[float, str]ro   rA  r  r
  r   r  r  r   r   )r  r   r  r  r   rX  )r?  r  r   r   )r  rT   r  rT   r   zUnion[bool, Callable[[], bool]])rg   rT   r   rT   )r"  r  r   r   r  )rA  rU  r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r  rT   r  rT   r   r   )r  rT   r  rT   r  r   r   r   )r  rT   r  rT   r  z"Union[tuple[str], OrderedSet[str]]r   r   r  rT   r  rT   r   r   r  )r
  rT   rD  rT   r  r  r   r   )r  rT   r  rT   r   z/Optional[tuple[int, SchedulerNode, sympy.Expr]])rJ  r0   r  rT   r  rT   r   r   )r  r-   rF  r.   r   r   )r  r-   r   r   )r  rZ  r   rZ  )rA  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )rn  r  r   r   )r  r  r   BaseScheduling)r  r   r   r]  r  )ru   r   r  rL  r   r   r  )rg   rT   r  r   r   r   )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  list[GraphPartitionSignature]r   r   )r  rU   r  r^  r   r  )r  list[PartitionType]r  z
list[bool]r   r_  )r  r5   r   r5   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r  rU   r  r5   r   r   )r  r`  r  r_  r   z'contextlib.AbstractContextManager[None])r  r`  r  r_  r   r   r?  r  r   z(tuple[float, float, list[Optional[str]]])rA  rU  r   r   )rx  r   r   z	ir.Layout)Qr~   r   r   r/  r   r  rA  propertyrk  setterra  ry  r=  rI  rS   r_  rE  rD  r  r  rF  r  r  rV  r  r	  r  rM  r@  r  r  r  rO  rY  r  r  r  r  r  r  r  r  r  r#  r  r0  r?  r@  r  r  r  r  r\  rg  ri  ro  r{  r  r  r  r  r  r  r  r  r  r  rZ  r[  rW  r  r  r#  r  r  r  rK  r  rT  r?  r  r  s   @r`   re   re     s   
m
^	# & & ( (7#,"HsPjKZ(#T,	 6S(4#&$6:	808	8, (,	*  %	
 
&
> 
>*6
>	
>w@r
s(&s(/@s(	(s(j	>h,h	 hT..`?4 ,4 	:4 l,&,/@,	,\7&7/@7	7rM&M/@MMPM	M$&$/@$	$6< < !< =	<
 
<|OK&OK/@OK	OKb
9(9 )9 	9
 
9v`&`/@`	8`DXMt3&3/@3	3j

(9
BS
	
J D.J&J/@J	J<6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
 ;@B%B37B	BH	D '1' 
'ReH eH QeH 
"	eHN - @J 	& B"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@&	B&@(
 (
 +(
 
	(
T-;X	06-A--A;X-A	-A^Brh949	19I5V%
Hrb   c                      e Zd Zd fdZddZddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ	 	 	 	 	 	 ddZ		 	 	 	 ddZ
	 	 	 	 	 	 	 	 dd	Z	 d	 	 	 	 	 	 	 dd
ZddZddZddZddZ	 	 	 	 ddZddZ	 	 	 	 	 	 d dZ	 	 	 	 d!dZ xZS )"r]  c                0    t         |           || _        y ro   )r  r   rf   )rq   rf   r  s     r`   r   zBaseScheduling.__init__  s    "rb   c                R    | j                   r| j                   j                          y y ro   )rf   rg  rv   s    r`   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') rb   c                    t               S )z0Return a set of .codegen.common.BackendFeature()r   r~  s     r`   get_backend_featuresz#BaseScheduling.get_backend_features  s
    |rb   c                    t         )zO
        Check whether node1 and node2 can be vertically fused or not.
        r~  r  s      r`   r0  z BaseScheduling.can_fuse_vertical  
     "!rb   c                    t         )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r~  r  s      r`   r1  z"BaseScheduling.can_fuse_horizontal  rk  rb   c                     y)au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr   r  s      r`   r)  z.BaseScheduling.can_fuse_multi_outputs_template  s     rb   c                    |j                         s|j                         rt        j                  ||      S t        j                  ||      S )z 
        Fuse two nodes
        )r  r  rG  r  r  s      r`   rG  zBaseScheduling.fuse  sA     !1!1!3-225%@@%**5%88rb   c                    t         )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r~  )rq   r  s     r`   r  zBaseScheduling.group_fn  rk  rb   c                    t         )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r~  )rq   rD  epilogue_nodesr(  s       r`   r@  zBaseScheduling.codegen_template  s
     "!rb   c                    t         zD
        Generate a kernel given a list of pre-fused nodes.
        r~  )rq   rA  r
  r  s       r`   r	  z.BaseScheduling.generate_kernel_code_from_nodes  s
     "!rb   c                    t         rs  r~  ry  s     r`   rC  zBaseScheduling.codegen_node&  
     "!rb   c                    t         )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r~  rv   s    r`   rE  zBaseScheduling.codegen_sync,  ru  rb   c                     y)z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr   rv   s    r`   rF  zBaseScheduling.ready_to_flush2  s    
 rb   c                    t         )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r~  rv   s    r`   ri  zBaseScheduling.flush9  ru  rb   c                    t         )r  r~  r  s     r`   r  z$BaseScheduling.benchmark_fused_nodes?  
     "!rb   c                    t         )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r~  )rq   r  s     r`   r  z)BaseScheduling.benchmark_codegened_moduleH  s
    
 "!rb   c                     y)z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r   r  s      r`   rZ  z'BaseScheduling.get_fusion_pair_priorityO  s     rb   c                    t         )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r~  r>  s     r`   rK  z%BaseScheduling.benchmark_combo_kernelX  rz  rb   )rf   zOptional[Scheduler]r   )r  r  r   zOrderedSet[BackendFeature]r[  r  )r  r-  r   z"tuple[tuple[sympy.Expr, ...], ...])rD  rT   rq  r  r(  r  r   zOptional[str]ro   rY  )rg   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   rW  )r  r   r   rX  r\  ra  )r~   r   r   r   rg  ri  r0  r1  r)  rG  r  r@  r	  rC  rE  rF  ri  r  r  rZ  rK  r  r  s   @r`   r]  r]    sV   #*"&"/@"	""&"/@"	"
&
/@
	
	9&	9/@	9		9"3"	+""(" 4" 4	"
 
"$ (,		"*	" 	" %		"
 
	"""""0"	""&/@	"4"	1"rb   r]  )r^   ztorch._ops.OpOverloadr_   zCallable[..., bool]r   r   )r   z$torch._inductor.codecache.LocalCache)rh  rT   r   r   )rh  rT   r   zOptional[Callable[[Any], Any]])rh  rT   r   r   )r  r   r   r   )rg   rT   rX  rO  r   zdict[str, SchedulerBuffer]r   r   )r3  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r   )r3  r~  rf   re   r  rU  r   r   )r   )r   zlist[list[int]]r  r.  r  r,  r   z	list[int]r   )
__future__r   r  r!  r   r  rw  r9  rv  rz  r  rr  r  r  r7  r  r   r   r   r   r   r	   r
   r   r   typing_extensionsr   r   collections.abcr   r   typesr   weakrefr  r[   torch._inductor.async_compiletorch.utils._pytreer{  _pytreerf  torch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   r   torch.utils._tritonr   r   r!   r"   r#   r$   r%   r&   analyze_preserves_zero_maskr'   codegen.commonr(   r)   r*   comm_analysisr+   r,   r-   r.   r/   r0   excr1   r2   r  r3   r4   r5   r6   r7   r8   	loop_bodyr9   rP  r:   r;   runtime.runtime_utilsr<   r=   r  r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   virtualizedrP   	getLoggerr~   r   _logginggetArtifactLoggerr  r  r  r   rU   r   rV   rW   WeakKeyDictionaryrX   ra   	dataclassrd   r   rT   r&  r  r  rs  r!  r  r   r   rW  r  r  rm  r8  r@  r  r  r?  r	  r  r8  r7  r  re   r]  r   rb   r`   <module>r     si   "          	     , R R R 2 2     $ $ $ 6 ? 7 M > / O O * D D D M M ; : 2 $    J 7 &    &  g!^^--hA
NN44XO  >>;;$    34y 4T]t_
 G    
,,
, 
," h8 h8 h8V 4_ 4 4b1 b1J 2 2(' <
 
,  &K
&K4&K ,&K 
	&KRW 1 W"5. 5k*% k*\	@	$@ $ 
	,l** l*^~:!3 ~:B
_, _J #%+#++  + 	+\ 
 
 
> +9??, 4L4H L4H^hN" N"rb   