
    i                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d	d
lmZmZ d	dlmZ erd	dlmZmZ d	dl m!Z! d	dl"m#Z#m$Z$m%Z%m&Z&m'Z' d	dl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ d	dl0m1Z1  ejd                  e3      Z4ejj                  jm                  e3d      Z7erd dl8m9Z9 	 	 d1dZ:d2dZ;d2dZ<	 	 	 	 d2dZ=	 	 	 	 d2dZ>e G d d             Z?d3dZ@d4dZAd ZBd5d6dZCd ZDd7dZEd  ZF	 	 	 	 d8d!ZG	 	 	 	 d9d"ZH	 	 	 	 	 	 	 	 	 	 d:d#ZI	 	 	 	 d;d$ZJe G d% d&             ZK	 	 	 	 d<d'ZL	 	 	 	 d2d(ZMd=d)ZNd* ZOd+ ZP	 	 	 	 d2d,ZQd>d-ZRd?d.ZSd/ ZT	 	 	 	 	 	 	 	 d@d0ZUy)A    )annotationsN)defaultdict)	dataclass)AnyOptionalTYPE_CHECKINGUnion)trace_structured)StorageWeakRef)
OrderedSet   )configir)WeakDep)IRNode	Operation)SchedulerBuffer)estimate_peak_memoryestimate_peak_memory_allocfreeFreeableInputBufferget_freeable_input_bufSNodeMemory)contains_collectivecontains_waitfind_recursive_deps_of_nodefind_recursive_users_of_nodeis_collectiveis_fallback_opis_wait)Voverlap)BaseSchedulerNodec                   i }| D ]  }|j                         ||<    dd lm} ddlm} |j                         } |       }t        |      D cg c]  }g  }}|j                  |t        |j                               |       t        j                  t        j                  |      d      j                  j                         }	t        t        |             D ]  }
|	|
   | |
   _         y c c}w )Nr   )_get_default_group)dim)get_estimated_runtimetorch.distributeddistributed"torch.distributed.distributed_c10dr$   get_world_sizerangeall_gather_objectlistvaluestorchmediantensortolistlenoverride_estimated_runtime)snodesruntime_estimationssnodedistr$   
world_sizepg_gathered_runtime_estimationsmedian_runtime_estimationsis              O/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_inductor/comms.py6align_runtime_estimations_across_all_distributed_ranksr@   7   s      C%*%@%@%BE"C$E$$&J		BCHCT6Uar6U 6U$d+>+E+E+G&H" "'12"fVVX  3v; M/I!/Lq	,M 7Vs   	C+c                     t        | ddd      S )z7
    Greedily schedules waits as late as possible.
    FTraise_comms
sink_waitsreorder_for_overlap_schedule_for_commr5   s    r?   rD   rD   M   s     Ed     c                     t        | ddd      S )z8
    Greedily schedules comms as early as possible.
    TFrB   rF   rH   s    r?   rC   rC   V   s     DU rI   c                     t        | ddd      S )a  
    This achieves the following overall scheduling procedure:
        Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes
            that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N.
        Step 2: If all those compute nodes are sufficient to overlap comm N, we're done.
            Otherwise, we now need to look elsewhere to find compute that overlaps with comm N.
            We prioritize compute nodes that are needed sooner.
        Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1.
        Step 4: We schedule comm N + 1.
        Repeat this for subsequent comm nodes.
    TrB   rF   rH   s    r?   reorder_compute_for_overlaprL   _   s     DTt rI   c                "    t        |       \  }}|S )a  
    Reorders communication ops relative to computation ops to improve communication-compute overlapping and hide comm
    latency.  Stops moving a particular op if it reaches a point that would have increased the peak memory footprint.

    Currently, follows these heuristics (subject to change or tune):
    - never reorders collectives relative to one another, for SPMD safety
    - has an option for per-collective prefetch limit, but does not enable it by default
    - limits the total number of reorder steps to some factor of the graph size to prevent worst-case quadratic
      performance

    Prerequisite: sink_comms_and_waits - ensure comm and wait nodes are scheduled as late as possible, respecting data
    dependencies.  That allows reorder_communication_preserving_peak_memory to take a best case peak-memory snapshot,
    and then monotonically improve latency by moving collectives backward in time.

    Peak memory impact is computed in an iterative fashion.  First, memory use at each timestep is computed, and global
    peak memory is computed as a max over timesteps.  Then, when swapping any two adjacent nodes, only the curr-memory
    for the earlier of the nodes after the swap is affected.  This enables checking step by step whether a swap is
    peak-memory-safe, and bailing out if not.  Example:

    0   n0      C0
    1   n1      C0 + Allocs(n1) - Frees(n1)
    2   n2      C0 + Allocs(n1) - Frees(n1) + Allocs(n2) - Frees(n2)

    0   n0      C0
    1   n2      C0 + Allocs(n2) - Frees(n2)    <-- After moving n2 to Time 1, only time1 memory changes
    2   n1      C0 + Allocs(n2) - Frees(n2) + Allocs(n1) - Frees(n1)

    )6_reorder_communication_preserving_peak_memory_internal)r5   reordered_snodes
node_statss      r?   ,reorder_communication_preserving_peak_memoryrQ   r   s    @ 	?vF !j rI   c                  v    e Zd ZU dZdZded<   dZded<   dZded<   d	Zd
ed<   d	Z	d
ed<   dZ
ded<   ed        Zy)ReorderInfozE
    Debug info describing how an individual snode was reordered
    floatinitial_exposedfinal_exposedNonestrlimiting_factorr   intmovesgrouped grouped_infoc                4    | j                   | j                  z
  S N)rV   rW   )selfs    r?   improvementzReorderInfo.improvement   s    ##d&8&888rI   N)__name__
__module____qualname____doc__rV   __annotations__rW   rZ   r\   r]   r_   propertyrc    rI   r?   rS   rS      sV      OUM5!OS!E3NGSL#9 9rI   rS   c                    | yt        | t        j                  j                  j                  j
                        ryt        | dd       x}rd|v ryy)NFTpython_kernel_nameextern_kernels)r   r/   opsaten#_scaled_dot_product_flash_attentiondefaultgetattr)noderl   s     r?   is_gemm_likert      sX    |		::BB  &d,@$GGG
0
0rI   c                    ddl m} t        | |      rt        d | j                  D              S t        | j                        S )Nr   GroupedSchedulerNodec              3  2   K   | ]  }t        |        y wra   )contains_gemm_like).0xs     r?   	<genexpr>z%contains_gemm_like.<locals>.<genexpr>   s     ?Q%a(?s   )torch._inductor.schedulerrw   
isinstanceanyr5   rt   rs   )r7   rw   s     r?   ry   ry      s4    >%-.?%,,???EJJ''rI   c                    ddl m} t        | |      r&| j                  r| j                  D ]
  } ||        y  ||        y )Nr   rv   )r}   rw   r~   temp_groupingr5   )r7   fnrw   _snodes       r?   _temp_group_visit_leavesr      s;    >%-.53F3Fll 	FvJ	 	5	rI   c                    d}| j                   D ]<  }|r|dz  }||j                         z  }|s |t        | j                                z  }> |S )Nr^   r;   )r5   get_namer-   get_buffer_names)r7   	with_bufsretns       r?   _group_namer      s[    
C\\ 73JCqzz|d5113456C7 JrI   c                >    t        | t              xr | j                  S ra   )r~   r   is_fake)ds    r?   _is_fake_depr      s    a!/aii/rI   c                f    dj                  | D cg c]  }|j                          c}      S c c}w )N~)joinr   )gnsgns     r?   _group_namesr      s%    88S1rR[[]1221s   .c                    t        | |      }t        | ||      \  }}}}t        t        | |            }d|d<   |||||fS )z*Initialize memory tracking data structures)r   r   N)r   r   dictzip)	r5   graph_inputsgraph_outputsname_to_freeable_input_bufpeak_memorysnodes_curr_memorysnodes_allocfreebuf_to_snode_last_use_curr_memorys	            r?   _initialize_memory_trackingr      sg    !7!M&.	
 MK#%57L
 F$678LL" rI   c                    i }i }t        |       D ]5  \  }}|dkD  r| |dz
     nd||<   |t        |       dz
  k  r| |dz      nd||<   7 | d   }|||fS )z/Create double-linked list structure from snodesr   r   N)	enumerater3   )r5   _prev_nextr>   r7   _heads         r?   _initialize_double_linked_listr      s{     EEf% F5()Ava!e}4e()CK!O(;va!e}eF 1IE%rI   c                  23456789:;<=>? d}| D ]  }t        |      sd} n |s| i fS ddlm} t        |       }t	        t
        j                  j                  j                               }t	        t
        j                  j                               }t        | ||      \  }2?6}| D ci c]  }|t        |       c}=i }		 	 	 	 	 	 d)=fd}
d}t        |       \  543	 	 	 	 	 	 d*4fd}345fd}2789:;fd}2678;?fd	}t        j                  }d}3}t        j                  }d}4|   |rnt        |      r|||k\  rn|dz  }t!               x}|	|<    |
| |4|   d
            x|_        |_        5|   }|}|;2|   d   :|wt        |      r	d|_        nc ||;      9 ||j(                  9d      }|j*                  D ci c]  }t-        |      r|j.                  | }}|j1                         }d
}|D ](  }|j3                  |j5                         d
      x}s&|} n |	 	 	 	 d+d} ||      \  }} |r?|}t7        :2|   d         :|xj8                  dz  c_        t;        9      |_        5|   }d| dt?        |j                                d|j5                          d|jA                         g dt;        9       d|  }!|!|_        n&?|   77jB                  7jD                  z
  8tG        t>              }"6jI                         D ]I  \  }#>|#jJ                  jL                  }$||$vr!tO        >fd9D              s6|">   jQ                  |#       K  ||9|"      \  }%}&|%|kD  rd|% d| |_        n}|xjR                  dz  c_)        |dz  } |||;        |
| |4|   d
            |_         ||9|"|&       |r+ddl*m+}'  |'|9t;        9       |3d
      |||2?d|"      }|rn	5|   }|w4|   }4|   |	}(|(D ci c]  }||(|   jX                   })}t[        |)D cg c]  }|)|   	 c}      }*t[        |(D cg c]  }|(|   jR                   c}      }d|* d| d<g d}+|(jI                         D ,cg c]^  \  }},t]        |      |,j"                  |,j$                  |,jX                  |,j&                  |,jR                  |,j8                  |,j<                  g` }-}},t^        j`                  jc                  d      rddl2m2}. < |.|-|+       z  <n8<d!z  <<tg        |+      d"z   z  <<d"ji                  tk        tf        |-            z  < |3d
      }/t        |/      |k(  sJ tm        |/||      \  }0}1}1}1<d#| z  <<d$|0 z  <tn        jq                  <       ts        d%d& <fd'(       |/|	fS c c}w c c}w c c}w c c}w c c}w c c},}w ),z
    Internal testing helper that also returns debug info.
    Returns:
        - reordered snodes list
        - dict {snode: ReorderInfo}
    FTr   rv   c                    t        |       }d|D ].  }t        |      rt        |      r ndfd}t        ||       0 t	        d|z
        S )N        c                    |    z  y ra   rj   )r   compute_timeruntimess    r?   accumulate_timezs_reorder_communication_preserving_peak_memory_internal.<locals>.exposed_communication_time.<locals>.accumulate_time7  s     00rI   r   )r   r"   returnrX   )estimate_op_runtimer   r   r   max)collective_snoderemaining_snodes	comm_timer7   r   r   r   s        @r?   exposed_communication_timezZ_reorder_communication_preserving_peak_memory_internal.<locals>.exposed_communication_time)  sa     ((89	% 	=E"5)U# 1 %UO<	= 1i,.//rI   c                P    g }| }	 ||j                  |       ||k(  r	 |S |   }!ra   appendheadtailr   r   r   s       r?   _group_nodeszL_reorder_communication_preserving_peak_memory_internal.<locals>._group_nodesB  A     }

1Dy
 aA rI   c                n    |    }|r||<   ||<   |   }|r| |<   || <   || <   | |<   | k(  r|y y ra   rj   )	candidate
group_head
group_tailcandidate_prevgroup_tail_nextr   r   r   s        r?    _perform_double_linked_list_swapz`_reorder_communication_preserving_peak_memory_internal.<locals>._perform_double_linked_list_swapO  ss     y)$.E.!*j  
+%.E/"*i &i%j IE rI   c                r   i }d}|s)t        z
     d   z
  j                  z         }||fS  }D ]S  }|   d   |z   }|||<   t        ||      }|j                  |d       }|4|D ]  }	||	j                  j                  z  } U    d   |z   j                  z   }
|
|| <   t        ||
      }||fS Nr   r   )r   
size_allocget
mpi_buffer	size_free)r   group_ns/group_n_to_bufs_after_swap_dealloc_by_candidate_post_alloc_updatepotential_peakmem_after_reorder_deltar   gn_post_alloc_membufsbufcandidate_mem_post_allocr   candidate_allocfreecandidate_delta_memr   group_peak_memoryr   s              r?    _calculate_potential_peak_memoryz`_reorder_communication_preserving_peak_memory_internal.<locals>._calculate_potential_peak_memoryi  s0   
 <>> !$77Z(+%&%001N "#555 )<'; 		HB ,R 0 36M M%6r" 1BCNBFFr4PD HC+s~~/G/GG+H		H $Q'%&!,,- 	!
 )A9%^-EF111rI   c                   |sK|D ]  }|   }|d   z
  |d   z
  f|<       d   j                   z   }|j                  z
  }||f| <   y |j                         D ]  \  }}|D ]  }	| |	<   	  d}
|D ]R  }||   }t        d ||   D              }|
|z  }
|   xj                  |z  c_        ||   j                  z
  }||f|<   T ||    }|    xj                  |
z  c_        ||    j                  z
  }||f| <   y )Nr   r   c              3  H   K   | ]  }|j                   j                    y wra   r   r   rz   r   s     r?   r|   zu_reorder_communication_preserving_peak_memory_internal.<locals>._update_memory_tracking_after_swap.<locals>.<genexpr>  s"      6 ((6    ")r   r   itemssum)r   r   r   r   r   cm_candidate_post_alloc_mem_candidate_post_free_memr   r   "size_free_to_move_to_candidate_sumr   _gn_post_alloc_memsize_free_to_move_to_candidategn_post_free_memcandidate_post_free_memr   r   r   r   r   r   s                   r?   "_update_memory_tracking_after_swapzb_reorder_communication_preserving_peak_memory_internal.<locals>._update_memory_tracking_after_swap  s    ? !"%qE//qE//$R  Z(+.A.L.LL & *,?,I,II % *('L#  =BBD	7 
 7-6%c*7		7 34* 
	EA&8&;25 6J1M6 3* /2PP.Q))-KK)$69I!9L9V9V$V13CDLO
	E %7y$A!#--1SS-%(8(C(M(MM 	  &##
YrI   Nr   zcollective orderingr   c                4    t        |       ryt        |       ryy)N)Fr   )Fry   TN)r   ry   )r   s    r?   is_groupablezL_reorder_communication_preserving_peak_memory_internal.<locals>.is_groupable   s     /y9#?-i8#>)rI   data dependency (dep_names:)
 candidate:z(outs:)dep on 
 non_group_reason:c              3  (   K   | ]	  }|k(    y wra   rj   )rz   r   snode_last_uses     r?   r|   zI_reorder_communication_preserving_peak_memory_internal.<locals>.<genexpr>:  s     Br^3Bs   peak memory new:	 vs base:!_debug_iterative_memory_recomputerQ   zAreorder_communication_preserving_peak_memory improved overlap by z
 ns after z reorders.
)zCollective nodezinitial exposedzfinal exposedrc   limiting factorr\   r]   r_   tabulater   headers>Please `pip install tabulate` to nicely render overlap stats.

z
 peak_memory_before:z
 peak_memory_after:artifactc                     dddS )NrQ   stringnameencodingrj   rj   rI   r?   <lambda>zH_reorder_communication_preserving_peak_memory_internal.<locals>.<lambda>  s    B 
 rI   c                      S ra   rj   )reorder_log_strs   r?   r  zH_reorder_communication_preserving_peak_memory_internal.<locals>.<lambda>  s    ? rI   metadata_fn
payload_fn)r   r"   r   list[BaseSchedulerNode]r   rU   r   Optional[BaseSchedulerNode]r   r  r   r
  )r   r"   r   ztuple[bool, Optional[str]]):r   r}   rw   r3   r   r    graphr   keysget_output_namesr   r   r   r   (reorder_iterative_debug_limit_to_reorder(reorder_iterative_debug_memory_recomputerS   rV   rW   rZ   	schedulerunmet_dependenciesr   r  get_outputsr   r   r   r]   r   r_   r-   r   r   r   r   r   r   
succ_nodesr   r   r\   comms_debugr   rc   r   node_summary	importlibutil	find_specr   rY   r   mapr   overlap_loginfor
   )@r5   has_collectivesr7   rw   original_snodes_numr   r   r   r   statsr   total_movesr   r   r   r    debug_num_collectives_to_reordernum_processed_collectivescurr debug_iterative_memory_recomputeiterative_recompute_errorr  r   r   groupr   	data_depscandidate_outsdata_depor   is_groupable_resultgrouping_reasonmsgr   r   r  r   r   r   rP   rc   total_improvementr   	node_inforowsr   
new_snodesnew_peak_memoryr;   r   r   r   r   r   r   r   r   r   r   r  r   r   r   s@                                                     @@@@@@@@@@@@@@r?   rN   rN     s^    O u%"O rz>f+$.qww/C/C/H/H/J$KL%/0H0H0J%KM 	$FL-H" 8>0.3"5))0H 35E0+0?V0	0* K8@E5%)1L	 4&2 &2P5
 5
p 	77 % &'D'-'V'V$ %
+
!$t$/;)-MM%*%!,.D5;8Rl5;59 D 4#5 dIJJ ,T 21 5'&y1+@D(/;J
/S,NN"& (-'?'?"#|TUAFFAI	  "+!6!6!8' A%MM!**,==q=#$
 '	*#4	*3	* <H	;R8'*%.
,/-|I/Fq/I-) ),8,=)$))$4	  /xjDIYDZC[ \,,5,>,>,@+AIcIcIeHfGg h&&23&7%82?2CE  03,3CI3N#'225H5R5RR $"  % @ +002" "!$!:!:J 
2 BcBB C&fSk" 6Vs$S62 2 "K/*>*:)K=Q ( 

a
q 0J
S%?,uT{D9&" 3C&	 4 O0Q!$S)$UD12%#$(FG1- 1!*-	W 'X T{ +
!B JEOPE5*U+777PKP[IE[/IJJG5z%(..GHK LL]K^ _l	, 	G* !+ 0 0 2 E9 %%##!!%%OO""		
D  ~~
+%8
 	

 	M	
 	3w<$..499Sd^44eT*Jz?1111=. OQ1 /}==O..?@@O_%
 + u_0`@ QIG s+   -WW&WW WW$A#W)c                    i }i i i i ct        |       D ]y  \  }}|j                         D ]  }|||<   	 |j                         D ]  }||<   	 ||j                         <   |j                         }	t        j
                  |	<   d|	<   ||	<   { d}
| D ]  }|rZt        |      rO|
|j                         <   |j                  D ]'  }|   j                         }t        |   |
      |<   ) |
dz  }
_|sbt        |      snd|j                         <     G fdd      | D ci c]  }|t        d |j                  D              ! c}g t        t              | D ci c]  }|t        |       c}j                         D ]J  \  }}t        |      dk(  rt!        j"                   |             |D ]  }|   j%                  |        L g fdfdfd}t              rIt!        j&                        j(                  }|rt        |      r	 ||       n |       t              rIj                         D ]  \  }}t        |      dk(  rJ d	         S c c}w c c}w )
a  
    Schedule `snodes` for various comm optimization objectives.

    Args:
        snodes: the nodes to be scheduled.
        raise_comms: whether to greedily schedule collectives as early as possible
        sink_wait: whether to greedily schedule waits as late as possible
        reorder_compute_for_overlap: whether to reorder compute nodes to
            optimize for compute/communication overlapping.

    Returns:
        The new schedule order.

    Some notes on the synergy between different options:
        - `raise_comms` provides more overlapping oppurtunies for `reorder_compute_for_overlap`.
        - When both `raise_comms` and `sink_waits` is `True`, `raise_comms` is prioritized.
    r   r   c                  &    e Zd Zd fdZd Zy)$_schedule_for_comm.<locals>.Runnablec                    || _         t        t        |j                                     }|   j	                         }|   |   |   f| _        y ra   )r7   nextiterget_operation_namesr   score)rb   r7   r  
fused_namename_to_fused_nodescores_0scores_1scores_2s       r?   __init__z-_schedule_for_comm.<locals>.Runnable.__init__  sV    DJU6689:D+D1::<J$$$DJrI   c                4    | j                   |j                   k  S ra   r;  )rb   others     r?   __lt__z+_schedule_for_comm.<locals>.Runnable.__lt__  s    ::++rI   N)r   rX   )rd   re   rf   rA  rE  )r=  r>  r?  r@  s   r?   Runnabler6    s    	 		,rI   rF  c              3  4   K   | ]  }|j                     y wra   )r  )rz   deps     r?   r|   z%_schedule_for_comm.<locals>.<genexpr>  s     Gs#((Gs   c                    j                  |        | j                         D ]N  }|   D ]D  } |    j                  |       t        |          dk(  s)t	        j
                   |              F P y)zU
        Schedules `snode` and put all unblocked nodes onto the ready queue.
        r   N)r   r   remover3   heapqheappush)r7   buf_namerF  buffer_usersready	scheduled
unmet_depss     r?   schedulez$_schedule_for_comm.<locals>.schedule  sv     	..0 	;H%h/ ;5!((2z%()Q.NN5(5/:;	;rI   c                     D  cg c].  } t        | j                        st        | j                        s| 0 }} t        |      dk(  ryt	        |d       S c c} w )zh
        Return the next node in the ready queue that's neither a collective or
        a wait.
        r   Nc                    | j                   S ra   rC  r{   s    r?   r  zG_schedule_for_comm.<locals>.get_overlapping_candidate.<locals>.<lambda>,  s
    QWW rI   key)r   r7   r   r3   min)r{   
candidatesrO  s     r?   get_overlapping_candidatez5_schedule_for_comm.<locals>.get_overlapping_candidate   s]     
&qww/agg8N 

 

 z?a:#455
s   3Ac                   t        |       sJ  |        |    }|dkD  rM        x}Dj                  |        |j                         ||j                     z  }|dkD  r
        x}Dt        j                         y)z
        Schedules collective node `snode`, along with one or more compute nodes
        to overlap with it. The strategy is described in the comment of
        `reorder_compute_for_overlap`.
        r   N)r   rJ  r7   rK  heapify)r7   collective_costr   rZ  rO  rR  snode_to_costs      r?   schedule_collective_for_overlapz;_schedule_for_comm.<locals>.schedule_collective_for_overlap.  s     #5)))'.a799FLL#Y__%}Y__==O a799F
 	erI   z;Detected unscheduled nodes. Nodes with unmet dependencies: )r   r   r:  r   sysmaxsizer   	ancestorsrX  r   r   r  r   r   r   r3   rK  rL  addheappopr7   )r5   rC   rD   rE   buf_name_to_snodeidxr7   rM  op_name	node_namecomm_idxancestoranc_fused_namedepsrH  r_  rF  rN  rZ  r=  rO  rR  rP  r>  r?  r@  r^  rQ  s                   @@@@@@@@@@@@r?   rG   rG     s   L #%r2 Hh' "
U..0 	0H*/h'	0 002 	0G*/w'	0/45>>+,NN$	!kk!" H +.u5)1HU^^%&!OO S!3H!=!F!F!H+.x/G+R(S MHM%0)*HU^^%&+, ,  < 	zGe.F.FGGG<J
 E=H=TLDJK5U/66KM!'') )tt9>NN5(5/2 	)C!!%(	)) I	; 	;6& e*e$**#6u#=+E2UO e* "'') 
t4yA~ 	
I*V	
~
 Q< Ls   9$J5Jc           	     z   t         j                  j                         s| S | D cg c]  }t        |      s| }}t	        dt        |            D ]b  }t        t        ||   j                                     }||dz
     j                         D ]"  }||   j                  t        ||d             $ d | S c c}w )z
    Decide global ordering of comms, by just enforcing the ordering that's in the input graph
    (might not be the same ordering as the eager mode program).
    TODO: Come up with a better approach
    r   Tmutating_bufr   )r/   r(   is_availabler   r+   r3   r8  r9  r   add_fake_depr   )nodesname_to_bufr=  r   
comm_nodesr>   ro  r   s           r?   decide_global_ordering_of_commsru  O  s     ))+"=&9!&<!=J=1c*o& DA!?!?!ABCa!e$557 	CqM&&,E	 L >s
   B8B8c                  T    e Zd ZU dZded<   dZded<   dZded<   dZded<   d	Zded
<   y)SinkWaitInfor   r[   r]   r^   rY   r_   r\   
moves_inforX   rZ   N)	rd   re   rf   r]   rh   r_   r\   rx  rZ   rj   rI   r?   rw  rw  g  s3    GSL#E3NJ!OS!rI   rw  c                
  123456789:; ddl m} t        |       }|dk(  r| i fS t        t        j
                  j                  j                               }t        t        j
                  j                               }t        | ||      \  }1;}}t        |       \  432i }	 	 	 	 	 	 d(3fd}	156789;fd}
234fd}156;fd}| d   }t               }t        j                  }t        j                  }d}4|   ,|rn(|t        |      |k\  rnt        |      r||vr|j                  |       t!               x}||<   3|   }|}|8|}1|   d   9||rn |	8|      7 ||j"                  7d	
      }|j$                  D ci c]  }t'        |      s|j(                  | }}|j+                         }d }|D ](  }|j-                  |j/                         d       x}s&|} n |t1        |      xr t1        |      x}rd } ||      \  }}|r?|}t3        91|   d         9|xj4                  dz  c_        t7        7      |_        3|   }|(r&dt7        7       d|j/                          |_        nd| dt=        |j                                d|j/                          d|j?                         g d7 d|D cg c]  }|j/                          c} d| |_        n0;|   55j@                  5jB                  z
  6tE        t<              } |jG                         D ]H  \  }!}"|!jH                  jJ                  }#|"|k7  r"d }$7D ]	  }%|%|#v s|%}$ |$5| |$   jM                  |!       J  |
|7|       \  }&}'}(|&|kD  rd|& d| |_        n|xjN                  dz  c_'        |xjP                  d|j/                          z  c_(         ||8|        ||7| |'|(       |r+ddl)m*})  |)|7t7        7       |	2d       |||1;d|       }|rn	3|   }|É4|   }4|   ,g d}*|jG                         D +cg c]H  \  }+}tW        |+      |j4                  |j8                  |jN                  |jP                  |j:                  gJ },}+}d:tX        jZ                  j]                  d      rddl/m/}- : |-|,|*      z  :n8:d z  ::ta        |*      d!z   z  ::d!jc                  te        t`        |,            z  :tf        ji                  :        |	2d       }.t        |.      |k(  sJ tk        |.||      \  }/}0}0}0:d"| z  ::d#|/ z  :tm        d$d% :fd&'       |.|fS c c}w c c}w c c}}+w ))Nr   rv   c                P    g }| }	 ||j                  |       ||k(  r	 |S |   }!ra   r   r   s       r?   r   z4_sink_waits_iterative_internal.<locals>._group_nodes  r   rI   c                      d      j                   z
  }i }i }d}|s!t        z   |j                   z         }|||fS |j                   z   }||| <   |}t        d t        j                  j                  |j                               D              }| || <   |z   }	D ]S  }
|
   d   |	z   }|||
<   t        ||      }d}|
|v r*||
   }|D ]  }||j                  j                  z  } |||
<   |	|z  }	U |||fS )Nr   c              3  H   K   | ]  }|j                   j                    y wra   r   r   s     r?   r|   z[_sink_waits_iterative_internal.<locals>._calculate_potential_peak_memory.<locals>.<genexpr>  s"      *
 NN$$*
r   )	r   r   r   	itertoolschainfrom_iterabler.   r   r   )r   r   7group_n_to_bufs_after_swap_dealloc_instead_of_candidatepre_group_memr   _size_free_delta_updater   candidate_post_alloccandidate_size_free_to_move	delta_memr   gn_post_allocgn_size_free_to_addr   r   r   r   r   r   r   r   r   s                  r?   r   zH_sink_waits_iterative_internal.<locals>._calculate_potential_peak_memory  s    $Q'*::*F*Q*QQ 	 <>@BF !$77 3 > >>N "#57NNN,/B/M/MM(<9%-&) *
 44GNNP*
 '
# /J-I	*'*EE	 
	-B(,Q/);M%2r" ?N"#LLNrR DC'3>>+C+CC'D.A'+,,I
	- 13JJJrI   c                n    |   }|r| |<   || <   |    }|r||<   ||<   | |<   || <   |k(  r| y y ra   rj   )r   r   r   group_head_prevcandidate_nextr   r   r   s        r?   r   zH_sink_waits_iterative_internal.<locals>._perform_double_linked_list_swap  sq      
+%.E/"*i y)$.E.!*j &j%iE rI   c                L   |d   }|   d   |   j                   z
  }|sC|j                   z   }||j                  z
  f| <   |D ]  }|   }	|	d   z   |	d   z   f|<    y | g|D ]9  }
||
   }|
   xj                  ||
   z  c_        |||
   j                  z
  f|
<   ; y r   )r   r   )r   r   r  r   r  r   r  r  r   r   r   
post_allocr   r   r   r   s               r?   r   zJ_sink_waits_iterative_internal.<locals>._update_memory_tracking_after_swap  s    V
$Q'*::*F*Q*QQ 	 G#03F3Q3Q#Q $$':'D'DD'L#  !"%qE//qE//$R  "c" 	A+A.JQ))-DQ-GG)-a0:::LO	rI   rT   FTr   c                    t        |       rdd| j                          fS t        |       rdd| j                          fS y)NFzcandidate contains collective zcandidate contains gemm_like r   )r   r   ry   r7   s    r?   r   z4_sink_waits_iterative_internal.<locals>.is_groupable0  sW    .u5 %"@AQ@R S$  .e4 %"?@P?Q R$   *rI   r   zcollective ordering z with candidate:r   r   r   z(os:r   z
 outs:r   r   r   +r   sink_waits_iterative)z	Wait noder]   r_   r\   rx  r   r^   r   r   r   r   r   z*
 sink_waits_iterative peak_memory_before:z)
 sink_waits_iterative peak_memory_after:r   c                     dddS )Nsink_waits_iterative_infor   r  rj   rj   rI   r?   r  z0_sink_waits_iterative_internal.<locals>.<lambda>  s    / 
 rI   c                      S ra   rj   )log_strs   r?   r  z0_sink_waits_iterative_internal.<locals>.<lambda>  s    7 rI   r  r  )7r}   rw   r3   r   r    r  r   r  r  r   r   r   r  (sink_waits_iterative_debug_limit_to_sinkr   rc  rw  r  r  r   r  r  r   r   r   r   r]   r   r_   rZ   r-   r   r   r   r   r   r   r  r   r\   rx  r  r   r  r  r  r  r   rY   r   r  r  r  r   r
   )<r5   rw   r  r   r   r   r   r   r   r   r   r   r   r$  processed_waitsr%  debug_num_sink_waits_to_reorderr&  r  r   
wait_snoder   r'  r   r(  
group_outsr*  r+  both_contain_commsr   is_grp
grp_reasonr  r   r   r  last_succ_gnr   r   r   r  r   r   r7   r1  r   r2  r3  r;   r   r   r   r   r   r   r   r   r   r  r   s<                                                    @@@@@@@@@@@r?   _sink_waits_iterative_internalr  p  s    ?f+arz$.qww/C/C/H/H/J$KL%/0H0H0J%KM 	$FL-H" 9@E5%35E)1L	 )K )KV*B ":D lO'-'V'V$77 $ !&
+
!$+7O$(GG4#>%!-/D5;dIJJJ ,T 21 5',/;J
/S,(("& '99'? FFAI	  #..0
# A%MM!**,==q=#$ '+E2U7J97U& 
* *6i)@&FJ%.
,/-|I/Fq/I-) ),8,=)$))$4	 "*0B2<3D2E.y/A/A/C.DF ,  /xjDIYDZC[ \,,5,>,>,@+AyGaGaGcFdEe f&&)U&j'I

'I&J2:,	@ , 3CI3N#'225H5R5RR $  % H +002" "!$!:!:J%2 #'L! .++-L. $+  L$fSk'", 5!O L 24K "K/*>*:)K=Q ( 

a
Qy'9'9';&<#==0J
S2K&+ 4N0Q!$S)$UD12%#$(.O1- 1!*-	o 'p T{S +
!VG" !;;=
 E4 LLJJOO  	

D 
 G~~
+%8
 	

 	TT3w<$&&499Sd^,,WeT*Jz?1111=. OQ1 <[MJJG;O;LMMG
 # uyn (J|
s   U%	U*AU/c                    t        |       d   S )Nr   )r  rH   s    r?   r  r    s     *&1!44rI   c                    t         j                  dk(  r| j                         }|S t        t         j                        sJ t        j                  |       }|S )z:
    Returns estimated op runtime in nanoseconds (ns)
    rq   )r   r   r&   callable)r7   runtimes     r?   r   r     sR     !!Y.--/ N 22333,,U3NrI   c           
     b   | j                         }t        |      dk(  rd}t        | j                  t        j
                  t        j                  f      rd| j                         D cg c]  }|j                          c} }d| j                  D cg c]  }|j                   c} }d| j                          d| j                  j                   d| d| d		}| j                         D cg c]  }|j                  j                          }}d
j                  |D 	cg c];  }	t        |	t        j                        rd|	j                   d|	j                    d	nd= c}	      }
	 | j                  j#                         }| j                  j&                  j(                   | |
 d| d| j+                         ddS g }|D ]  }|j-                  t/        |              | j&                  j(                   ddj                  |       S c c}w c c}w c c}w c c}	w # t$        $ r d}Y w xY w)Nr   r^   zouts:zins: z (z)
 z
 (),z (size=z	, stride=z.0fz ns): z, )	get_nodesr3   r~   rs   r   ExternKernelOut_CollectiveKernelr  r   r  r  rl   get_output_specr   Layoutsizestridemaybe_get_nameAttributeError	__class__rd   r&   r   r  )r7   r5   detailr+  outs_strr   ins_strchildlayoutslayoutout_tensor_inforh  	summarieschild_snodes                 r?   r  r    s   __F
6{aejj2#5#5r7K7K"LMe6G6G6IJ

JKLHe.F.FGaffGHIG)*"UZZ-J-J,K4PXzY]^e]ffghF=B__=NOE5::--/OO((
 &	  fbii0 &++ia@
	

113I **&&//08II;VXY^YtYtYvwzX{{  A  	A I 4k234oo&&'r$))I*>)?@@/  KGO  	I	s+   )HH/!H!A H)H   H.-H.c                &   d}d }d }t        |       D ]  \  }}|^t        |      r|t        |      z  }|j                  }n$t	        |j                        rn|t        |      z  } ||t        |              ft        |      r.|t        |      z  }|j                  } ||t        |              t	        |j                        r ||t        |              d } ||dt        |               t        j                  d|dz  dz          y )Nr   c                :    t         j                  | dd|        y )Nz>6r  )r  debug)stepr.  s     r?   step_logz#visualize_overlap.<locals>.step_log  s    T"IRu-.rI   z| zEst. runtime (ms): i  )r   r   r   rs   r   r  r  r  )ordertotal_est_runtimecur_comm_noder  r  r7   s         r?   visualize_overlapr    s     #M/ !' ;e "5)!%8%??! %

$ !%8%??!Tl5124"5)!%8%??! %

,u"5!68$,u"5!68 $L$7#89:-;. 
/$6=>?rI   c                (   | }t        t        j                  j                  j	                               }t        t        j                  j                               }t        j                  D ]I  }t        |t              r|t               v rt               |   }t        |      sJ d| d       t        | t        | |      |      \  }}t        j                  j!                         dk(  r(t"        j%                  d| d|d       	 t'        |       t+        j*                         } ||      }t+        j*                         |z
  }	t        j                  j!                         dk(  r(t"        j%                  d	| d
|	 d       	 t'        |       t        | t        | |      |      \  }}t-        d|       L |S # t(        $ r!}t"        j%                  d|       Y d }~d }~ww xY w# t(        $ r!}t"        j%                  d|       Y d }~d }~ww xY w)Nz3Invalid reorder_compute_and_comm_for_overlap pass: z is not callabler   z.==== Visualize overlap before reordering pass z, peak_memory=z ====r^   )exc_infoz-==== Visualize overlap after reordering pass z	 (ran in z	 sec)====zfinal peak_memory=)r   r    r  r   r  r  r   'reorder_for_compute_comm_overlap_passesr~   rY   globalsr  r   r   r/   r(   get_rankr  r  r  	Exceptiontimeprint)
r5   r  r   r   pr   r;   et0ts
             r?   $reorder_compute_and_comm_for_overlapr  2  s    E$.qww/C/C/H/H/J$KL%/0H0H0J%KM;; 'a!wy.	!A{ 	
A!DTU	
{ .*6<@-
Q %%'1,@?k^SXY2!%( YY[%IIK"%%'1,?s)A3iX2!%( .*6<@-
Q 	#{n%&?'@ L#  2!!"q!112  2!!"q!112s0   F:G':	G$GG$'	H0HHc           
        t        | j                        t        t               t        t               t              D ]  \  }}|j                  dk(  s|j
                  t        j                  j                  j                  j                  k(  sR|j                  d   j                  dk(  sJ d| d|j                  d    d       |j                  d   }|j                  d   }|dkD  r|   j                  |       |   j                  |        fd}t        t               }t              D ]  \  }}|j                  dk(  s|j
                  t        j                  j                  j                  j                  k(  sR|}|j                  d   j                  dk(  sJ d	 d
|  d        |      s|   j                  |        d }d D ]  }|j                  dk(  st        |j
                  t        j                   j"                        sB|j
                  j$                  j&                  sc ||      rl ||j)                               sJ d| d        |j+                         D ]!  \  }	t        |	      D ]  \  }
}|   }|j                  d   u sJ |j                  \  }|dz   }|
t-        |	      dz
  k  r|	|
dz      nt-              dz
  }|| }t/        fd|D              rJ d d| d|  d       |D ]  }|j                  dk(  s|j                  v s"|j
                  t        j                  j                  j                  j                  k7  s^t1        fd|j                  D              }||_          $ |j+                         D ].  \  }	t        |	      D ]  \  }
}|   }| j3                  |        0 D ]q  }|j                  dk(  s|j
                  t        j                  j                  j                  j                  k(  sO|j                  d   |v sa| j3                  |       s y)a  
    This FX graph pass replaces uses of FSDP2 unsharded params with their corresponding
    graph intermediates that were fsdp.copy_ into the unsharded params in the original graph.

    NOTE: Can only apply this pass to any of the FSDP2 unsharded params that have this pattern
    (or repetition of): `resize_(full) -> copy_ -> resize_(0)`. Because of this, for partial-graph case
    where `resize_(full) -> copy_` is in one graph and `resize_(0)` is in another graph, we can't
    remove these resize and copy ops and thus we will have worse performance there.

    In other words, "do we try to remove all the resize_(full) -> copy_ -> resize_(0) nodes for this unsharded param"
    is actually a per-unsharded-param decision, since for each unsharded param, we look at its resize sequence pattern
    (in `check_resize_pattern()`) to determine if its set of resize and copy nodes can be removed.
    call_functionr   placeholderz1Resize can only operate on graph inputs, but got z# which is resizing non-graph-input r   r   c                l   j                  | g       }j                  | g       }t        |      t        |      k(  s2t        j                  d|  dt        |       dt        |       d       yt	        ||      D ]7  \  }}||k\  st        j                  d|  d|    d| d	|    d| d
        y y)NzH
Unequal number of resize-to-full and resize-to-0 nodes for graph input z:
z vs. zK.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass.
Fz
For graph input z: resize-to-full node z
 at index z 
happens after resize-to-0 node zd.
Skipping `remove_fsdp2_unsharded_param_graph_input_usage` FX graph pass for that unsharded param.
T)r   r3   logwarningr   )graph_inputresized_to_full_idxesresized_to_0_idxesresize_to_full_idxresize_to_0_idx&graph_input_to_resized_to_0_node_idxes)graph_input_to_resized_to_full_node_idxes	node_lists        r?   check_resize_patternzLremove_fsdp2_unsharded_param_graph_input_usage.<locals>.check_resize_pattern}  s    !J M M!
 DGGUWX()S1C-DDKKHHS} U E#&8"9!: ;  47!#54
 	/ "_43I>P4Q3RR\]o\p q  )/ :;:oEV W 	 rI   z\
Assumed all FSDP2 `unsharded_param`s to be graph input, but it's not true!
Offending node: z	. Graph: c                    | j                   t        j                  j                  j                  j
                  k(  xs; | j                   t        j                  j                  j                  j
                  k(  S ra   )targetr/   rn   fsdpcopy_rq   inductorresize_storage_bytes_)rs   s    r?   is_allowed_mutationzKremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_allowed_mutation  sO    KK599>>//777 O{{eii00FFNNN	
rI   c           	     n   t        | j                  t        j                  j                        r^t        | j                  j                  j                        D cg c])  \  }}|j                  |j                  j                  r|+ c}}ng }t        |D cg c]5  }t        | j                  |   j                  d   j                               7 c}      }t        |D cg c](  }t        |j                  d   j                               * c}      }t        ||z        dkD  S c c}}w c c}w c c}w )Nvalr   )r~   r  r/   _ops
OpOverloadr   _schema	arguments
alias_infois_writer   r   argsmetauntyped_storager3   )rs   unsharded_paramsr>   r{   mutated_arg_idxesmutated_node_arg_storagesunsharded_paramstorages_of_unsharded_paramss           r?   -is_node_mutating_unsharded_param_or_its_aliaszeremove_fsdp2_unsharded_param_graph_input_usage.<locals>.is_node_mutating_unsharded_param_or_its_alias  s    $++uzz'<'<= &dkk&9&9&C&CDAq<<+0E0E   	 %/ + tyy|007GGIJ%
! (2 (8# 33E:JJLM(
$ ,/KKLqPP)s   .D':D-"-D2zdUser mutation on FSDP2 unsharded param is not allowed when Traceable FSDP2 is used. Violating node: c              3  2   K   | ]  } |g        y wra   rj   )rz   rs   r  r  s     r?   r|   zAremove_fsdp2_unsharded_param_graph_input_usage.<locals>.<genexpr>  s#       >d_DUVs   z(Assumed no ops mutating unsharded param z in subgraph z, but it's not true!
Graph: c              3  .   K   | ]  }|u rn|  y wra   rj   )rz   argreplacementr  s     r?   r|   zAremove_fsdp2_unsharded_param_graph_input_usage.<locals>.<genexpr>  s%      % (+o'=3F%s   N)r-   rr  r   r   opr  r/   rn   r  r  rq   r  r   r  r  r~   r  r  r  
is_mutabler  r   r3   r   tuple
erase_node)r  rf  rs   r  new_sizer  'unsharded_param_to_fsdp_copy_node_idxesfsdp_copy_noder  fsdp_copy_node_idxesr>   fsdp_copy_node_idxr;   subgraph_start_idxsubgraph_end_idxsubgraph_nodesnew_argsr  r  r  r  r  r  s                    @@@@@@r?   .remove_fsdp2_unsharded_param_graph_input_usager	  [  s    U[[!I 1<D0A--8->*y) P	TGG&uyy11GGOOO99Q<??m3  :2267Z[_[d[def[gZh i6 3 ))A,Kyy|H!|9+FMMcR6{CJJ3OP"J /:$.?+y) 	U	T77o%$++9M9M9U9U*U!N"iilO"%%6  = !5' 29 6 $O47HOOPST	U
Q4  GG&4;;

(=(=>##..'-D=BBD eeidj k : 
1	6	6	8") 	%./C%D 	)!A!&'9:N!&&q)_<<<+00NA{!3a!7 s/0144 %QU+^a' 
 ''9:JKN *  ))8(9~FV Ww   ' 
)GG.'4994uyy'9'9'O'O'W'WW$ %#'99%  H !)DI
))	)	")P 
1	6	6	8- 	%./C%D 	-!A!&'9:N^,	-	-  #GG&uyy11GGOOO		! GGT"#rI   c                  	 	 dd l 		j                  j                         sJ 	j                  j                  j
                  r 	j                  j                  j                  sJ 	 ddl
m}m}m}m}m} 	 	fd} |       } | |	j                  j                  j
                  j                    |t"        j$                   |	j                  j&                  j(                  j                    |d       |d       |d       |d       |d	             |d
             |d       |d            |d       d	fd       } ||        |j+                  |        y # t        t        t        f$ r Y y w xY w)Nr   r   )CallFunction
KeywordArgMatchPatternMatcherPassregister_graph_patternc                J   t        | j                        }|D ]  }|j                  t        j                  k(  s!|j
                  d   j                  j                  j                  j                  j                  u se|j
                  d   dk(  sx| j                  |        y r   )r-   rr  r  operatorgetitemr  rn   r  all_gather_copy_inrq   r  )gr  r   r/   s      r?   remove_unused_getitemz8reinplace_fsdp_all_gather.<locals>.remove_unused_getitemD  su    M	 	 AH,,,FF1I$$		(I(I(Q(QQFF1INQ	 rI   all_gather_inputsall_gather_outputinp_split_sizesall_gather_input_numelrankitem_idx
group_size
group_namec                &    | j                   d   dk(  S )Nr  r   )kwargs)matchs    r?   r  z+reinplace_fsdp_all_gather.<locals>.<lambda>d  s    %,,z":a"? rI   )	pass_dictextra_checkc                l    fd}| j                  ||d   |d   |d   |d   |d   |d   |d   g       y )	Nc                     | d d }| d   }| d   } j                   j                  j                  j                  | }|d   }|d   }j                   j                  j
                  j                  ||||      }|S )NrT   r   r   )out)rn   r  r  rq   _c10d_functionalall_gather_into_tensor_out)	r  copy_in_argsr  r  r  r  	getitem_1all_gather_into_tensorr/   s	           r?   replzEreinplace_fsdp_all_gather.<locals>.reinplace_all_gather.<locals>.replg  s      9LbJbJ!J!B!B!J!J" )+G*1-I		**EEMMZ N  #
 *)rI   r  r  r  r  r  r  r  )replace_by_example)r   r  r  r,  r/   s       r?   reinplace_all_gatherz7reinplace_fsdp_all_gather.<locals>.reinplace_all_gatherQ  s[    ,	*$ 	  *+*+()/0v|$|$	
rI   )r   r  )5torch.distributed.fsdp._fully_shard._fsdp_collectivesr(   rp  rn   r'  r+  r(  ImportErrorr  AssertionErrorpattern_matcherr  r  r  r  r  rq   r  r  r  r  apply)
r  r  r  r  r  r  r  
graph_passr.  r/   s
            @r?   reinplace_fsdp_all_gatherr5    sd   
D  --/// II&&==		**EE	
FE
  	  $%JII&&==EE  IINN55==23230178v& :& |$|$	
" ?'*
+*
@ % U} 8 s   A"E E10E1c                    t        | t        j                  j                  j                  t        j                  j                  j
                  f      rJ t        | j                         dd        S )N   )r~   r/   	_inductorr  FusedSchedulerNoderw   r[   r   r  s    r?   
get_op_idxr:    s]    OO%%88OO%%::	
   u~~#$$rI   c           	     ~	   ! ddl m  g }t        t                  }d}d}i }i }i ! !fd}	| D ]  }
t	        |
j
                  t        j                  j                  j                  j                        rt        fd|
j                  D              rd}|
}t               }t        |||       t        t        j                  j                  j                  j                  t        j                  j                  j                  j                  t        j                  j                  j                   j                  g      t#        ||| fd	       t%        |d
       }t'        |      }d}t)        t'        |            D ]W  }||   }t+        |j
                  t        j                  j                  j                   j                        r|dz  }|dkD  sU|} n |d | }d }t)        t'        |      dz
        D ]3  }t-        ||dz      j
                  t.        j0                        s.|dz   } n |J  |	|d |       } |	||d        }|||<   't+        |
j
                  t        j                  j                  j2                  j                        skd}|
}t               }t#        |||       t%        |d       }d }t)        t'        |      dz
        D ]3  }t-        ||dz      j
                  t.        j0                        s.|dz   } n |J  |	|d |       } |	||d        }|||<    t'        !      dkD  sJ |rt'        |      dkD  sJ |rt'        |      dkD  sJ | D ]N  }
|
j5                         !v r!|
j5                            }
|
|v r-|j7                  |
       |j9                  |
       P d }|j;                         D ]k  \  }}|bt=        t?        |jA                                     }|jC                         D ]-  }|jE                  tG        |j5                         |d             / |}m d }|j;                         D ]k  \  }}|bt=        t?        |jA                                     }|jC                         D ]-  }|jE                  tG        |j5                         |d             / |}m |S )Nr   )r  Fc                    j                   j                  |       }| D ]  }||j                         <    ||j                         <   |S ra   )rw   creater   )snodes_to_group
group_noder7   r  snode_name_to_final_snodes      r?   _create_group_nodez:enforce_comm_ordering_for_fsdp.<locals>._create_group_node  sV    33::?K
$ 	EE:D%enn&67	E;E!*"5"5"78rI   )r  c              3     K   | ]I  }t        |   j                  t        j                  j                  j
                  j                         K y wra   )r   rs   r/   rn   r  r  rq   )rz   r{   r=  s     r?   r|   z1enforce_comm_ordering_for_fsdp.<locals>.<genexpr>  sD      
  "1%**EIINN,M,M,U,U
s   AATc                    t        | j                        xs0 t        | j                        xr | j                  j                  v  S ra   )r~   NopKernelSchedulerNodeExternKernelSchedulerNoders   op_overload)r{   allowed_opsr  s    r?   r  z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  sF    q)"B"BC "1i&I&IJ >FF..+=	' rI   )criteria_cbc                    t        |       S ra   r:  rU  s    r?   r  z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  
    JqM rI   rV  r   c                    t        |       S ra   rJ  rU  s    r?   r  z0enforce_comm_ordering_for_fsdp.<locals>.<lambda>  rK  rI   rn  )$r^   r  r   r   r   rs   r/   rn   r'  r(  rq   r   rb  r   wait_tensorr  split_with_sizes_copyr   sortedr3   r+   r   r~   r   _WaitKernel	chunk_catr   r   rc  r   r8  r9  r   r  rq  r   )"r5   rs  r=  	new_orderrP  	ag_exists	rs_exists$ag_grouped_node_to_wait_grouped_node$rs_grouped_node_to_wait_grouped_noderA  r7   ag_snodeag_related_snode_setag_related_snodesend_idx_of_current_ag_blockcopy_out_countr>   	cur_snodewait_node_idxag_group_nodeag_wait_group_noders_snoders_related_snode_setrs_related_snodesrs_group_noders_wait_group_nodeprev_ag_waitwait_group_nodero  r+  prev_rs_waitrG  r  r@  s"     `                            @@@r?   enforce_comm_ordering_for_fsdprh    s   
 )+I3!III+-(+-( "  nUJJ59955PPXX
 
 __	
 
 IHLVL  ($"	 %II..IIQQII..::BBIINN88@@K )$" !'$*A! +..?*@'N3012 -a0	!NNEIINN$H$H$P$P #a'N!A%23/ !22N3N O !M301A56 /A6;;R^^L$%EM !,,,./@-/PQM "44Emn4U!VBT0? EJJ		(@(@(H(HIIH MWL ($"	 !'$*A!
 !M301A56 /A6;;R^^L$%EM !,,,./@-/PQM "44Emn4U!VBT0?]nU` ()A---781<<<781<<<  >>88-enn.>?EIe L*N*T*T*V '&#]%C%C%E FGL!--/ **AJJL|TR '' L*N*T*T*V '&#]%C%C%E FGL!--/ **AJJL|TR '' rI   )r5   r
  )r5   r
  r   r
  )rs   z"Optional[Union[IRNode, Operation]]r   bool)r7   r"   r   ri  )F)r   rY   )r   r
  r   rY   )r5   r
  r   ztuple[dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], dict[BaseSchedulerNode, Optional[BaseSchedulerNode]], BaseSchedulerNode])r5   r
  r   zDtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, ReorderInfo]])
r5   r
  rC   ri  rD   ri  rE   ri  r   r
  )rr  r
  r   r
  )r5   r
  r   zEtuple[list[BaseSchedulerNode], dict[BaseSchedulerNode, SinkWaitInfo]])r7   r"   r   rU   )r  torch.fx.Graph)r  rj  r   rX   )r5   1list[torch._inductor.scheduler.BaseSchedulerNode]rs  z4dict[str, torch._inductor.scheduler.SchedulerBuffer]r=  zdict[str, BaseSchedulerNode]r   rk  )V
__future__r   rK  r  r}  loggingr  r`  r  collectionsr   dataclassesr   typingr   r   r   r	   r/   torch._loggingr
    torch.multiprocessing.reductionsr   torch.utils._ordered_setr   r^   r   r   dependenciesr   r   r   r  r   memoryr   r   r   r   r   utilsr   r   r   r   r   r   r   virtualizedr    	getLoggerrd   r  _logginggetArtifactLoggerr  r}   r"   r@   rD   rC   rL   rQ   rS   rt   ry   r   r   r   r   r   r   rN   rG   ru  rw  r  r  r   r  r  r  r	  r5  r:  rh  rj   rI   r?   <module>r{     sD   #      
  # ! 6 6  + ; /  ! %*     g!nn..xC;M#M,#&####L 9 9 9""(03&#"m#mIm`W#WW W 	W
 Wt"0 " " "h#hJhV5#55	A>#L&#&&RA#HhV%n=nEn 5n 7	nrI   