
    i'                     6   d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZmZmZmZ ddlm Z  ddl!m"Z" de
jF                  jH                  de%e&   fdZ'de
jF                  jP                  de)e
jT                  e
jF                  jV                  f   fdZ,de
jF                  jP                  de&de	e-   fdZ.de
jF                  jP                  de&de	e-   fdZ/de
jF                  jP                  de&fdZ0de
jF                  jP                  de1e	e-      fdZ2de
jF                  jP                  dee   defdZ3 G d d      Z4 e"d e4              	 	 d&d ed!ef   d"ee   d#e5d$e5ded!ee   f   f
d%Z6y)'a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Sequence)AnyCallableOptional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendgreturnc           	         dt         t        t        f   dt        fd}t        t              }d}t	               }| j
                  D ]  }|j                  dk(  rkt         ||j                        t        j                        r;|t         ||j                        j                                  j                  |       |dz  }~|j                  dk(  st        |j                  d      s|j                  j                   }t#        |j$                        D ]  \  }}|t'        |j(                        k  r|j(                  |   }	n2|j*                  |j,                  vrG|j,                  |j*                     }	d	}
|j.                  r|j.                  j0                  rd
}
|
s||t         ||	j                        j                                  z  }  |S )Nmetar   c                     d| v r| d   S | d   S )Nvalfake_result )r   s    [/var/www/html/engine/venv/lib/python3.12/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fkz%find_input_mutations.<locals>.meta_fk7   s    #tmtE{Dm1DD    r   placeholderr   call_function_schemaFT)dictstrr   r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr&   	enumerate	argumentslenargsnamekwargs
alias_infois_write)r   r"   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r!   find_input_mutationsrD   6   s   Ed38n E E FIUNWW 44= '!&&/5<<8~gaffo&D&D&FGHLLYWNITT_$188Y/XX%%F#F$4$45 3s166{? vvayHxxqxx/  xx1H>>~~.."& #f&wx}}'='L'L'NO' N: r#   gmc                     i }| j                   j                  D ]W  }|j                  j                  dd       }t	        |t
        j                        s:|j                  |vsI|||j                  <   Y |S )Nr   )graphr*   r   getr,   r-   r.   device)rE   device_node_mappingr>   ts       r!   get_device_node_mappingrL   ]   sg     >@XX^^ .FFJJud#a&188;N+N,-). r#   	aot_model	num_fixedc                     t        | j                        t        t        |            z
  }|sy t	        | j                        }t        ||      S N)rD   rG   r)   ranger   r   )rM   rN   mutation_indicesplaceholderss       r!   3check_for_mutation_ignore_cuda_graph_managed_tensorrT   h   sD     ,IOO<s5CS?TT'	8L#L2BCCr#   c                     t         j                  st        | |      x}r|S t        t	        |             x}r|S t        |       x}rt        d|j                   d      S y )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrT   r   rL   r   r   r7   )rM   rN   mut_skipskipnodes        r!   check_for_skipr[   s   sz    ::Jy
 
8 
 O6	* t  4Y??t?*->tyyk+KLLr#   c                 v    t        t        t        |                   }|j                  dk(  sJ |j                  S )Ncuda)nextiterrL   typeindex)rE   rI   s     r!   get_device_indexrb      s3    $.r234F;;&   <<r#   c                 $   t        |       }t        |j                        dk(  sJ |j                  d   }t        |d      sg S |D cg c]>  }t	        |t
        j                  j                  j                        r|j                  nd @ c}S c c}w )Nr   r   __iter__)
r   r5   r6   r1   r,   r-   fxrZ   Nodestack_trace)rE   outputr6   rA   s       r!   get_stack_tracesri      s    _Fv{{q   ;;q>D4$	  'sEHHMM,>,>?T	I  s   ABdynamo_modeldynamo_inputsc           	         ddl m t        d      t        d       	 ddt        j
                  j                  dt        t           dt        dt        ffd}dt        j
                  j                  dt        t           dt        ffd	}t        ||t        j                  |d
      t        j                  j                  j                        } ||       S )Nr   )cudagraphify_implTrM   
aot_inputsis_inferencer   c                    t        | |      }t        t        
      t        |            }t        | |      x}r%t	        j
                  	       t        d|        |S j                  t        |               ||t        |      j                  ddt        |       t        | j                        t        | j                        	      }d|_        |S )Nskipping cudagraphs due to Fdevice_indexis_backwardro   stack_tracesrS   mutated_input_idxsT)r
   r   r5   r[   r   disabler   r)   rb   rQ   valueri   r   rG   rD   _boxed_call)rM   rn   ro   interpfixedskip_msgoutboxed_device_indexrm   do_cudagraphsrk   s          r!   forward_cudagraphsz&cudagraphs.<locals>.forward_cudagraphs   s    
 9j1&s='93z?K%i7787m,/-hZ8 M/	:;%L+11))4-ioo>3IOOD

 
r#   c                     t         |      }s S t               }t         |      x}rpt        d|        	j                  }|d}t
        j                  j                  j                  |d      J dt        t           dt        f fd}d|_        |S  
||t        |      t               ddt               t         j                         t#         j                         		      }d|_        |S )
Nrq   r   F)create_if_none_existsr;   r   c                 4    j                           |       S rP   )set_to_running_backward)r;   rM   managers    r!   fnz3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s    //1 ((r#   Trr   )r
   r   r[   r   rx   r-   	_inductorcudagraph_treesget_managerlistr   ry   rQ   rb   ri   r   rG   rD   )rM   rn   rz   r{   r|   
device_idxr   r}   r   r~   rm   r   s   `       @r!   backward_cudagraphsz'cudagraphs.<locals>.backward_cudagraphs   s    9j1y)%i7787/-hZ8
 ,11J!
oo55AA% B G &&&)49 ) ) "BNI%L))4))4-ioo>3IOOD

 
r#   )ro   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrm   r   r   r-   re   GraphModuler   r   boolr	   	functoolspartial_dynamor   %cudagraph_backend_keep_input_mutation)rj   rk   r   r   aot_cudagraphsr~   rm   r   s    `   @@@r!   
cudagraphsr      s    AdOM)$/
 #88''I  
	 :)88'')59#Y)	)V "&'$,,-?dS',}}';';'a'a	N ,66r#   c                   n    e Zd ZdZedd       Zedej                  j                  de	e
   de
fd       Zy)	CudagraphsBackendr   r   Nc                      ddl m}   |         y )Nr   reset_cudagraph_trees)r   r   r   s    r!   resetzCudagraphsBackend.reset   s    Ir#   modelr;   c                     t        | |      S rP   )r   )r   r;   s     r!   __call__zCudagraphsBackend.__call__   s    %((r#   )r   N)__name__
__module____qualname__compiler_namestaticmethodr   r-   re   r   r   r   r   r    r#   r!   r   r      sP     M   
 ),, )hsm ) ) )r#   r   r   )r7   compiler_fnr   .r;   copy_outputscopy_inputsc                   	 t        |t        t        f      sJ r$|D cg c]  }t        j                  |       c}nt        |      t        j
                  j                          t        j
                  j                         }|j                  t        j
                  j                                t        j
                  j                  |      5   | |  ddd       |j                          t        j
                  j                         j                  |       t        j
                  j                          t        j
                  j                         t        j
                  j                  |      5   |  	ddd       t        	t        t        f      s	f	dt        dt        t           f	fd}|S c c}w # 1 sw Y   xY w# 1 sw Y   RxY w)zBThis isn't registered as a backend, but is used in some benchmarksN)stream
new_inputsr   c                      t              t        |       k(  sJ r%t        |       D ]  \  }}|j                  |        j                          rD cg c]  }|j	                          c}S S c c}w rP   )r5   zipcopy_replayclone)	r   dstsrcxr   r   rG   static_inputsstatic_outputss	       r!   runzcudagraphs_inner.<locals>.run  sp    =!S_444z: S		#'56!AGGI66!! 7s   A4)r,   r   tupler-   
zeros_liker]   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrG   r   r   )
r   r;   r   r   r   r   r   rG   r   r   s
     ``   @@@r!   cudagraphs_innerr     sm    ftUm,,,6<=))!,=V 
JJZZ F
uzz0023			6	" v
	JJ++F3	JJ JJ  "E			%		/ /./ntUm4(*	" 	"# 	" 	" JA > / /s   GG
?G
GG)TT)7__doc__r   collectionsr   collections.abcr   typingr   r   r   r-   torch.fxtorch._dynamor   torch._dynamo.backends.commonr	    torch._dynamo.backends.debuggingr
   torch._inductor.cudagraph_utilsr   r   r   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   re   Graphr)   intrD   r   r'   rI   rf   rL   r(   rT   r[   rb   r   ri   r   r   r   r   r    r#   r!   <module>r      s  .  # $ * *     6 6   < &$EHHNN $s3x $N	%,,
%&Dxx##D03Dc]Dehh22 s xPS} $-- # 	-- 	$x}2E 	T7UXX11 T7(3- T7TW T7n) )  l0A0C D 	)CH)SM) ) 	)
 c8C= !)r#   