
    sg                    d
   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmc mZ d dlmZ d dl m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z* d d	l+m,Z, d d
l-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 erd dl8Z8e0jr                  Z: ejv                  e<      Z=ej|                  j~                  Z?ej|                  j                  Z@e G d d             ZAe G d d             ZBe G d d             ZCdej                  deEfdZFdej                  deEfdZHdej                  deEfdZIdej                  deJfdZK G d d      ZL eL       ZM	 d^dej                  d eej                     d!eej                     d"eeO   dej                  f
d#ZPdej                  deEfd$ZQdej                  deEfd%ZRdej                  deEfd&ZSdej                  deEfd'ZTdej                  deEfd(ZUdej                  deEfd)ZVdej                  deEfd*ZWdej                  deEfd+ZXd,ej                  deeej                     eej                     f   fd-ZYd.eej                     d/eOfd0ZZd,ej                  d.eej                     d1eej                     d2eJdeej                  ej                  f   f
d3Z[d,ej                  deej                  ej                  f   fd4Z\ eJd5      Z]d6eJdeJfd7Z^dej                  deJfd8Z_d9ej                  fd:Z` ej                  d      d;        Zbd<eej                  eJf   deeej                  eJf      fd=Zcd>ej                  dej                  fd?Zdd,ej                  d@ej                  dAej                  dBeJdeej                  ej                  f   f
dCZed,ej                  dej                  fdDZf	 d^dej                  dEeBdFeCfdGZgdH ZhdeAfdIZid9ej                  fdJZjdKeek   dLeek   dMekdeekeeJ   eeJ   f   fdNZldKeek   dLeek   dMekdeekeeJ   eeJ   f   fdOZmdKeek   dLeek   dMekdeekeeJ   eeJ   f   fdPZndKeek   dLeek   dMekdeekeeJ   eeJ   f   fdQZod dRlpmqZq dS Zr	 d_dej                  dEeBdeej                     fdTZs	 d`d,ej                  deej                  ej                  f   fdUZt	 	 	 	 	 dadVej2                  j                  dWeOdXeOdYeEdZeeeOeeO   f      d[eEd\eeO   ddfd]Zuy)b    Ndefaultdict)	dataclassreplace)CallableDictListOptionalSetTupleTYPE_CHECKINGUnion)BackwardState)is_sym_nodepy_sym_types)magic_methodsmethod_to_operator)find_symbol_binding_fx_nodesfree_symbolshint_intis_symbol_binding_fx_node)graph_drawer)CheckpointPolicy   )config)get_aot_graph_name)is_with_effects)fx_graph_cseget_aten_targetc                      e Zd ZU dZee   ed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   dej                  fdZ	dej                  fd	Z
dej                  fd
Zdej                  fdZdej                  fdZy)OpTypesz8Class for keeping track of different operator categoriesfusible_opscompute_intensive_ops
random_opsview_opsrecomputable_opsnodec                 0    t        |      | j                  v S N)r   r"   selfr'   s     P/var/www/html/venv/lib/python3.12/site-packages/torch/_functorch/partitioners.py
is_fusiblezOpTypes.is_fusible9   s    t$(8(888    c                 0    t        |      | j                  v S r)   )r   r#   r*   s     r,   is_compute_intensivezOpTypes.is_compute_intensive<   s    t$(B(BBBr.   c                 0    t        |      | j                  v S r)   )r   r$   r*   s     r,   	is_randomzOpTypes.is_random?   s    t$77r.   c                 0    t        |      | j                  v S r)   )r   r%   r*   s     r,   is_viewzOpTypes.is_viewB   s    t$55r.   c                 0    t        |      | j                  v S r)   )r   r&   r*   s     r,   is_recomputablezOpTypes.is_recomputableE   s    t$(=(===r.   N)__name__
__module____qualname____doc__r   r   __annotations__fxNoder-   r0   r2   r4   r6    r.   r,   r!   r!   /   s    BXx=(H(m(m#9rww 9C C8bgg 86BGG 6>BGG >r.   r!   c                      e Zd ZU eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                     ed<   eej
                  e	f   ed<   e
j                  deej
                     fd       Zdej
                  defd	Zdej
                  defd
Zdej
                  defdZdej
                  de	fdZy)NodeInfoinputs_required_fw_nodesrequired_bw_nodesunclaimed_nodesfw_orderreturnc                 F     t        d  j                  D         fd      S )Nc              3       K   | ]  }|  y wr)   r>   .0ns     r,   	<genexpr>z-NodeInfo.required_fw_nodes.<locals>.<genexpr>V   s     01Q0s   c                 "    j                   |    S r)   )rE   )rK   r+   s    r,   <lambda>z,NodeInfo.required_fw_nodes.<locals>.<lambda>V   s    a@P r.   key)sortedrB   r+   s   `r,   required_fw_nodeszNodeInfo.required_fw_nodesS   s!    0//06P
 	
r.   rK   c                     || j                   v S r)   )rB   r+   rK   s     r,   is_required_fwzNodeInfo.is_required_fwY   s    D++++r.   c                     || j                   v S r)   )rC   rU   s     r,   is_required_bwzNodeInfo.is_required_bw\   s    D****r.   c                     || j                   v S r)   )rD   rU   s     r,   is_unclaimedzNodeInfo.is_unclaimed_   s    D((((r.   c                 R    || j                   v sJ d| d       | j                  |   S )NNode z not in fw nodes!)rB   rE   rU   s     r,   get_fw_orderzNodeInfo.get_fw_orderb   s4    D+++IuQC7H-II+}}Qr.   N)r7   r8   r9   r	   r<   r=   r;   r   r   int	functoolscached_propertyrS   boolrV   rX   rZ   r]   r>   r.   r,   r@   r@   I   s     MBGG$277|#\!277C<  
4= 
 

, ,D ,+ +D +)bgg )$ ) bgg  #  r.   r@   c                   @    e Zd ZU eed<   eed<   eed<   eed<   eed<   y)MinCutOptionsban_if_used_far_apartban_if_long_fusible_chainsban_if_materialized_backwardban_if_not_in_allowlistban_if_reductionN)r7   r8   r9   ra   r;   r>   r.   r,   rc   rc   g   s      $$"&&!!r.   rc   r'   rF   c                 z    | j                   j                  dd       t        j                  t        j                  fv S )N	recompute)metagetr   MUST_RECOMPUTEPREFER_RECOMPUTEr'   s    r,   must_recomputerp   p   s5    99==d+''))0  r.   fx_gc                 X    d}| j                   j                  D ]  }t        |      s y y)NFT)graphnodesrp   )rq   foundr'   s      r,   has_recomputable_opsrv   w   s0    E

   $ r.   c                     | j                   j                  D ]W  }t        |      st        |j                  d      s&t
        j                  j                  |j                  j                  v sW y y)NtagsTF)	rs   rt   rp   hasattrtargettorchTagnondeterministic_seededrx   )rq   r'   s     r,   has_recomputable_rng_opsr~      sU    

   4 V,		11T[[5E5EE r.   c                     t        | j                  d   t        j                  t        j                  f      ryt        | j                  d   t        j
                        sJ y)Nvalr      )
isinstancerk   r{   SymIntSymBoolSymFloatro   s    r,   sym_node_sizer      sE    $))E"U\\5==$ABdii&777r.   c                       e Zd Zd Zy)InvalidNodeBasec                      y)NzInvalid Noder>   rR   s    r,   __repr__zInvalidNodeBase.__repr__   s    r.   N)r7   r8   r9   r   r>   r.   r,   r   r      s    r.   r   joint_graphrA   outputssubgraphc                 `  
 t        j                         }i 
|D ]3  }|j                  |j                        }|j                  |_        |
|<   5 | j
                  D ]  }t        |      r|dk7  r
t        
|<   |
v r#|j                  dk(  r
t        
|<   <|j                  dk(  rt        j                  |j                  i |j                  }|D cg c]/  }t        |t         j                        rt        
|   t              1 }}t!        |      r
t        
|<   |j#                  |
fd      
|<   |j                  dk(  r|j#                  |
fd      
|<   |j                  dk(  s g }	|D ]s  }t        |t         j                        rF|
vrt%        d| d	      t        
|   t              rJ d| d
       |	j'                  
|          c|	j'                  |       u |j)                  t+        |	             |j-                          |j/                          |S c c}w )a  
    Given a graph, extracts out a subgraph that takes the specified nodes as
    inputs and returns the specified outputs.

    This includes specifying non-placeholder nodes as inputs.

    The general strategy is to initialize all inputs with proxies as we
    encounter them, and trace through the graph, only keeping values which take
    in valid proxies. Then, all dead code is eliminated.
    backwardplaceholdercall_functionc                     |    S r)   r>   xenvs    r,   rN   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>       CF r.   get_attrc                     |    S r)   r>   r   s    r,   rN   z4_extract_graph_with_inputs_outputs.<locals>.<lambda>   r   r.   outputr\   z couldn't be found in envz was invalid, but is output)r<   Graphr   namerk   rt   _must_be_in_backwardInvalidNodeoppytreearg_tree_leavesargskwargsr   r=   r   any	node_copyRuntimeErrorappendr   tupleeliminate_dead_codelint)r   rA   r   r   	new_graphr'   new_nodeall_argsr   output_valuesr   s             @r,   "_extract_graph_with_inputs_outputsr      s%     
I
C  ((3		D		 !! %(j*@#CI3; WW%#CIWW'--tyyHDKKHH "a) 3q6?3H 
 8}'D	!++D2BCCIWW
"!++D2BCCIWW 56 M 	$a!|"U1#-F#GHH!A 6qc456    Q(  #	$ U=)*!!#NN9s   4H+c                     | j                   dk(  xr3 dt        | j                        vxr t        |        xr t	        |        S Nr   tangents)r   strrz   _is_bwd_seed_offset_is_fwd_seed_offsetro   s    r,   
_is_primalr      sK    =  	*c$++..	*#D))	* $D))	r.   c                 R    | j                   dk(  xr dt        | j                        v S r   r   r   rz   ro   s    r,   _is_tangentr      s$    77m#F
c$++6F(FFr.   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   bwd_seedbwd_base_offsetr   ro   s    r,   r   r      =    77m# c$++&&O*;s4;;?O*Or.   c                     | j                   dk(  xr0 dt        | j                        v xs dt        | j                        v S )Nr   fwd_seedfwd_base_offsetr   ro   s    r,   r   r      r   r.   c                 v    | j                   dk(  xr) t        | j                  j                  d      t              S )Nr   r   )r   r   rk   rl   r   ro   s    r,   _is_backward_stater      s*    77m#W
499==3G(WWr.   c                 @    | j                   j                  dd       dk(  S )Npartitioner_tagis_backwardrk   rl   ro   s    r,   _has_tag_is_backwardr      s    99==*D1]BBr.   c                 @    | j                   j                  dd       dk(  S )Nr   must_be_in_backwardr   ro   s    r,   _has_tag_must_be_in_backwardr     s    99==*D15JJJr.   c                 L    t        |       xs t        |       xr t        |       S r)   )r   r   r   ro   s    r,   r   r     s&    '- T"<t'<r.   joint_modulec                    t        j                  d | j                  j                  d      D         }|d | }||d  }||fS )Nc              3   4   K   | ]  }|j                     y wr)   )r   )rJ   r'   s     r,   rL   z+_extract_fwd_bwd_outputs.<locals>.<genexpr>  s     	K$))	Ks   r   r   )r   r   rs   
find_nodes)r   num_fwd_outputsr   fwd_outputsbwd_outputss        r,   _extract_fwd_bwd_outputsr     sW     $$	K 2 2 = = = J	KG *?+K/*+K##r.   saved_valuesr   c                 V    | D ]$  }|j                   |k(  s| j                  |        y  y r)   )r   remove)r   r   saved_values      r,   _remove_by_namer     s0    # t#,r.   saved_sym_nodesr   c                Z   t        | |      \  }}| j                  j                  d      }g t        t        |      }g t        t
        |      }g t        t        |      }	g t        t        |      }
g t        t        |      }t        | j                  ||z   |z   |
z   |d      }|j                  d      D ]a  }|j                  s-t        ||j                         t        ||j                         <t        |      sHt        ||j                         |raJ  t               }g }g }|D ]C  }t        |      }|r#|j                  |       |j!                  |       3|j!                  |       E t#        | j                        }t%        j&                  |||      D ]]  }d|j(                  vrt+        |j(                  d         |z
  }t-        |d       D ]  }||vr|j!                  ||           ||z  }_ |j/                          |j1                  ||z          t        | j                  ||	z   ||z   |z   d      }t        | j                  ||z   |z   |
z   |z   |d      }t2        j4                  j7                  | |      }t2        j4                  j7                  | |      }||fS )	Nr   r   r   r   r   c                     | j                   S r)   )r   )ss    r,   rN   z*_extract_fwd_bwd_modules.<locals>.<lambda>Y  s
    166 r.   rO   forward)r   rs   r   filterr   r   r   r   r   r   usersr   r   setr   addr   r   	itertoolschainrk   r   rQ   clearextendr<   _lazy_graph_module_make_graph_module)r   r   r   r   r   r   placeholdersprimal_inputstangent_inputsfwd_seed_offset_inputsbwd_seed_offset_inputsbackward_state_inputs	bwd_graphr'   saved_symbolssaved_sym_nodes_bindingsaved_sym_nodes_derivedsymbolsymbol_bindingsnew_symbolsr   	fwd_graph
fwd_module
bwd_modules                           r,   _extract_fwd_bwd_modulesr     s     8o K  %%00M0BL7fZ67M9vk<89NIv&9<HIIv&9<HIGf%7FG2,&7:PP	I $$$6 )zzL$))4OTYY7%L$))4((() (+uM     1*40f%#**40#**401 3<3E3EFO 7~V %		!"499U#34}D)9: 	?A '#**?1+=>	? 	$%" 25LLM 3..l"_4	I 3
	
	 !	!  		 
 		I &&99,	RJ&&99,	RJz!!r.   c                   t        |       rt        | ||      S t        t        t        | j
                  j                              }t        t        t        | j
                  j                              }||z   }t        | |      \  }}t        | j
                  ||d      }|j                  D 	ch c]  }	|	j                  dk7  s|	j                    }
}	g }g }| j
                  j                  D ]  }	|	j                  |
vrt        |	      r|j                  |	       /d|	j                  vrA|	j                  dk(  r2|	j                  }t!        d |D              sJ |j#                  |       ~|	j                  D cg c]  }|j                  |
vs| }}d|	j                  v r$t!        d |D              r|j#                  |       |j                  |	        t        t$        j'                  |      j)                               }t        t$        j'                  |      j)                               }t+        | |||      S c c}	w c c}w )	a  
    Partitions the :attr:`joint_module` in a manner that closely resembles the
    behavior observed in the original ``.forward()`` and ``.backward()`` of the
    callable, i.e., the resulting forward graph contains those operators that
    are executed in the original ``.forward()`` callable passed to
    :func:`aot_function`.

    The default partitioner collects the operators that are between the forward
    inputs and the forward outputs. This helps in finding the tensors which have
    to be stashed for the backward pass. These stashed tensors become the output
    of the generated forward graph. The remaining operators are then placed in
    the backward graph.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    r   r   r   tensor_metar   c              3   V   K   | ]!  }|j                   t        j                  k(   # y wr)   )rz   operatorgetitem)rJ   users     r,   rL   z$default_partition.<locals>.<genexpr>  s     I4t{{h&6&66Is   ')c              3   2   K   | ]  }t        |        y wr)   r   rI   s     r,   rL   z$default_partition.<locals>.<genexpr>  s      2#$A2   r   r   )rv   #min_cut_rematerialization_partitionlistr   r   rs   rt   r   r   r   r   r   r   r   rk   r   allr   dictfromkeyskeysr   )r   _joint_inputsr   r   r   rA   r   r   forward_only_graphr'   forward_node_namesr   r   r   rK   backward_usagess                   r,   default_partitionr    s&   4 L)2-
 	
 
L,>,>,D,DEFM!&)<l>P>P>V>V"WX33F7o K <FK 166$''X:M		  LO""(( *99..t ""4($))+?0JJJEI5IIII&  ::7I)IO  		)c 2(72 /  &&7##D);*< l388:;L4==9>>@AO#''	 O&s   /I I :IIg    .Anumelc                      | |j                   z  S r)   )itemsize)r  dtypes     r,   _tensor_nbytesr    s    5>>!!r.   c                    dt         fdd| j                  v r| j                  d   }t        |t              ryt        |t        t
        f      rt        fd|D              S t        |t              r"t        fd|j                         D              S t        |t        j                        r |      S t        dt        |       d|        | j                  d	k(  ry
t        d|  d      )NrF   c                     t        | t        j                        syt        t	        | j                         d      | j                        S )Nr      fallback)r   r{   Tensorr  r   r  r  r   s    r,   object_nbytesz_size_of.<locals>.object_nbytes  s1    !U\\*hqwwy4@!''JJr.   r   r   c              3   .   K   | ]  } |        y wr)   r>   )rJ   rK   r  s     r,   rL   z_size_of.<locals>.<genexpr>  s     5A}Q'5s   c              3   4   K   | ]  \  }} |        y wr)   r>   )rJ   _rK   r  s      r,   rL   z_size_of.<locals>.<genexpr>  s     @DAq}Q'@s   zUnknown metadata type z	 on node r   r   r\   zO didn't have `val` metadata; we should always have `val` metadata on the nodes.)r^   rk   r   r   r  r   sumr  itemsr{   r  r   typer   )r'   r   r  s     @r,   _size_ofr"    s    KC K
 		iic<( dE]+5555T"@CIIK@@@U\\* %%3DI;ivNOOww*

vde r.   rs   c                     ddl m}  |t              }| j                  D ]3  }|j                  dk(  s||j
                  j                  xx   dz  cc<   5 t        t        |j                         d d             y )Nr   r   r   r   c                     | d   S Nr   r>   r  s    r,   rN   z_count_ops.<locals>.<lambda>  s
    AaD r.   TrP   reverse)
collectionsr   r^   rt   r   rz   r7   printrQ   r   )rs   r   cntr'   s       r,   
_count_opsr+    sa    '%c*C +77o%$$%*%+ 
&.$
?@r.   c                     g } t        t        j                  j                        D ]  }t	        t        j                  j                  |      }t        |t        j                  j                        sL|j                         D ]G  }t	        ||      }t        j                  j                  |j                  v s6| j                  |          | S r)   )dirr{   opsatengetattrr   _opsOpOverloadPacket	overloadsr|   	pointwiserx   r   )r.  	attr_nameopoverloadpacketoverloadop_overloads        r,   pointwise_opsr9    s    
C( 
	"599>>9=*EJJ,G,GH(224 	H!"2H=Kyy""k&6&66

+,	
 Jr.   	depth_mapc                     | D ci c]7  }t        |t        j                  j                  j                        s2|||   9 }}t        |j                         d d      S c c}w )Nc                     | d   S r%  r>   r  s    r,   rN   zsort_depths.<locals>.<lambda>  s
    AaD r.   Tr&  )r   r{   r<   r'   r=   rQ   r   )r   r:  arg
arg_depthss       r,   sort_depthsr?    s[    '+ #z#uxx}}?Q?Q/RYs^J  *""$.$GGs
   3A A gmc                   
 t        j                         i 
| j                  j                  d      D ]  }j	                  |
fd      
|<    i t        | j                  j                        D ]
  \  }}||<    
fd}t        t        t        | j                  j                              }d}t        j                  }|D ]#  }|j                  D ]  }|   |k  s|   }|} % || S t        | j                  j                        |   d D ]
  } ||        t        j                   j                  |       }	|	S )a  
    This pass finds the first bwd node in the graph (by looking at users of
    tangents) and then reorders the graph by walking from this node to all the
    way to the end of the graph. At each op in this traveral, we insert this op
    in a new graph and try to bring only the relevant subgraph from the other
    non-bwd edges relevant for this op. This closely mimics the behavior of
    autograd engine.

    Why is this pass required in the first place?

    This is an artifact of how partitioners work today. The starting point of
    partitioner is a joint graph, which is fwd and then bwd graph. In the case
    of checkpointing, we keep portions of fwd graph in their original place in
    the joint graph, while obtaining a bwd graph. As a result, the resulting bwd
    graph has copies of recomputed fwd subgraphs followed by the original bwd
    graph. If we run this naively, this leads to bad memory footprint, because
    the fwd subgraphs are live for way longer duration than necessary. This pass
    reorders the operations such that we prioritize the ops for the original bwd
    graph while only realizing those ops from the fwd graph that are necessary
    at any given point in the graph.
    r   r   c                     |    S r)   r>   r   s    r,   rN   z5reordering_to_mimic_autograd_engine.<locals>.<lambda>9  s    A r.   c                 *   | g}t               }t        |      dkD  rH|j                         } | |v s| v r'|j                  |        || j                  z  }t        |      dkD  rHt        |fd      }|D ]  } j                  | fd      | <    y )Nr   c                     |    S r)   r>   )rK   orders    r,   rN   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>L  s    %( r.   rO   c                     |    S r)   r>   r   s    r,   rN   zSreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph.<locals>.<lambda>N  r   r.   )r   lenpopr   all_input_nodesrQ   r   )r'   	cur_nodesinsertable_nodesr   r   rE  s      r,   insert_node_in_graphzAreordering_to_mimic_autograd_engine.<locals>.insert_node_in_graph?  s    F	5)nq ==?D''43;  & ---I )nq  ""28JK$ 	DD!++D2BCCI	Dr.   N)r<   r   rs   r   r   	enumeratert   r  r   r   mathinfr   r{   GraphModule)r@  r'   idxrL  r   first_node_in_bwdminimum_ordertangentr   new_gmr   r   rE  s             @@@r,   #reordering_to_mimic_autograd_enginerV    sL   . 
I"$C ##}#5 @''.>?D	@ Erxx~~. 	TdD$ &bhhnn=>NHHM! )MM 	)DT{]* %d$(!	))  	 RXX^^$U+<%=%?@ #T"# XX!!"i0FMr.   	fw_module	bw_modulenum_sym_nodesc                 :   t        j                         }d }d }d } ||       } ||      }	 ||      }
i }| j                  j                  D ]  }t	        |      st        |j                  d      s&t        j                  j                  |j                  j                  v sW||j                     }|	|j                     }|
|j                     }||d||<    t        j                  j                  j                  }t        j                  j                  j                  }d }|j                  j!                  d      D ]  }d|j                  v s|} n |t#        d	      g }|j%                         D ]  \  }}|d
   }|d   }|j                  }|j'                  |      5  |j)                  d||j                  g|j*                  |j,                        }|j)                  dt.        j0                  |dfi       }|j)                  dt.        j0                  |dfi       }|j3                  |       |j5                  |       |j7                  |       d d d        |j                  }|j'                  |      5  dt9        |       }|j;                  |      } | ||            |j<                  d<   d d d        |j'                  |      5  |j)                  d||j                  g|j*                  |j,                        }|j3                  |       |j5                  |       d d d         t9        t?        |j                  j!                  d                  }|j*                  d   }tA        |      |z
  }|d | tC        |      z   ||d  z   }|j                  jE                  |       |j                  j5                  |       |jG                          |jG                          ||fS # 1 sw Y   xY w# 1 sw Y   JxY w# 1 sw Y   xY w)Nc                    i }| j                   j                  D ]i  }|j                  dk(  st        |j                  d      s*t
        j                  j                  |j                  j                  v s[|||j                  <   k |S )Nr   rx   )
rs   rt   r   ry   rz   r{   r|   r}   rx   r   )gmodrandom_nodesr'   s      r,   get_rng_opsz*functionalize_rng_ops.<locals>.get_rng_ops  sl    JJ$$ 	/D?*DKK0II559I9II*.TYY'	/ r.   c                     d| j                   vry| j                   d   }t        |t              s|f}|D ]8  }t        |t        j                        s|j
                  j                  dk(  s8 y y)zV
        Check the example value of the node outputs to find the device type.
        r   Ncudacpu)rk   r   r   r{   r  devicer!  )r'   
candidates	candidates      r,   
get_devicez)functionalize_rng_ops.<locals>.get_device  sk     		!YYu%
*e,$J# 	"I)U\\2##((F2!	"
 r.   c                 p    | dk(  rt         j                  j                         S t        j                         S )Nr`  )r{   r`  get_rng_state)rb  s    r,   get_sample_rng_statez3functionalize_rng_ops.<locals>.get_sample_rng_state  s,    V::++--""$$r.   rx   )fwdbwdr   r   rT  zaCouldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see thisri  rj  r   )r   r   r   r   rng_state_output_r   r   )$r   countrs   rt   rp   ry   rz   r{   r|   r}   rx   r   _prims	rng_primsrun_and_save_rng_staterun_with_rng_stater   r   r   inserting_beforecreate_noder   r   r   r   replace_all_uses_with
erase_noder   nextr   rk   iterrG  r   r   	recompile) r   rW  rX  rY  uidr^  re  rh  joint_graph_rng_opsfw_graph_rng_opsbw_graph_rng_opsrecomputable_rng_ops_mapr'   	base_nodefw_nodebw_noderun_and_save_rngrp  bw_tangent_start_nodefw_rng_state_outputs	node_pairfw_graphfunctional_fw_nodestate
rng_outputbw_graph
state_namebw_rng_state_nodefw_output_node
fw_outputssym_node_start_idxr   s                                    r,   functionalize_rng_opsr  g  s'   2 //
C	$% &l3"9-"9-!""(( 	S4 V,		11T[[5E5EE+DII6I&tyy1G&tyy1G:A'2R$Y/	S ||--DD//BB **m*< 		!$(! $o
 	
  8 > > @ /)	9E"E"??&&w/ 	/!)!5!5 nn4w||4~~	 "6 " ((  (!,	 ) E "--  &  . J ))*5( ''.1	/6 ??&&'<= 	V,T#YK8J ( 4 4Z @,@GAT,U""5)	V
 &&w/ 		)!--"'G',,G~~	 . J ))*5(		) 		)M/)h $y99X9FGHN$$Q'JZ=8&&'
$
%	&
'(
)	* 
 OO7#OO~.iw	/ 	/8	V 	V
		) 		)s&   &B5O6 ;PAP6P 	P	P	c                    | j                   j                  D ]h  }t        |      s|j                  D ]K  }t        |      s|j                  d   |j                  d   kD  s/t
        j                  |j                  d<   M j | S )a  
    If there are two consecutive checkpointed blocks with no operator in
    between, we would still want to stash the tensor at the boundary of
    checkpointed blocks. The following pass makes the last output node
    non-recomputable to allow for that.
    ac_graph_idrj   )rs   rt   rp   r   rk   r   	MUST_SAVE)r   r'   r   s      r,   cleanup_recompute_tagsr    s|     ""(( H$

 H"4(		-0499]3KK-=-G-GDIIk*HH r.   	node_infomin_cut_optionsc                 L  &'()*+,-./01 
t               t               0t        r| j                  D ch c]F  }|j                  dk(  r5t        |j                  d      rt        |j                  j                        H }}|0j                  D ch c]  }t        |       c}z
  }t        d|       t                d 'd ('(0fd)	 dd l})0fd	++0fd
}
)fd*dt        f*0fd}|j                         /t               &&/0fd}| j                  D ]A  }|j                  dk(  r|j                   v rm|j"                  vr0/j%                  |j&                  dz   dt(        j*                         `/j%                  |j&                  dz   dt(        j*                         t-        |      r0/j%                  |j&                  dz   dt(        j*                         t/        |      st1        |      r ||       j3                  |      r |
|      r ||       d|j4                  vxr d|j4                  vxs8 d|j4                  v xr( t7        |j4                  d   t8        j:                         }t=        |      rt        t?        |            }nF|r<t7        |j4                  jA                  d      tB              rdnt(        j*                  }n ||      }/j%                  |j&                  dz   |j&                  dz   |       |jD                  D ]>  }/j%                  |j&                  dz   |j&                  dz   t(        j*                         @ D dtF        tH        jJ                     dtL        dtL        f)fd}jN                  r(jP                  D ]  }|jD                  D cg c]$  }j3                  |      rjS                  |      & }}|jD                  D cg c]  }j3                  |      s| }}tU        |      dkD  sw ||tW        |            }tY        |jD                        D ]x  }j3                  |      sjS                  |      |kD  s* )||      s4|&v r9tZ        j]                  d|jS                  |      ||jS                  |              ||       z  j^                  r^t               }| j                  D ]D  }j3                  |      sjS                  |      |fg}jS                  |      }tU        |      dkD  sJta        jb                  |      \  }}||v r,|je                  |       jS                  |      |dz   kD  rNtU        |      dk(  r@tZ        j]                  d||jS                  |      jS                  |              ||       |jD                  D ]J  }j3                  |      s )||      s|&vs$ta        jf                  |jS                  |      |f       L tU        |      dkD  rG 	 |ji                  /dd      \  }}|\  }.t               }/fd |D        D ]   \  1}|jw                  .1fd!|D               " t               } |D ](  \  }!}"|!d d" |"d d# k(  sJ |!d d" }#| je                  |#       * ty        |       ,t{        | j                        D $ci c]  \  }$}||$
 c}}$-t}        ,fd$| D        -fd%&      }%|%&fS c c}w c c}w # t        $ r}	t        d      |	d }	~	ww xY wc c}w c c}w # tj        $ rU t        d       t        djm                  |jn                  jp                  js                  /                   tu        /        w xY wc c}}$w )'Nr   _overloadpacketz$Ops banned from re-materialization: c                 D   |j                   t        j                  j                  j                  k7  ry|j
                  d   }t        j                  j                  j                  |      \  }}|D ].  }|j                  |   }| |u r yt        |t              s)| |v s. y yNFr   T)rz   r{   r.  higher_orderauto_functionalizedr   _higher_order_opsauto_functionalizeget_mutable_argsr   r   r  )ab
mutable_opmutable_arg_namesr  r   r=  s          r,   !can_fuse_into_auto_functionalizedz8solve_min_cut.<locals>.can_fuse_into_auto_functionalized*  s    88uyy--AAAVVAY
 ##66GG
S	
% 	 D((4.CCx#t$8	  r.   c                     |j                   t        j                  j                  j                  k7  ry|j
                  d   }|D ]  }|j
                  d   |   }| |u s y y)NFtensors_to_cloner   T)rz   r{   r.  r   triton_kernel_wrapper_functionalr   )r  r  r  r   r=  s        r,   .can_fuse_into_triton_kernel_wrapper_functionalzEsolve_min_cut.<locals>.can_fuse_into_triton_kernel_wrapper_functional;  sb    88uyy--NNNHH%78% 	D((8$T*CCx	 r.   c                     t        |      t        j                  k(  ry | |      ry | |      ryj                  |       xr j                  |      S )NT)r   r/  catr-   )r  r  r  r  op_typess     r,   r-   z!solve_min_cut.<locals>.is_fusibleE  sT     1),Q29!Q?""1%@(*=*=a*@@r.   r   zANeed networkx installed to perform smart recomputation heuristicsc                 *   j                  |       ry| h}t        |      dkD  ro|j                         }|j                  D ]A  }j	                  |      s ||      s yj                  |      s1|j                  |       C t        |      dkD  royr  )r4   rG  rH  r   rV   r   )r'   rJ  curr   r-   r  r  s       r,   is_materialized_backwardsz0solve_min_cut.<locals>.is_materialized_backwardsW  s    D!F	)nq --/C		 ( //5jd>S##D)MM$'	( )nq  r.   c                 T   | j                   dk7  ry| j                  t        j                  k(  ry| j                  j                  dd       t        j                  k(  ryt        j                  rj                  |       ry| j                  t        j                  j                  t        j                  j                  fv ryj                  rj!                  |       s$yj#                  |       sj%                  |       ryj&                  r3 |       r+t(        j+                  d| t-        | j.                               y| j0                  dk  r| j0                  t        j2                  kD  ryj4                  r/t7        d | j8                  D              }t;        |       }|dz  |k  S y)	Nr   Frj   Tzmaterialized backwards: %s %si  c              3   h   K   | ]*  }t        |t        j                        st        |       , y wr)   )r   r<   r=   r"  rJ   is     r,   rL   zBsolve_min_cut.<locals>.should_ban_recomputation.<locals>.<genexpr>  s&      % !*Q2H%s   22r   )r   rz   r   r   rk   rl   r   r  r   recompute_viewsr4   r/  lift_fresh_copydefault
lift_freshrg   r6   r2   r0   rf   loginfor   r   dist_from_bwmax_dist_from_bwrh   r  r   r"  )r'   input_tensors_sizeoutput_sizer  r  r  s      r,   should_ban_recomputationz/solve_min_cut.<locals>.should_ban_recomputatione  sc   77o%;;(***99==d+/?/I/II!!h&6&6t&<;;4//779P9PQQ22++D1!!$'8+H+H+N 77<U=
 HH4dE$**<MN t#(9(9F<S<S(S ++!$ %%)YY% " #4.K?%777r.   c                 f      j                   dk(  ryt         fd j                  D               S )Nr   Tc              3   0   K   | ]  } |        y wr)   r>   )rJ   r   r-   r'   s     r,   rL   z9solve_min_cut.<locals>.is_materialized.<locals>.<genexpr>  s     E$z$-Es   )r   r  r   )r'   r-   s   `r,   is_materializedz&solve_min_cut.<locals>.is_materialized  s*    77m#E$**EEEEr.   rF   c           
         t        |       }t        j                  r!j                  |       rt        j
                  S t        | j                  d   t              r-t        | j                  d   t        j                        st        S t        |dt        t        | j                  d      d      z  z        } |       r|S |dz  S )Nr   g?d   r      )r"  r   r  r4   rN  rO  r   rk   r   r{   r   INT_INFr^   maxminr  )r'   mem_szr  r  s     r,   get_node_weightz&solve_min_cut.<locals>.get_node_weight  s    $!!h&6&6t&< 88Odii&5dii.= Vsc#d.?.?*Eq&IIJK4 MA:r.   c                 8   j                  |       ry| v ryt        |       ryd| j                  v r(t        | j                  d   t        j
                        ryj                  |        j                  d| j                  dz   t        j                         y)NFr   source_incapacityT)r4   rp   rk   r   r{   r   r   add_edger   rN  rO  )r'   banned_nodesdont_bannx_graphr  s    r,   ban_recomputation_if_allowedz3solve_min_cut.<locals>.ban_recomputation_if_allowed  s    D!8 $DII*TYYu-=u~~"N
 	(DII$5Ir.   r   r  sinkr  _outr   r           start_nodes	max_rangec           
         g }| D ]*  }t        j                  |	j                  |      |df       , t        |      dkD  rt        j                  |      \  }}}|s	j                  |      S |j
                  D ]X  }	j                  |      s	j                  |      |kD  r*t        j                  |	j                  |      | ||      f       Z t        |      dkD  r|S )z
        Finds the first unfusible node in the chain of nodes starting from
        `start_nodes` and returns its position.
        Tr   )heapqheappushr]   rG  heappopr   rV   )
r  r  sorted_nodesrK   r  r'   node_is_fusibler   r-   r  s
           r,   find_first_unfusiblez+solve_min_cut.<locals>.find_first_unfusible  s    
 9; 	OANN<)*@*@*CQ)MN	O ,!#',}}\'B$At_" --d33

 ++D1 --d3i? NN$"//5tZd=ST		 ,!# r.   z1used above/below fusible %s:(%s) -> %s -> %s:(%s)r  ztoo long %s %s %s %sr  z-Failed to compute min-cut on following graph:
c              3   ,   K   | ]  }||   f  y wr)   r>   )rJ   rK   r  s     r,   rL   z solve_min_cut.<locals>.<genexpr>  s     8Q$8s   c              3   0   K   | ]  }|v s|f  y wr)   r>   )rJ   vnon_reachableus     r,   rL   z solve_min_cut.<locals>.<genexpr>  s     Aa=.@q!fAs   		c              3   (   K   | ]	  }|     y wr)   r>   )rJ   r'   name_to_nodes     r,   rL   z solve_min_cut.<locals>.<genexpr>  s     2d	2s   c                     |    S r)   r>   )r   node_idxs    r,   rN   zsolve_min_cut.<locals>.<lambda>  s    (1+ r.   rO   )?r   get_default_op_listAOT_PARTITIONER_DEBUGrt   r   ry   rz   r   r  r&   r)  networkxImportErrorr   floatDiGraphrC   rA   r  r   rN  rO  rp   r   r   rV   rk   r   r{   r  r   r   rl   r   r   r	   r<   r=   r^   rd   rS   r]   rG  r  r   r  r  re   r  r  r   r  minimum_cut	Exceptionjoin	readwriteedgelistgenerate_edgelistvisualize_min_cut_graphupdateget_name_to_noderM  rQ   )2r   r  r  r  r'   joint_module_opsr  ops_ignorednxer  r  r  is_non_tensor_nodeweightr   r  	used_nodeordersfw_usersfirst_unfusible_usevisited
start_nodefusiblestart_orderr  r  	cut_value	partition	reachablecutsetnbrs	cut_nodesnode_innode_out	node_namerQ  r   r  r  r  r-   r  r  r  r  r  r  r  r  s2    ```                                  @@@@@@@@@@@@r,   solve_min_cutr    s    5"$H $))
ww/)gdkkCT.U ++,
 

 '(:S:S)TQ#a&)TT4kB"	A0dF 2 zz|H5L* !! 3X77h9...9+++!!$))e"3Vdhh!O dii&0&488L$
 dii%/$((Kd248(.
 ##D).Ft.L(. "E}DII'EUtyy SDIIe4Dell)S%S 	 t=./F!$))--"6FDHH  %T*F$))e+TYY-?&QJJ 	XDdii&0$))e2CdhhW	Xe3XL$rww- C C . ,,"44 	;I &OO++D1 &&t,F  "+I4L4LT4RH  6{Q&:8S[&Q#!)//2 ;D!006%2248;NN&y$7</$O%%229=/ %2248 5T:!;	;P 11%%++ 	VJ++J7!..z:JGHG#00<Kg,"w/3'>C  **3/+2CCG)HH."!..s3!..z: 15II VD!006&sD1 4w1G1G1Mt0TUV) g,"	VB!~~h&I	9  )I}#&5F8i8 B4AdAAB I# !s|x},,,CRL	i !
 $K0L+4[5F5F+GHic4c	HH2	28ML %%_

 *UX  R
	v
N  =>dii--??IJK)	& IsI   A\\\ =)\56\:\:3\?  ^ 	\2!\--\2?A^c                    dd l }dd l}|j                  j                  |       j	                         }|j                  |      d   }|j                         D ]c  }| |j                            |j                            d   }|j                  t        |             |t        d      k(  sS|j                  d       e t        d       |j                  d       y )Nr   r  rO  redz2Visualizing the failed graph to min_cut_failed.svgzmin_cut_failed.svg)r  pydotnx_pydotto_pydot	to_stringgraph_from_dot_data	get_edges
get_sourceget_destination	set_labelr   r  	set_colorr)  	write_svg)r  r  r  
dot_format	dot_graphedger  s          r,   r  r    s    %%h/99;J))*5a8I##% "$//+,T-A-A-CDZPs6{#U5\!NN5!" 

>?,-r.   c                  L   g t         j                  t         j                  t         j                  t         j                  t         j
                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                   t         j"                  t         j$                  t         j&                  t         j(                  t         j*                  t         j,                  t         j.                  t         j0                  t         j2                  t         j4                  t         j6                  t         j8                  t         j:                  t         j<                  t         j>                  t         j@                  t         jB                  t         jD                  t         jF                  t         jH                  t         jJ                  t         jL                  t         jN                  t         jP                  t         jR                  t         jT                  t         jV                  t         jX                  t         jZ                  t         j\                  t         j^                  t         j`                  t         jb                  t         jd                  t         jf                  t         jh                  t         jj                  t         jl                  t         jn                  t         jp                  t         jr                  t         jt                  t         jv                  t         jx                  t         jz                  t         j|                  t         j~                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  } t         j                  t         j                  t         j                  g}|t         j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  gz  }|}| g t        j                  t        j                  t         j                  t         j                  t         j                  t        j                  t        j                  t         j                  t         j                  t        j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t        j                  t        j                  z  } | t         j                  t         j                  gz  } | |z  } | t               z  } | t         j                  gz  } | t        D cg c]  }t        |       c}z  } t        |       }t         j                  t         j                  t         j                  g}t         j                  t         j                  t         j                  t         j                  t         j                  t         j                  t         j                   t         j                  t         j                  t         j                  t         j                  g}|t        |      z  }t        t        |      t        |      t        |      t        |      t        |            S c c}w r)   )r/  r   subdivatan2mulr  r  pow	remainderfmod__and____or____xor__
__lshift__
__rshift__eqnegegtleltabsbitwise_notceilfloorfracnegreluroundsilutruncr  log10log1plog2lgammaexpexpm1erferfccosacoscoshsinasinsinhtanatantanhatanhsqrtrsqrt
reciprocalsigmoidsoftplus	thresholdthreshold_backwardclampwherelerpaddcmulgelugelu_backwardr  mean_grad_sum_to_sizesum_to_sizeamaxtotype_asr   r   squeeze	unsqueezersub_to_copyaliasviewslicetprimsbroadcast_in_dimexpand
as_stridedpermuteconvert_element_typeclone	full_likevarstdselect_unsafe_viewreshapebroadcast_tensorsscalar_tensorones	new_zerosr  arangetriuvar_meanisinfr   fullzerosempty
empty_likeargmaxmaximumiota)_low_memory_max_pool2d_offsets_to_indicesindexgatherr9  
zeros_liker   r   r   native_dropout	rand_like
randn_likemmconvolutionconvolution_backwardbmmaddmm#_scaled_dot_product_flash_attention'_scaled_dot_product_efficient_attention_flash_attention_forward_efficient_attention_forwardupsample_bilinear2d
_scaled_mmr!   )default_recomputable_opsrecomputable_view_opsr%   mr&   r$   r#   r"   s           r,   r  r    s~   L0L0L0 	L0 	

	L0
 	L0 	L0 	L0 	L0 	L0 			L0 	L0 	L0 	L0 	L0 	L0  	!L0" 	#L0$ 	%L0& 	'L0( 	)L0* 	+L0, 	-L0. 	/L00 			1L02 	

3L04 			5L06 	7L08 			9L0: 	

;L0< 			=L0> 	

?L0@ 	AL0B 	

CL0D 	

EL0F 			GL0H 	IL0J 	KL0L 	

ML0N 	OL0P 			QL0R 	SL0T 			UL0V 			WL0X 	YL0Z 			[L0\ 			]L0^ 	_L0` 			aL0b 			cL0d 	

eL0f 			gL0h 	

iL0j 	kL0l 	mL0n 	oL0p 	qL0r 	sL0t 	

uL0v 	

wL0x 			yL0z 	{L0| 			}L0~ 	L0@ 	AL0B 			CL0D 	EL0F 	GL0H 			IL0J 	KL0L 	ML0N 	OL0P 	QL0R 	SL0T 			UL0V 	WL0Z "\\4>>4::F		

  %H $!		$!""$! 	

$! 		$!
 	$! 			$! 			$! 	$! 	$! 	$! 	$! 	$! 			$! 	$! 	

$!  	!$!" 	#$!$ 	%$!& 			'$!( 	)$!* 	+$!, 	-$!. 			/$!0 	1$!2 	

3$!4 	5$!6 			7$!8 	9$!: 	

;$!< 	

=$!> 	?$!@ 	A$!B 	C$!D 	

E$!F 	77G$! $L T[[ 99(/!   N1!3A!6 NN34%%t~~tGJ!!

0044%%))   #S_4KK!"JH ' !Os   d!c                 J    i }| j                   D ]  }|||j                  <    |S r)   )rt   r   )rs   r  r'   s      r,   r  r  H  s.    L '"&TYY'r.   memoryruntimes
max_memoryc                     t              }t        t        |            }t        | fdd      }d}d}g }g }|D ]@  }	| |	   z   |k  r"| |	   z  }||	   z  }|j	                  |	       0|j	                  |	       B |||fS )Nc                     |    |    z  S r)   r>   )r  r  r  s    r,   rN   z!greedy_knapsack.<locals>.<lambda>V  s    fQi(? r.   Tr&  r  )rG  r  rangerQ   r   )
r  r  r  rK   r   total_memorytotal_runtimeitems_to_saveitems_to_allow_recomputingr  s
   ``        r,   greedy_knapsackr  O  s     	HAqNE 5?NELMM!# 1&)#z1F1I%LXa[(M  #&--a01 -)CCCr.   c           	         dd l }	 ddlm}m}m} |j                  |       }|j                  |      }| }	 |||j                  |            }
|
g}|j                  |	      } ||	|| |dd            }|j                  st        d      g }g }t        |j                        D ]-  \  }}|dk(  r|j                  |       |j                  |       / |j                   ||fS # t
        $ r t        d      d w xY w)Nr   )BoundsLinearConstraintmilpzHTo use the ILP for memory budget checkpointing you need to install scipy)Aubr   )cconstraintsintegralityboundszSomehow scipy solving failed)numpyscipy.optimizer  r  r  r  r   array	ones_likesuccessrM  r   r   fun)r  r  r  npr  r  r  	np_memorynp_runtimesr  memory_constraintr  r  resr  r  rQ  r  s                     r,   ilp_knapsackr  g  s    AA  I((8$K	A(9*9MN$%K,,q/K

+fQPQlC ;;9::M!#CEE" 3Q6  %&--c2	3
 GG8]$>>>5  V
	s   
C% %C;c                    d}t        j                  | D cg c]  }t        t        ||z               c}t         j                  d      }t        j                  |t         j
                  d      }t        t        ||z              }t        |       }t        j                  |dz   |dz   ft         j
                  d      }t        d|dz         D ]v  }	||	dz
     }
||	dz
     }||	dz
  d d f   ||	d d f<   |
dk(  r||	dz
  d d f   |z   ||	d d f<   Bt        j                  ||	dz
  |
d f   ||	dz
  d |
 f   |z         ||	|
d f<   x g }g }|}t        |dd      D ]a  }	||	   |   ||	dz
     |   k7  r7|j                  |	dz
         |t        ||	dz
     j                               z  }N|j                  |	dz
         c |j                          ||   |   j                         }|||fS c c}w )Ni'  ra  )r  rb  r   r   )r{   tensorr^   r:  longfloat32rG  r  r  r  r   itemr'  )r  r  r  Sr  quantized_memoryquantized_max_memoryrK   dpr  current_memorycurrent_runtimesaved_itemsrecomputable_itemsjmax_runtimes                   r,   dp_knapsackr    s)    	A ||$*+qU1q5\	+5::e ||HEMM%HH uZ!^45FA 
	
Q$q()u
B 1a!e_ )!a%0"1q5/ a!eQh<1a4 Q!a%(|o5Bq!tH%*]]1q5./)*1q5*N?**+o=&Bq./!"" K!A1a_ -a58r!a%y|#q1u%%a!e,11344A%%a!e,-  Q%,-224K%777c 	,s   G,c                     t         j                  }|dk(  rt        | ||      S |dk(  rt        | ||      S |dk(  rt	        | ||      S t        d|       )Ngreedyilpr  z,Not aware of memory budget knapsack solver: )r   activation_memory_budget_solverr  r  r  r   )r  r  r  SOLVERs       r,   #_optimize_runtime_with_given_memoryr    se    
 33Fvx<<	5FHj99	468Z88I&RSSr.   no_dispatchc                 F   	 t         j                  }d }|dk(  ry|dk(  rat               5  ddlm} t        j                  | j                   j                  f      \  	|j                  	 fd      }|cd d d        S |dk(  rudd	l
m} t        j                  | j                   j                  f      \  	 |d
      5 }  j                  i 	 d d d        j                         }t        |d      S t        d|       # 1 sw Y   y xY w# 1 sw Y   ?xY w)Nc                 2   t        | t        j                        rt        | j                  d   t        j
                        rqt        | j                  d   j                        }d }|D cg c]
  } ||       }}| j                  d   j                  || j                  d   j                        S t        | t        j                        rAt        | j                  d   t        j                        rt        | j                  d   d      S t        | t        j                        r(t        | j                  d   t        j                        ryt        | t        j                        r(t        | j                  d   t        j                        ry| S c c}w )	Nr   c                     t        | d      S )Nr  r  )r   )ds    r,   realize_symbolzAestimate_runtime.<locals>.materialize_arg.<locals>.realize_symbol  s    D11r.   r   )strider  r  g      ?T)r   r<   r=   rk   r{   r  r  shapenew_empty_stridedr  r   r   r   r   )r   r  r  r   s       r,   materialize_argz)estimate_runtime.<locals>.materialize_arg  s    a!j&M,,-E2 1661^A&6E666%=22aff]3:: 3   277#
166%=%,,(OAFF5MD99277#
166%=%..(Q277#
166%=%--(PH 7s   +Ftestingr   profiler   )benchmarkerc                  (     j                    i S r)   )rz   )r   r   r'   s   r,   rN   z"estimate_runtime.<locals>.<lambda>  s    ;4;;3O3O r.   flops)FlopCounterModeF)displayz Not aware of runtime estimator: )r   *activation_memory_budget_runtime_estimatorr  $torch._inductor.runtime.benchmarkingr  r   tree_mapr   r   benchmark_gputorch.utils.flop_counterr  rz   get_total_flopsr  r   )
r'   RUNTIME_MODEr  r  msr  modecounted_flopsr   r   s
   `       @@r,   estimate_runtimer    s   DDL( y 		"] 	H!???TYY<TULD&**+OPB	 	 
	 <DKK8PQfU+ 	)tDKK((	),,.=!$$=l^LMM#	 		) 	)s   ADDDD c                     !"#$% |dkD  s|dk  rt        d|       t        t        j                  t        j                  t        j
                  t        j                  t        j                        }t        j                  rt        |dddd      }|dk(  rj                  S t         |      \  }}|dk(  r|S dt        t        j                     dt        fd	 j                        # |      !!#k  r|S !#fd
}dt        t        j                     f!#fd}t        |ddd      }t         |      \  }	} ||	      |k  r|	S t        |d      t               \  }
} ||
      |k  r|
S ddlm j                  D ch c]
  } |       c} dt        t        j                     dt        t        j                     f fd} ||      }t%        |t&        d      t)              dk(  rj                  S D cg c]  } |t'        |             c}"D cg c]  }t+        |       c}%ddlm$  "$%fd}t        j0                  rag }t3        ddd      D ]6  } ||dz        \  }}|j5                  |t7        %      |z
   ||      f       8 dd lm} |D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|j=                  d       |j?                  ||d       tA        |      D ]"  \  }}|jC                  |d|||   fdd d!"       $ |jE                  d#       |jG                  d$       |jI                  d%       |jK                  d       |jM                         }|jO                          d&tQ                d'}|jS                  |       tT        jW                  d(|        ||)      d   S c c}w c c}w c c}w c c}w c c}w )*Nr   r   zJThe valid ranges for memory budget are 0 <= m <= 1. The provided value is )rd   re   rf   rg   rh   F)rd   re   rf   rg   r   rF   c                 :    t        t        t        |             dz  S N    eA)r  mapr"  )r   s    r,   estimate_activations_sizez:choose_saved_values_set.<locals>.estimate_activations_size,  s    3x./#55r.   c                     | dz  z
  z  S r  r>   )szmax_act_sizemin_act_sizes    r,   get_normalized_sizez4choose_saved_values_set.<locals>.get_normalized_size5  s    S\L899r.   activationsc                 &     |       z
  z
  z  S r)   r>   )r  r  r  r  s    r,   get_mem_ratioz.choose_saved_values_set.<locals>.get_mem_ratio8  s"    )+6E<'
 	
r.   )rd   re   rf   )rg   get_node_storager  c                 r    | D cg c]&  }|j                   t        d      k  r |      vr|( c}S c c}w r  )r  r^   )r  r  r   input_storagess     r,   get_recomputable_banned_nodesz>choose_saved_values_set.<locals>.get_recomputable_banned_nodesX  sD     "
 S)$Q'~= 
 	
 
s   +4Tr&  r  c           	                 5  t        t        | d            \  }}}d d d        t               }D ]  }|j                  	|           |j	                  	      sJ t        
|      \  }}|fS # 1 sw Y   VxY w)Nr   )r  r  r   r   issubsetr  )memory_budgetexpected_runtimesaved_node_idxsrecomputable_node_idxsr  rQ  r   r  aggressive_optionsall_recomputable_banned_nodesr   memories_banned_nodesr  r  runtimes_banned_nodess           r,   get_saved_values_knapsackz:choose_saved_values_set.<locals>.get_saved_values_knapsackv  s    ] 	
 4%'<c-QR>S	 &		 5) 	=CLL6s;<	=  !>???'	
a ---'	 	s   A::Br  r  r  )
      )figsizeo)markerz.2fzoffset points)r   r  center)
textcoordsxytexthazMemory Budgetz Runtime of Recomputed Componentsz:Pareto Frontier of Memory Budget vs. Recomputation Runtimememory_budget_pareto_z.pngz%Generated Pareto frontier curve at %sr  ),r   rc   r   ban_recompute_used_far_apart!ban_recompute_long_fusible_chains#ban_recompute_materialized_backwardban_recompute_not_in_allowlistban_recompute_reductionsaggressive_recomputationr   rA   r  r	   r<   r=   r  torch._inductor.fx_utilsr   rQ   r"  rG  r  torch.utils._mode_utilsr  visualize_memory_budget_paretor  r   r  matplotlib.pyplotpyplotfigureplotrM  annotatexlabelylabeltitlegridgcfshowr   savefigr  warning)&r   r  r  r  runtime_optimized_saved_valuesr  r  r  more_aggressive_optionsmore_aggressive_saved_values%aggressive_recomputation_saved_valuesr  r'   r  recomputable_banned_nodesr  r  optionssweep_memory_budgetr   r  pltr  x_valuesy_valuestxtfigfig_namer
  r  r  r   r  r  r  r  r  r  s&   ``                          @@@@@@@@@@r,   choose_saved_values_setr>  	  s,    qMA-XYfXgh
 	
 $$AA#)#K#K%+%O%O & E E88O &&!"'',).$)
 (5)%"A --6RWW 6% 6 -Y-=-=>L,-KLL|#--:
4= 

 &##(%*	 '4Y 7'# ! 12]B++  % ;HY 2;7)< :;mK4499B9I9IJ&t,JN	
DM 	
d277m 	
 !>l K %+!x%! ()Q.2O-.HQK( ,I#' 4. ., ,,#(b"#5 
	-F#c).*L* NN'-.1AA!,/	
	 	((/0DG00(/0DG00 	

7
#8C0  ) 	FAsLLs)hqk"*  	 	

?#

56		NOggi
*+=+?*@EH;XF %=A!DDO K0V 10s   OO2O ;O%O*c          
         | j                   j                          | j                          | j                   }t        j                  rt        |      }|| _         | j                   }t        |       }t        |       }|rt        |       } fd}	 |	|       }
t        |
j                        dk(  rt        | |      S t        | j                   j                        D ]  }|j                  dk(  rt        d      |_        #|
j#                  |      sd|_        <t        d      |_        |j$                  D ]*  }t'        |j                   |j                   dz         |_        ,  t        j(                  }|j                  D ]=  }t+        |j,                  j/                  dd      t0              s.|j,                  d   } n t3        ||
|	      }t5        t7        t8        |            }t5        t7        d
 |            }t;        | ||      \  }}|r|rt=        | ||t        |            \  }}t?        |      }t@        rddl!m"} |D ch c]
  } ||       }}tG        dtI        d |D              dz         tK        |D cg c]  }tM        |      tO        |      f c}      }|j                   j                  D ch c]  }|j                  dk(  s|jP                    }}|j                   j                  D ch c]  }|j                  dk(  s|jP                    }}||z  }tS        t              }|j                   j                  D ]R  }|jP                  |v stU        |jV                  d      s)|tO        |jV                  jX                        xx   dz  cc<   T tG        dt        |       dt        |       dt        |              tG        dtK        |j[                         d d             ||fS c c}w c c}w c c}w c c}w )ax  
    Partitions the joint graph such that the backward recomputes the forward.
    Recomputing helps in trading off memory bandwidth with computation.

    To create the fwd and bwd graph, we copy the joint graph, manually set the
    outputs to just original forward or backward outputs. And then we run the
    resulting graphs through dead code elimination.

    .. warning::
        This API is experimental and likely to change.

    Args:
        joint_module(fx.GraphModule): The joint forward and backward graph. This
            is the result of AOT Autograd tracing.
        _joint_inputs: The inputs to the joint graph. This is unused.
        compiler: This option determines the default set of recomputable ops.
            Currently, there are two options: ``nvfuser`` and ``inductor``.
        recomputable_ops: This is an optional set of recomputable ops. If this
            is not None, then this set of ops will be used instead of the
            default set of ops.
        num_fwd_outputs: The number of outputs from the forward graph.

    Returns:
        Returns the generated forward and backward Fx graph modules.
    c                    t        | j                        }t               }| j                  j                  D ]t  }|j                  dk(  r d|j
                  v r|j                  |       nt        |      r|j                  |       ||v sS|j                  D ]  }|j                  |        v t        t        t        | j                  j                              }t        t        t        | j                  j                              }||z   }t        |       \  }}	|j                  d |	D               t        | j                  ||d      }
|
j                  D ch c]   }|j                  dk7  r||j                      " }}| j                  j                  D ch c]  }||vr||vr| }}d}i }| j                  j                  D ]  }||v s|||<   |dz  } t#        |||||      S c c}w c c}w )	Nr   r   r   c              3   F   K   | ]  }||j                   dk7  s|  y w)Nr   r   )rJ   r  s     r,   rL   zNmin_cut_rematerialization_partition.<locals>.classify_nodes.<locals>.<genexpr>  s$      !
am8HA!
s   !!!r   r   r   r   )r  rs   r   rt   r   rz   r   r   r   r  r   r   r   r   r  r   r   r@   )r   r  rC   r'   r   r   r   rA   r   r   r  rS   rD   fw_cntrE   r   s                  r,   classify_nodesz;min_cut_rematerialization_partition.<locals>.classify_nodes  s   '(:(:;E &&,, 	0Dww-'J$++,E!%%d+%d+!%%d+(( JJ 0D%))$/0	0 VJ0B0B0H0HIJ!%&(:(:(@(@A"
 !77#;/$
 [ 	   !
"!
 	
 @Y

 +00+
ww(" #+
 +
 %**00
,,=N1N 
 

  &&,, 	D((!'!	 %'8/8
 	
!+


s   %G*G/r   r   r   r  r   r  Nr  c                     t        |        S r)   r  )rK   s    r,   rN   z5min_cut_rematerialization_partition.<locals>.<lambda>?  s    [^); r.   r  r  z Theoretical Activations Stored: c              3   2   K   | ]  }t        |        y wr)   )r"  r  s     r,   rL   z6min_cut_rematerialization_partition.<locals>.<genexpr>V  s     22r  r   r  z# remat/fw/bw: /zCount of Ops Rematerialized: c                     | d   S r%  r>   r  s    r,   rN   z5min_cut_rematerialization_partition.<locals>.<lambda>j  s
    1 r.   Tr&  ).rs   r   rw  r   cser   rv   r~   r  rG  rC   r  reversedrt   r   r^   r  rV   r   r  activation_memory_budgetr   rk   rl   r  r>  r  r   r   r   r  rV  r  r!  r   r)  r  rQ   r"  r   r   r   ry   rz   r  r   )r   r
  compilerr   rq   	cse_graphr   graph_has_recomputable_opsgraph_has_recomputable_rng_opsrC  r  r'   r   r  r   r   rW  rX  r   storagesr  sorted_sizesfw_module_nodesbw_module_nodesremat_nodescountss      `                      r,   r  r    s   B **,D zz &	&$$K!5l!C%=l%K"!-l;-
^ |,I
 9&&'1, -
 	
 ++112 R77h #CD))$/ !D #CD

 R$'(9(94;L;Lq;P$Q!RR 33M!! diimmOT:EB IIo6M
 +YmL 6+|<=O;\JKL 4''	Iy ")#8iC4H$ Iy 4I>I=7CDt$T*DD.2\22S8	
 lKSV4KL"+//"7"7
477o;UDII
 
 #,//"7"7
477o;UDII
 
 &7!,S!1OO)) 	>DyyK'GDKKAR,Ss4;;6678A=8	> 	c+./q_1E0FaOH\G]^	
 	+6<<>~tD	
 i3 E
 L

s$   O7 O<<PP9PPtracedfnamefigname
clear_metaprogparse_stack_tracedot_graph_shapec                    |rWt        j                  | j                        }t        j                  | |      } | j                  j
                  D ]	  }i |_         t        j                  j                  |      \  }	}
|
sdt        j                  z   }
t        d|	 |
        t        j                  | |||      }|j                         }t!        |d|
j#                  d      z         }|	 |
 }|	 ||       y  |||       y )N.zWriting FX graph to file: )rZ  r[  write_)rY  )copydeepcopyrs   r<   rP  rt   rk   ospathsplitextr   torch_compile_graph_formatr)  r   FxGraphDrawerget_main_dot_graphr0  lstrip)rU  rV  rW  rX  rY  rZ  r[  r   r'   baseextgr   write_methods                 r,   
draw_graphrl  o  s     MM&,,/		2LL&& 	DDI	  'ID#F555	&tfSE
23""+'		A 	
A1hC89LfSENE|UU&r.   r)   )r   )inductor)fx_graphTNFN)vr_  r_   r  r   loggingrN  r   ra  r(  r   dataclassesr   r   typingr   r   r	   r
   r   r   r   r   r{   torch._inductor.inductor_primstorch.fxr<   torch.utils._pytreeutils_pytreer   %torch.fx.experimental._backward_stater   "torch.fx.experimental.proxy_tensorr   r   torch.fx.experimental.sym_noder   r   %torch.fx.experimental.symbolic_shapesr   r   r   r   torch.fx.passesr   torch.utils.checkpointr    r   _aot_autograd.logging_utilsr   _aot_autograd.utilsr   compile_utilsr   r   sympydebug_partitionerr  	getLoggerr7   r  r.  r/  rj  r!   r@   rc   r=   ra   rp   rP  rv   r~   r^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r"  r+  	lru_cacher9  r?  rV  r  r  r  r  r  r  r  r  r  r  r  r"  r  r  r>  r  rl  r>   r.   r,   <module>r     s          	 # * S S S  %  $ $ ? H L  ) 3  ; 0 8  00 g!yy~~		 > > >2      :    T r~~ $ 2>> d  C  
  #	DDMD "'']D sm	D
 XXDNRWW  Gbgg G$ Gbgg $ bgg $ XRWW X XCrww C4 CKrww K4 Krww 4 $..$
4=$rww-'($$rww- s `"..`"rww-`" "'']`"
 `" 2>>2>>)*`"FS..S
2>>2>>)*Sl c("# " "277 s :Abhh A T "Hbggsl!3 HU277C<=P8Q HGBNN Gr~~ GTZ ..Z ~~Z  ~~Z  	Z 
 2>>2>>)*Z z BNN , 	z&z&z& #z&z."aW aHBHH DKD#';D<AD
5$s)T#Y&'D0!?K!?#';!?<A!?
5$s)T#Y&'!?H98K98#';98<A98
5$s)T#Y&'98xTKT5kT T 5$s)T#Y&'	T  0,N` ?@tEtE&.tE	"'']tEt l ..l  2>>2>>)*l d ,0#%)'HH  '' ' 	'
 5d3i(
)' ' c]' 
'r.   