
    sg                    d   d dl Z d dlZd dlmZmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmc mc mZ d dlmZ d dlmc mZ d dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ d d	l0m1Z1 d d
l2m3Z3 d dl4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z;  ejx                  e=      Z>dZ? G d de      Z@dej                  dee	e#   e	ej                     f   fdZBdej                  de	e#   fdZCde#dej                  deDfdZEe
de#dej                  de#fd       ZFde#dej                  fdZGe
de#dej                  ddfd       ZHe
de#ddfd       ZIe
de#de,dej                  dej                  ddf
d       ZKe
de#de,d eDfd!       ZLdee,   ddfd"ZMdee,   ddfd#ZNe
de#dee,   d$edej                  d%eed&f   d'eeOef   deeed&f   eeOef   f   fd(       ZPe
de#dee,   ddfd)       ZQe
de#dee,   d*edej                  d+ed,edefd-       ZRe
de#de,ddfd.       ZSe
de#dej                  ddfd/       ZTe
de#dej.                  j                  deeef   fd0       ZUe
de#dej                  de,d1edef
d2       ZVe
 ej                         de#de,d1efd3              ZXde#de,d1eddfd4ZYde#de,d1eddfd5ZZe
de#de,deDfd6       Z[e
de#de,ddfd7       Z\e
de#d8ej                  deej                  ej                  f   fd9       Z^e
de#de,d:ej                  dej                  fd;       Z_e
de#de,ddfd<       Z`e
de#de,d=ej                  fd>       Zae
de#de,d=ej                  fd?       Zbe
de,fd@       ZcdAej                  dBedddfdCZee
de#d:ej                  dDe+fdE       ZfdFej                  dGej                  ddfdHZge
de#deDfdI       Zhe
 ej                         de#dej                  fdJ              Zie
de#ddfdK       Zje
de#ddfdL       Zke
de#dMee,   dNe@ddfdO       Zle
de#dMe,de,fdP       Zmde,de.fdQZne
de#dej                  ddfdR       Zoe
de#dej                  ddfdS       Zpe
de#dej                  fdT       Zqe
de#dej                  dUede,ddf
dV       Zrde#dee,   ddfdWZsde#dee,   d%eed&f   d'eeOef   ddf
dXZte
de#dej                  ddfdY       ZudZej                  dej                  dej                  fd[Zvd\e	e,   fd]Zwe
de#dej                  dee	ej                     e	eej                        f   fd^       Zye
de#d_e	eO   de	ej                     fd`       Zzdae	ej                     dbe	eej                        dcej                  ddfddZ|y)e    N)autoEnum)AnyCallableDictListno_type_checkOptionalSetTuple)Variable)register_multi_grad_hook)LOW_PRECISION_HOOKS)_assert_in_training_states
_FSDPState_get_module_fsdp_state_is_composable_log_post_backward_hook_no_dispatch_record_streamclean_tensor_nameTrainingState)FlatParameterFlatParamHandleHandleShardingStrategyHandleTrainingState'RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES)HYBRID_SHARDING_STRATEGIES)BackwardPrefetch)_apply_to_tensors_cast_forward_inputs	_p_assert
_to_kwargs)_pytree)_use_orig_paramslimit_all_gathers_use_full_prec_in_evalc                   (    e Zd Z e       Z e       Zy)_PrefetchModeN)__name__
__module____qualname__r   BACKWARDFORWARD     X/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/fsdp/_runtime_utils.pyr(   r(   5   s    vHfGr/   r(   modulereturnc                     g }g }t               }| j                         D ]U  }t        |      }|||vst        ||      s#|j	                  |       |j                  |       |j                  |       W ||fS )a6  
    Returns a tuple containing:
    1. A list of the root ``_FSDPState`` instances in the module tree rooted at
    ``module`` without any duplicates and following the ``module.modules()``
    traversal order (which is assumed to be depth-first).
    2. A corresponding list of the root modules owning the states in the first
    list.

    This is similar to :func:`_get_fsdp_states_with_modules` except that we
    must call :func:`_is_fsdp_root` to force a lazy initialization to determine
    the FSDP root in case lazy initialization has not yet happened.
    )setmodulesr   _is_fsdp_rootaddappend)r1   fsdp_root_statesfsdp_root_modulesvisited_fsdp_states	submoduleoptional_states         r0   "_get_fsdp_root_states_with_modulesr>   :   s     *,)++.5^^% 	0	/	:&&99ni8##N3##N3$$Y/	0 ...r/   c                 "    t        |       \  }}|S )z/See :func:`_get_fsdp_root_states_with_modules`.)r>   )r1   r9   _s      r0   _get_fsdp_root_statesrA   Z   s    <VDar/   statec                 N    t        | |       | j                  J | j                  S )z
    Returns if ``state`` corresponds to that of an FSDP root.

    For the wrapper code path, ``state`` and ``module`` should be the same. For
    the non-wrapper code path, ``state`` should be ``module`` 's state.
    )
_lazy_init_is_rootrB   r1   s     r0   r6   r6   `   s(     uf>>%%%>>r/   root_modulec                    | j                   y| j                  j                         st        d      d| _         t	        | t
        j                  g       t        | |       t        j                  |      | _
        t        |        t        | |      \  }}t        ||| j                         | j                  j!                  | || j"                         t%        | |       | S )a  
    Performs initialization lazily, typically right before the first forward
    pass. The laziness is needed to ensure that the parameter device/dtype and
    the FSDP hierarchy have finalized. This method's actual logic only runs on
    the root FSDP instance, which performs initialization for all non-root FSDP
    instances to avoid partial initialization.

    For the non-composable code path, ``state`` and ``root_module`` should be
    the same, namely the FSDP instance itself.
    Nz(FSDP does not support CPU only executionT)rE   _device_handleis_availableRuntimeErrorr   r   IDLE%_check_flat_params_on_expected_devicetraversal_utils_get_fsdp_states_all_fsdp_states_init_streams'_get_buffers_and_dtypes_for_computation!_cast_buffers_to_dtype_and_devicecompute_device_exec_order_datainitprocess_group"_share_state_and_init_handle_attrs)rB   rG   buffersbuffer_dtypess       r0   rD   rD   m   s     ~~!,,. EFF ENu}'9'9&:;)%=,==kJE%DUKXG]%g}e>R>RS	{E4G4GH&uk:Lr/   c                    t        j                  d      }t        j                  |      D ]  }|j                  sS|j
                  j                  | j                  k7  r0t        d|j
                  j                   d| j                   d      |j                  so|j
                  j                  |k7  st        d|j
                  j                   d       y)z
    Checks that all ``FlatParameter``s in ``module`` 's tree managed by
    ``state`` are on the expected device for *lazy initialization*.
    cpuz6An FSDP-managed module unexpectedly has parameters on z". Make sure to move the module to z before training.zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on zG. Make sure to not move the module from CPU when offloading parameters.N)torchdevicerN   _get_fsdp_handles_offload_params
flat_paramrT   rK   )rB   r1   
cpu_devicehandles       r0   rM   rM      s    
 e$J!33F; &&!!((E,@,@@H$$++,,N''((9; 
 ##(9(9(@(@J(N%%+%6%6%=%=$> ?KL r/   
root_statec                    | j                   }|r|j                          i }t        D ]  }t               ||<    | j                  j
                  | _        | j                  D ]  }|j                  }t        |d      rt        d      |j                  duxr t        d |j                  D              |_        |j                  sdt        j                  j                  d        | j                   D ]6  }t        D ]9  }t#        t        ||      d|        ||   j%                  t'        ||             ; || u rKt#        |j(                  du xs |j(                   d       d|_        | j*                  |_        | j,                  |_        | j.                  |_        | j0                  |_        | j2                  |_        | j                  |_        | j4                  |_        |j6                  | j2                  |j6                  _        |j                   }|s'|j                          9 |j;                         D ]$  \  }}t=        |      d	k7  st?        d
| d|        y)z
    Shares data structure state from the ``root_state`` to all FSDP states in
    ``root_module`` 's module tree, and initializes handle attributes. These
    are done together to require a single loop over the states.
    _in_backward_optimizerszDFSDP optimizer in backward only supported with use_orig_params=True!Nc              3   4   K   | ]  }t        |d         yw)rf   N)hasattr).0params     r0   	<genexpr>z5_share_state_and_init_handle_attrs.<locals>.<genexpr>   s      O
:?GE45O
s   zfsdp.optimizer_in_backwardzFSDP state missing attribute zcNon-root FSDP instance's `_is_root` should not have been set yet or should have been set to `False`F   z"Expects one homogeneous value for z	 but got ) _handleinit_flat_param_attributesHOMOGENEOUS_ATTR_NAMESr4   rU   all_handles_all_handlesra   rh   rK   _paramsany_has_optim_in_backwardr]   _C_log_api_usage_oncerP   r!   r7   getattrrE   _unshard_stream_post_backward_stream_pre_unshard_stream_all_reduce_stream_default_stream_free_event_queue_fsdp_extensioncompute_streamitemslen
ValueError)rd   rG   rc   attr_name_to_values	attr_namera   
fsdp_stateattr_valuess           r0   rX   rX      sm    F))+/1+ /	),I&/(99EEJ)) 
G&&
:89V  )3(:(:$(F )
3 O
CMCUCUO
 L
% ((HH(()EF
G !11 0
/ 	OI
I./	{;  	*..wz9/MN	O # 	4'Bz/B/B+B9	

 $
%/%?%?
"+5+K+K
()3)G)G
&(2(E(E
%%/%?%?
"&0&A&A
#'1'C'C
$%%18B8R8RJ&&5##--/;0< #6";";"= 	;{q 4YKyV r/   c                 l   | j                   sJ | j                  j                         sJ t        d | j                  D              }| j
                  r|rdnd}| j                  j                         | _        | j                  | j                  | j                  _	        | j                  j                  |      | _        | j                  j                  |      | _        | j                  j                  |      | _        |r | j                  j                         | _        y| j                  | _        y)z
    Initializes CUDA streams for overlapping communication, computation, and
    data transfers. The streams should be shared across FSDP instances.
    c              3   @   K   | ]  }|j                   t        v   y wN)sharding_strategyr   )ri   r   s     r0   rk   z _init_streams.<locals>.<genexpr>   s$       	$$(BB   r   N)priority)rE   rI   rJ   rs   rP   r%   current_streamr|   r~   r   Streamrx   ry   rz   r{   )rB   uses_hybrid_shardinghigh_prioritys      r0   rQ   rQ      s    >>>,,... 00   116JBPQM!00??AE(/4/D/D, "00777OE #("6"6"="=}"="UE !& 4 4 ; ;] ; SE *>##% 
CHCXCX 
r/   rc   unshard_streampre_unshard_streamc                 (   |sy| j                   j                  |      5  |j                         }ddd       r|j                  |       | j                  rT| j
                  j                         }|r8t        j                  j                  d      5  |j                          ddd       | j                   j                  |      5  |j                          |j                          ddd       y# 1 sw Y   xY w# 1 sw Y   ZxY w# 1 sw Y   yxY w)a'  
    Unshards the handles in ``handles``. If the handles are in
    :meth:`summon_full_params` and are using mixed precision, then they are
    forced to full precision.

    Postcondition: handle's ``FlatParameter`` 's data is the padded
    unsharded flat parameter on the compute device.
    Nz%FullyShardedDataParallel.rate_limiter)rI   streampre_unshardwait_streamr%   r}   dequeue_if_neededr]   profilerrecord_functionsynchronizeunshardpost_unshard)rB   rc   r   r   ran_pre_unshardevents         r0   _unshardr     s     				$	$%7	8 / ,,./""#56''99;//7 $ !!#$ 
			$	$^	4  / /$ $ s#   C0C<!D0C9<DDfree_unsharded_flat_paramc                 J   |j                  |       | j                  ro|rmt        j                  j                  j                         sE| j                  j                         }|j                          | j                  j                  |       |j                          d|_        y)z
    Reshards the handle. ``free_unsharded_flat_param`` indicates whether to
    free the handle's padded unsharded flat parameter.
    FN)reshardr%   r]   distributed_functional_collectivesis_torchdynamo_compilingrI   Eventrecordr}   enqueuepost_reshard_prefetched)rB   rc   r   
free_events       r0   _reshardr   1  s     NN,-#<  88QQS --335J##++J7
 Fr/   c                 *    | r| j                          y y r   )unshard_gradrc   s    r0   _unshard_gradsr   I        r/   c                 *    | r| j                          y y r   )reshard_gradr   s    r0   _reshard_gradsr   P  r   r/   
unshard_fnargs.kwargsc                    t         j                  j                  d      5  |r*|j                  t        j
                  k(  r||fcddd       S t        j                  | _        | j                  j                  ||j                         |rt        j                  |_        |	 || |       t        | |       |r|j                  r}|j                  j                   gt        j"                  |j                  j$                  t        j&                  d            j)                  | j*                        |j                  _        | j,                  xr | j,                  j.                   }|r>| j0                  j2                  r(| j0                  j4                  }t7        |g|i |\  }}t9        | |||       ||fcddd       S # 1 sw Y   yxY w)a0  
    Runs the pre-forward logic. This includes an opportunity to unshard
    currently sharded parameters such as those for the current forward and
    registering post-backward hooks for these current parameters. This function
    also converts forward ``args`` and ``kwargs`` to the given precision.

    Args:
        handles (List[FlatParamHandle]): Handles giving the parameters used in
            the current forward.
        unshard_fn (Optional[Callable]): A callable to unshard any currently
            sharded parameters or ``None`` to not do any unsharding.
        module (nn.Module): Module whose forward this method runs right before;
            expected by the hook signature.
        args (Tuple[Any, ...]): Module forward ``args``.
        kwargs (Dict[str, Any]): Module forward ``kwargs``.
    z%FullyShardedDataParallel._pre_forwardNr\   r^   )r]   r   r   _training_stater   BACKWARD_PREr   FORWARD_BACKWARDtraining_staterU   record_pre_forwardtrainingr-   _register_post_backward_hookr`   ra   	_cpu_grad
zeros_like_local_shardr^   
pin_memoryrT   rm   _force_full_precisionmixed_precisioncast_forward_inputsparam_dtyper    )_register_post_backward_reshard_only_hook)rB   rc   r   r1   r   r   should_cast_forward_inputsinput_dtypes           r0   _pre_forwardr   W  s   2 
	'	'(O	P # f,,0C0P0PP
 <# #  -==11&&//J%8%@%@F"!uf% 	%UF3 f,,1B1B1L1L1T*/*:*:!!..u||E7J+j 4 4j5 '
 MME%--"E"EE 	# &%*?*?*S*S161F1F1R1RK/MdMfMLD&1%vNV|G# # #s   #GE*GG
c                 *   |sy|j                   s"t        | || j                  | j                         d|_        t
        j                  j                  j                         sd| j                  j                         }| j                  #|j                  | j                         d| _        n|j                  | j                         t
        j                  j                  d      5  t!        | |t"        j$                         ddd       y# 1 sw Y   yxY w)z'Unshards parameters in the pre-forward.NFz.FullyShardedDataParallel._pre_forward_prefetch)r   r   rx   rz   _needs_pre_forward_unshardr]   r   r   r   rI   r   _unshard_event
wait_eventr   r   r   _prefetch_handler(   r-   )rB   rc   r   s      r0   _pre_forward_unshardr     s       5 5u7P7PQ(-F%44MMO--<<>+%%e&:&:;#'E &&u'<'<=		'	'8
 ? 	(=(=>? ? ?s   $D		D
reshard_fninputoutputc                    t         j                  j                  d      5  |r(|j                  t        j
                  k(  r|cddd       S | j                  j                  |       |	 || |       t        | |||      }t        j                  | _        |rt        j                  |_        |cddd       S # 1 sw Y   yxY w)a  
    Runs the post-forward logic. This includes an opportunity to reshard
    currently unsharded parameters such as those used in the current forward
    and registering pre-backward hooks on the forward outputs.

    Args:
        handles (List[FlatParamHandle]): Handles giving the parameters used in
            the current forward.
        reshard_fn (Optional[Callable]): A callable to reshard any currently
            unsharded parameters (e.g. from the current forward) or ``None`` to
            not do any resharding.
        module (nn.Module): Module whose forward just ran, which should be a
            fully sharded module (see [Note: Fully Sharded Module]); expected
            by the hook signature.
        input (Any): Unused; expected by the hook signature.
        output (Any): Forward pass output; pre-backward hooks are registered on
            the tensors that require gradients in this output.

    Postcondition: Each ``FlatParameter`` 's data points to the sharded flat
    parameter.
    z&FullyShardedDataParallel._post_forwardN)r]   r   r   r   r   r   rU   record_post_forward_register_pre_backward_hooksr   rL   r   )rB   rc   r   r1   r   r   s         r0   _post_forwardr     s    < 
	'	'(P	Q  f,,0C0P0PP	  	226:!uf% .eVVVL,11%8%=%=F"  s   !B6A!B66B?c                 f    |sy| j                    xr |j                  t        v }t        | ||       y)z(Reshards parameters in the post-forward.N)rE   _sharding_strategyr   r   )rB   rc   r   s      r0   _post_forward_reshardr     s?     
 NN 	Q%%)PP  UF56r/   c                 :   t         j                  j                  d      5  t        | |       t	        | j
                  dud       | j
                  s/t        |       rt        | |||      cddd       S ||fcddd       S | j                  }|r|j                  }nd}|rft        t        |j                               j                         t        | j                  j                               | j                          d| _        nrt%        | dd      ret'        | |      \  }}t)        |      dkD  rAt)        |      dkD  r3t+        d	 t-        ||      D              rt        ||| j                          d| _        | j.                  rPg }| j0                  D ]*  }	|	j                  s|j3                  |	j                         , |D ]  }d|_        d|_         t9        | j:                  j=                         | j>                  | j@                         tC        | jD                         t         j                  j                  d
      5  tG        ||| j                   d      \  }
}ddd       
d   }d   }t        | |||      cddd       S # 1 sw Y   *xY w# 1 sw Y   yxY w)a  
    Runs pre-forward logic specific to the root FSDP instance, which should run
    before any individual module's pre-forward. This starts with an attempt at
    lazy initialization (which only runs non-vacuously once). Otherwise, if
    this is called on a non-root FSDP instance, then it returns directly.

    Args:
        module (nn.Module): Module for which this logic tries to run. It may or
            may not be the root. If not, then this method does not do anything.
    z*FullyShardedDataParallel._root_pre_forwardNz$Expects a root FSDP to have been setT)rY   rZ   r^   !_needs_buffer_dtype_restore_checkFr   c              3   @   K   | ]  \  }}|j                   |k7    y wr   dtype)ri   bufferbuffer_dtype_for_computations      r0   rk   z$_root_pre_forward.<locals>.<genexpr>-  s'      < < LL$@@r   z#FullyShardedDataParallel._to_kwargs)$r]   r   r   rD   r!   rE   r   _root_cast_forward_inputrm   r   rS   dictnamed_buffersvalueslist_buffer_name_to_orig_dtyperT   r   rw   rR   r   rs   zipforward_prefetchrP   r8   r   r   _wait_for_computation_streamrI   r   rx   rz   %_reset_flat_param_grad_info_if_neededrq   r"   )rB   r1   r   r   rc    should_cast_buffers_to_full_precrY   buffer_dtypes_for_computationhandlesr   
args_tuplekwargs_tuples               r0   _root_pre_forwardr     s   " 
	'	'(T	U OE5&!%..,.TU~~ e$/vtVLOE OE <OE OE" /5/K/K,/3,+-V1134;;="5#C#C#J#J#LM++ 7;E3U?G 8vF-7|aC(E$F$J @C!>A  6!>@T@T 7<E3!!G#44 7
%%NN:#5#567 " +481%*"+ 	%  //1!!%%	

 	.e.@.@A
 ^^++,QR 	'1fe22E($J	 !}a'vtVD_OE OEP	 	QOE OEs8   A	J3J D J!BJ JJJ	
JJc                    | j                   r| j                   j                   }nd}|j                  xs | j                   xr |xr | j                  j
                  }|r(| j                  j                  }t        |g|i |\  }}||fS NT)rm   r   r   r&   r   cast_root_forward_inputsr   r    )rB   r1   r   r   force_full_precisionr   r   s          r0   r   r   V  s     }}#(==#F#FF# 
	< < <<VBV"9



8
8  "-2-B-B-N-N+KI$I&If<r/   unusedc                     |rt        |d      r|j                  r|S t        j                  j	                  d      5  | j
                  r.| j                  s"t        | |       t        | j                         nI|rGt        j                  g}t        |       r|j                  t        j                         t        | |       t        j                  | _        |s|cddd       S t"        j$                  |_        |j(                  r|j*                  s"t-        | || j.                  | j0                         t        j2                  j4                  j7                         s3| j8                  j;                         j=                  | j.                         d|_        t        j                  j	                  d      5  t?        | |t@        jB                         ddd       |jE                          d|_        |cddd       S # 1 sw Y   +xY w# 1 sw Y   yxY w)z
    Prepares ``_handle`` 's ``FlatParameter`` s for gradient computation.

    Args:
        module (nn.Module): Fully sharded module (see [Note: Fully Sharded
            Module]).
    _ran_pre_backward_hookz+FullyShardedDataParallel._pre_backward_hookNFz/FullyShardedDataParallel._pre_backward_prefetchT)#rh   r   r]   r   r   rE   _post_backward_callback_queued&_register_post_backward_final_callbackr   rq   r   rL   r   r8   r   r   r   r   r   r   _needs_pre_backward_unshardr   r   rx   rz   r   r   r   rI   r   r   r   r(   r,   prepare_gradient_for_backward)rB   r1   rc   gradr   allowed_statess         r0   _pre_backward_hookr  j  s   $ 	F45))		'	'(U	V + >>%"F"F25&A1%2D2DE+001Ne$%%m&D&DE&un=,== #+ +$ "5!A!A-- %%))--	 $$<<UUW$$335AA%BWBWX .3*^^++=
 	D UFM,B,BC	D 	,,.(,%W+ +J	D 	DK+ +s+   BH"CH2G8 H8H	=HHc                 @   t        | |t               |j                  }d|_        t        j
                  j                  j                  d      5  t        | t        j                  g       t        |j                  t        j                  t        j                  fv d|j                          t        j                  |_        |j                   
	 ddd       y|j                   j"                  rt%        d      t'        | |       | j(                  s&|j*                  r|j-                          	 ddd       yt        j.                  j0                  j3                         s3| j4                  j7                  | j8                  j;                                | j8                  j=                  | j4                        5  |j                   j>                  }tA        |       sc|j                   jB                  |jD                  k7  r@|jF                  s4|j                   jI                  |jD                        |j                   _        |jJ                  rtM        | |       ntO        | |       tQ        || j4                         ddd       ddd       y# 1 sw Y   xY w# 1 sw Y   yxY w)a  
    Reduce-scatters the gradient of ``handle`` 's ``FlatParameter``.

    Precondition: The ``FlatParameter`` 's ``.grad`` attribute contains the
    unsharded gradient for the local batch.

    Postcondition:
    - If using ``NO_SHARD``, then the ``.grad`` attribute is the reduced
    unsharded gradient.
    - Otherwise, the ``_saved_grad_shard`` attribute is the reduced sharded
    gradient (accumulating with any existing gradient).
    Tz,FullyShardedDataParallel._post_backward_hookz8Expects `BACKWARD_PRE` or `BACKWARD_POST` state but got Nz,FSDP does not support gradients of gradients))r   loggerra   _post_backward_calledr]   autogradr   r   r   r   r   r!   r   r   r   BACKWARD_POSTr   requires_gradrK   _post_backward_reshard_sync_gradientsr$   _use_unsharded_grad_viewsr   r   r   ry   r   rI   r   r   data_low_precision_hook_enabledr   _reduce_dtyper   touses_sharded_strategy_reduce_grad_reduce_grad_no_shardr   )rB   rc   ra   r   autograd_computed_grads        r0   _post_backward_hookr    s%   ( E662""J'+J$		 	 	0	06
 4 	#5=+I+I*JK
 	""#002E2S2STUFvG]G]F^_	

 "5!B!B??"!4 4" ??((MNNuf-$$&&00214 48   88QQS''33$$335 !!(()D)DE 	%/__%9%9"/6OO))V-A-AA 44'1'9'9&:N:N'O
$++UF+%eV4 '&(C(C#	C4 4B	 	C4 4s4   BJAJ7B J7C J7JJ	JJc                     t         j                  j                  d      5  t        j                  | _        t        j                  |_        t        | |       d d d        y # 1 sw Y   y xY w)Nz9FullyShardedDataParallel._post_backward_hook_reshard_only)
r]   r   r   r   r   r   r   r  r   r	  )rB   rc   r   s      r0    _post_backward_reshard_only_hookr    sV    
 
	'	'C
 .  -==!4!B!Buf-. . .s   7A  A)c                     t        | |      }t        | ||       t        j                  j	                  d      5  t        | |t        j                         d d d        y # 1 sw Y   y xY w)Nz0FullyShardedDataParallel._post_backward_prefetch)_should_free_in_backwardr   r]   r   r   r   r(   r,   )rB   rc   r   r   s       r0   r	  r	    sb    
 !9 GUF56
 
	'	':
 @ 	(>(>?@ @ @s   AA'c                 \    |j                   sy| j                  xs |j                  t        v S )zh
    Returns whether FSDP should free the unsharded flat parameter in the
    post-backward or not.
    F)r  r
  r   r   )rB   rc   s     r0   r  r    s5     ''
 	 	P$$(OOr/   c                 
   |j                   }|j                  t        j                  t        j                  fv }|j
                  j                  }d|_        t        | |      \  }}| j                  Ot        || j                         |j                  r|j                  n| j                  }t        j                  |||       |rt         j"                  j$                  j'                         s%| j(                  j+                  | j,                         | j.                  j1                  | j(                        5  t3        || j(                         t        j4                  || j6                         t        || j8                         t;        | ||      }t=        | ||       	 ddd       yt        || j8                         n| j                  | j>                  ||       t;        | ||      }t=        | ||       y# 1 sw Y   XxY w)z
    For sharded strategies, this runs gradient reduction, sharded gradient
    accumulation if needed, and the post-reduction callback.
    Ngroup) ra   r   r   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   r  _get_reduce_scatter_tensors
_comm_hook_div_if_needed_gradient_predivide_factor_use_fake_reduce_fake_process_grouprW   distreduce_scatter_tensorr]   r   r   r   r{   r   ry   rI   r   r   
all_reduce_inter_node_pg_gradient_postdivide_factor_accumulate_sharded_grad_post_reduce_grad_callback_comm_hook_state)	rB   rc   ra   uses_hybrid_sharded_strategyunsharded_gradpadded_unsharded_gradnew_sharded_gradpggrad_to_offloads	            r0   r  r  3  s    ""J#)#<#<++22A $   __))NJO.I~/++ ,e.N.NO && &&$$ 	
 	""!	

 ($$<<UUW((44U5P5PQ%%,,U-E-EF  ++;U=U=UV 08L8LM/1R1RS":6#3# +5&/J  	')J)JK""$9;K	
 /uf>NOOufo>' s   8A)G99Hr.  c                 "   t        |j                  | j                              }| j                  |d   j                         z  |j                         z
  }|dkD  rt	        j
                  |d|g      n|}t        j                  |d         }||fS )zO
    Returns the input and output tensors to reduce-scatter, respectively.
    r   )r   chunk
world_sizenumelFpadr]   
empty_like)rB   r.  chunksnumel_to_padr/  r0  s         r0   r  r  n  s     .&&u'7'789F##fQioo&77.:N:N:PPL4@14Dnq,/0.  ''q	2 "222r/   sharded_gradc                     |j                   }t        | ||       t        |d      }|r,t        ||j                         |xj                  |z  c_        n||_        |j                  }|S )z
    Accumulates the reduce-scattered sharded gradient with any existing sharded
    gradient if needed, returning the gradient to offload (if CPU offloading is
    enabled).
    _saved_grad_shard)ra   _cast_grad_to_param_dtyperh   _check_grad_to_accumulater>  )rB   rc   r<  ra   accumulate_gradr2  s         r0   r*  r*  ~  sg     ""Je\:> j*=>O!,
0L0LM$$4$'3
$ 22Or/   c                    |j                   }| j                  lt        |j                  | j                         t        j                  |j                  | j                         t        |j                  | j                         n&| j                  | j                  |j                         |j                  st        | |j                  |       |j                  j                  }t        | ||       y)z
    For no-shard, this runs gradient reduction (which directly covers any
    gradient accumulation implicitly) and the post-reduction callback.
    Nr  )ra   r   r!  r   r"  r%  r'  rW   r)  r,  _keep_low_precision_gradsr?  r  r+  )rB   rc   ra   r2  s       r0   r  r    s     ""Jz(H(HI
u/B/BCz(I(IJ//A ++!%*E oo**Oufo>r/   r2  c                 4    t        | ||       t        |       y)z
    This callback captures any logic to run after the gradient reduction
    finishes. Currently, this offloads the gradient to CPU if CPU offloading is
    enabled and uses sharded gradient views if ``use_orig_params=True``.
    N)_offload_grad%_post_backward_use_sharded_grad_views)rB   rc   r2  s      r0   r+  r+    s     %1)&1r/   c                     |j                   sy |j                  xr |j                   }|j                  j                  j                  |j                         |       t        |j                  | j                         y )N)non_blocking)
r`   r  rt   ra   r   copy_detachr   r  ry   )rB   rc   r2  rH  s       r0   rE  rE    sp     !! //U8U8U4UL
%% | &  33U5P5PQr/   c                    | j                   sy | j                          | j                          | j                  r| j	                          | j
                  j                  D ]O  }|j                  t        |d      s|j                  D ]  }|j                           j                  d       Q | j                          | j                  rd | j
                  _        y y y )Nrf   T)set_to_none)r$   _reset_is_grad_none_use_sharded_grad_viewsrt   prepare_gradient_for_optimra   rr   r   rh   rf   step	zero_gradr   r`   r   )rc   
orig_paramoptims      r0   rF  rF    s    ""   ""$$$))+ ++33 	2J*w50 (?? !EJJL! D1	2 	446!!*.F' "! %r/   tensor
div_factorc                 2    |dkD  r| j                  |       y y )Nrl   )div_)rT  rU  s     r0   r!  r!    s    A~J r/   rj   c                 <   t        | t        j                  g       t        |       sv|j                  |j                  k7  r\|j
                  }|j
                  j                  |j                        |_        t        || j                  j                                yyy)a  
    Casts ``sharded_grad`` back to the full parameter dtype so that the
    optimizer step runs with that dtype. This performs an actual cast if
    1. parameters were in reduced precision during the forward since then
    gradients would be in that reduced precision, or
    2. parameters were not in reduced precision but gradients were in
    reduced precision for communication.
    However, if a low precision communication hook is registered, then this
    dtype cast happens in the hook instead.
    r   N)
r   r   r   r  r   r  r  r   rI   r   )rB   r<  rj   low_prec_grad_datas       r0   r?  r?    s      u}'E'E&FG&u-,2D2D2S)..(--00u{{0C 	# 4 4 C C E	
 3T-r/   r0  accumulated_gradc                     t        |j                  | j                  k(  d|j                   d| j                          t        |j                  | j                  k(  d|j                   d| j                          y )NzDShape mismatch when accumulating gradients: existing gradient shape=z new gradient shape=zFDevice mismatch when accumulating gradients: existing gradient device=z new gradient device=)r!   shaper^   )r0  rZ  s     r0   r@  r@    s     "2"8"88	##3#9#9": ;.445	7 #3#:#::	$$4$;$;#< =/667	9r/   c                 &    | j                   t        v S r   )r   r   )rB   s    r0   r  r  *  s    222r/   c                    t        | j                  d       | }|j                  r| j                  j	                         }|j                  |j                         |j                  |ur|j                  |j                         |j                  j                  r(| j                  j	                         j                          |j                  j                          | j                  D ]m  }t        |       t        |       t         j"                  |_        |j&                  }|s=d|_        d|_        d|_        t.        j"                  |_        d|_        o d|_        y)z
    This waits for the post-backward to finish and performs some final cleanup.
    This runs at the end of the entire backward pass and should only be called
    on the root FSDP instance.
    zJThe post-backward callback should only be called on the root FSDP instanceFN)r!   rE   r
  rI   r   r   ry   r{   cpu_offloadoffload_paramsr   rU   	next_iterrP   _catch_all_reshard_finalize_paramsr   rL   r   rm   r   r   _post_forward_indexr   r   r   r   )rB   r1   rd   r   r   rc   s         r0   _post_backward_final_callbackre  /  s)    T J!!--<<> 	"":#C#CD((>&&z'D'DE!!00   //1==?))+,, 
'
:&$$1$6$6
!##,1F)16F.)-F&%8%=%=F"!&F
' 16J-r/   c           
         	 | j                   r| j                   j                  j                         | j                   j                  j                  j                         k(  xr | j                   j                   }|ryt        | | j                         }t        | | j                   |       yy# t        $ r$}t        dd|  dt        |       d       |d}~ww xY w)a{  
    Reshards the parameters that may not have been resharded in the
    post-backward hook. This can happen when a module's output is used in the
    forward pass, meaning that its pre-backward hook runs (unsharding the
    parameter), but the post-backward hook does not run because the output was
    not jused in the loss computation corresponding to this backward pass.
    NFz+Got exception in the catch-all reshard for : )raise_assertion_error)
rm   ra   data_ptrr   _skipped_use_sharded_viewsr  r   	Exceptionr!   str)rB   already_reshardedr   es       r0   rb  rb  ^  s    == ((113==++88AACD A
 @@@  !(@(V%UEMM+DE   9%3q6(K"'	

 s   A:B, =-B, ,	C5CCc                    | j                   }|sy|j                  }t        j                  j                  j                         r+t        |d      r|j                  }|j                          |`n~t        |d      rrt        |j                        }t        |j                        dz   }t        ||k(  d|j                          |j                  d   j                          t        |d       |j                  rG| j                  sy|j                   s|j#                          t        t        |d      d       d	|_        yy)
z3Finalizes the parameters before the next iteration.N_post_backward_hook_handle_post_backward_hook_staterl   z(Invalid: ``_post_backward_hook_state``: r   r  z@Expects `_post_backward_called` to be set on the `FlatParameter`F)rm   ra   r]   r   r   r   rh   rp  remover   rq  intr  r!   delattrr
  rt   rO  r  )rB   rc   ra   pbhs_handlepost_backward_hook_state_len%expected_post_backward_hook_state_lens         r0   rc  rc    s(   
 ]]F""J00IIK:;<$??K 5::;+.z/S/S+T(47
8P8P4QTU4U1,0UU::;_;_:`a 004;;=J ;<$$
 ,,--/J 78N	
 ,1
(  r/   current_handleprefetch_modec                 z   |syt        | |      }|sy|j                  }|t        j                  k(  rt        j
                  |_        nD|t        j                  k(  rt        j                  |_        nt        d| j                   d|       t        | || j                  | j                         ||_        d|_        y)zt
    Prefetches the next handles if needed (without synchronization). An empty
    handles key cannot prefetch.
    NzInvalid prefetch mode on rank rg  T)_get_handle_to_prefetchr   r(   r,   r   r   r-   r   rankr   rx   rz   r   )rB   rx  ry  rc   prev_training_states        r0   r   r     s     $UN;F !00...!4!A!A	-//	/!4!<!<9%**RWXX UFE1153L3LM0FFr/   c                    t        |      }t        j                  t        j                  t        j                  f}t        ||v d| d|        | j                  }d}|t        j                  k(  r| j                  t        j                  k(  s0|t        j                  k(  rP| j                  t        j                  k(  r3|j                  |      }|r|j                  r|j                  s|}|S d}|S |t        j                  k(  r=| j                  r1|j                  |      }|r|j                  r|j                  s|}|S d}|S )aS  
    Returns a :class:`list` of the handles keys to prefetch for the next
    module(s), where ``current_handle`` represents the current module.

    "Prefetching" refers to running the unshard logic early (without
    synchronization), and the "next" modules depend on the recorded execution
    order and the current training state.
    z!Prefetching is only supported in z but currently in N)_get_training_stater   r   r  r-   r!   rU   backward_prefetchr   get_handle_to_backward_prefetchr   r   r   get_handle_to_forward_prefetchr   )rB   rx  r   valid_training_stateseodtarget_handletarget_handle_candidates          r0   r{  r{    sS    )8N(())##
 //
+,A+B C&'	)
 
 
 C/3M-:::##'7'D'DD-;;;##'7'E'EE"%"E"En"U#'CC+773M  !M  
.66	65;Q;Q"%"D"D^"T#'BB+773M  !Mr/   c                 2    t        | d       | j                  S )z8Returns the training state of the handles in ``handle``.zExpects a non-empty handle)r!   r   r   s    r0   r  r    s     f23!!!r/   c                 L   | j                   D ]  }|j                           | j                   j                          | j                  j	                  |d      }t        j                  t        | |t              }| j                   j                  |j                  |dd             y)z5
    Registers a pre-forward hook on ``module``.
    NTprependwith_kwargs)_pre_forward_handlesrr  clear_fully_sharded_module_to_handleget	functoolspartialr   r   r8   register_forward_pre_hookrB   r1   forward_handlemodule_param_handlehooks        r0   _register_pre_forward_hookr    s      44   	$$&??CCFDQe02FD 
%%((t(Nr/   c                 F   | j                   D ]  }|j                           | j                   j                          | j                  j	                  |d      }t        j                  t        | |t              }| j                   j                  |j                  |             y)z
    Registers a post-forward hook on ``module``. Even if the module has no
    handles, we should register the hook since it will register the module's
    pre-backward hook.
    N)_post_forward_handlesrr  r  r  r  r  r  r   r   r8   register_forward_hookr  s        r0   _register_post_forward_hookr    s      55   	%%'??CCFDQ	D 
&&v'C'CD'IJr/   c                    | j                   D ]  }|j                           | j                   j                          t        j                  t
        |       }| j                   j                  |j                  |dd             y)a  
    Registers root pre-forward hook on ``module``, which should be the local
    FSDP root.

    NOTE: For the current composable FSDP design, we have each application of
    ``fully_shard()`` to a module to indicate that that module is the local
    FSDP root. We may remove this assumption in the future, in which case we
    will need to register this root pre-forward hook on any candidate module
    that may be the local FSDP root.
    Tr  N)_root_pre_forward_handlesrr  r  r  r  r   r8   r  )rB   r1   r  r  s       r0   _register_root_pre_forward_hookr  6  ss      99   	##))+.6D	##**((t(Nr/   outputsc                      t        j                         s|S  j                  rd _        rd_        d_        dt         j                  dt         j                  f fd}t        ||      S )a  
    Registers pre-backward hooks on the tensors that require gradients in the
    forward pass outputs ``outputs``, which were computed using the
    ``FlatParameter`` s of ``handles``.

    Args:
        module (nn.Module): Fully sharded module (see [Note: Fully Sharded
            Module]).

    Returns:
        Forward pass outputs with pre-backward hooks registered to tensors that
        require gradients.
    Ftr2   c           
          | j                   r[| j                  t        j                  j                  j                  t        j                  t                           rd_	        | S r   )
r  register_hookr]   utilshooksunserializable_hookr  r  r  r   )r  rc   r1   rB   s    r0   _register_hookz4_register_pre_backward_hooks.<locals>._register_hooko  sU    ??OO!!55%%&8%P
 592r/   )r]   is_grad_enabledrE   r   r   r   Tensorr   )rB   r1   r  rc   r  s   `` ` r0   r   r   N  sd    ,   "~~/4,-2* ).%	%,, 	5<< 	 ^W55r/   c                 h   t        j                         sy|sy|j                  }t         j                  j                  j                         rOt        |d      }|s|j                  syt        j                  t        | |      }|j                  |      }||_        yt        |d      }|s|j                  sy|j                  |      }t        |j                  dud       |j                  j                   d   d   }|J |j#                  t        j                  t        | |            }||f|_        y)a  
    Registers post-backward hooks on the ``FlatParameter`` s'
    ``AccumulateGrad`` objects to reshard and to reduce-scatter gradients.

    The ``AccumulateGrad`` object represents the last function that finalizes
    the ``FlatParameter`` 's gradient, so it only runs after its entire
    gradient computation has finished.

    We register the post-backward hook only once in the *first* forward that a
    ``FlatParameter`` participates in. This relies on the ``AccumulateGrad``
    object being preserved through multiple forwards.

    NOTE: We follow this heuristic to prefer the *first* forward to target the
    parameter mixed precision case, where there are *separate*
    ``AccumulateGrad`` objects across the different forwards. (Without
    parameter mixed precision, the ``AccumulateGrad`` objects are the same.) If
    we instead prefer the *last* forward, then the hook runs early.
    Nrp  rq  zZThe `grad_fn` is needed to access the `AccumulateGrad` and register the post-backward hookr   )r]   r  ra   r   r   r   rh   r  r  r  r  "register_post_accumulate_grad_hookrp  	expand_asr!   grad_fnnext_functionsr  rq  )rB   rc   ra   already_registeredr  hook_handletemp_flat_paramacc_grads           r0   r   r   }  s'   0   """J00IIK$Z1MNZ%=%=  !4eVD CCDI0;
-$Z1LMZ%=%=$..z:##4/.	

 #**99!<Q?###,,15&A
 19+/F
,r/   c                 N   t        j                         syd}|sy|j                  }t         j                  j                  j                         rt        |d      }nt        |d      }|s|j                  ry|Gt        j                  |i |}|D cg c]'  }t        j                  |      s|j                  s&|) }}|J t        |t        j                  t        | |            }	t         j                  j                  j                         r|	|_        y|	f|_        yc c}w )a  
    Registers post-backward hooks to reshard flat parameters that do not
    require gradient. We register these using multi-post-grad hooks on the
    input activations to ensure that all gradients that may depend on the
    parameters have been computed before resharding.
    Nrp  rq  )r]   r  ra   r   r   r   rh   r  pytreearg_tree_leaves	is_tensorr   r  r  r  rp  rq  )
rB   rc   r   r   inp_tensorsra   r  	args_flatobjr  s
             r0   r   r     s      " 15K""J00IIK$Z1MN$Z1LMZ55**D;F;	$
(<ARARC
 
 """*Y&&'GPVWK 00IIK0;
-0;~
,
s   D"-D":D"c                 P   t        | j                  d       | j                  ryt        | t        j
                  g       t        j                  j                  j                         s@d| _        t        j                  j                  t        j                  t        | |             yy)z
    Registers the post-backward final callback that runs at the end of the
    backward pass. This should be called from the root FSDP instance at the
    beginning of the pre-backward.
    zFOnly the root FSDP instance should register the post-backward callbackNT)r!   rE   r   r   r   rL   r]   r   r   r   r   _execution_enginequeue_callbackr  r  re  rF   s     r0   r   r     s     P ++u}'9'9&:;44MMO/3,""11;UFK	
 Pr/   computation_streamc                     t         j                  j                  j                         ry|j	                  |        |j	                  |        y)z
    Has the unshard and pre-unshard streams wait for the computation stream.
    For example, this should be called in the FSDP root's pre-forward to
    respect optimizer step computation.
    N)r]   r   r   r   r   )r  r   r   s      r0   r   r     s?     00IIK12 ""#56r/   r   c                 r    t        | t              s| g} | D ]  }|j                  s|j                          ! y)z
    Clears the original parameters' gradients if needed. This method's CPU
    overhead is minimal, so we may call it throughout FSDP methods, which serve
    as callsites to free the gradient memory earlier.
    N)
isinstancer   r$   r   )r   rc   s     r0   r   r     s9     gt$) ;""88:;r/   c                 0   t        | j                  d       g }g }t               }t        j                  |      \  }}t        t        |      t        |            D ]  \  }}|j                         D ]i  \  }	}
|
|v r|j                  |
       t        |	      |j                  v r4|j                  |
       |j                  |j                  j                         k  t        |      t        |      k(  sJ t        |       dt        |              ||fS )a  
    Returns all buffers in the module tree rooted at ``root_module`` and a
    corresponding list of the buffer dtypes for computation. Each buffer dtype
    is either ``None`` if buffer mixed precision is not enabled or the buffer
    low precision dtype otherwise.
    z Expects the root to cast buffers )r!   rE   r4   rN   _get_fsdp_states_with_modulesr   reversedr   r7   r   _ignored_buffer_namesr8   r   buffer_dtyper   )rB   rG   rY   rZ   visited_buffersfsdp_statesfsdp_modulesr   fsdp_modulebuffer_namer   s              r0   rR   rR     s    enn@A"$G13M),O !0 M M!K $'x'<h|>T#U J
K#.#<#<#> 	JK(' -1Q1QQNN6"  !;!;!H!HI	JJ w<3}--U#g,q]AS@T/UU-M!!r/   buffer_namesc           
          g }|D ]b  }t        || j                  v | d| j                   d| j                  j                                 |j	                  | j                  |          d |S )zF
    Returns the original buffer types of the given buffer names.
    z+ is missing from pre-computed dict on rank z, which only has keys )r!   r   r|  keysr8   )rB   r  rZ   r  s       r0   _get_orig_buffer_dtypesr  >  s     (*M# L5;;;mFzzl0//44679	
 	U==kJKL r/   rY   rZ   r^   c           	      8   t        |du xs t        |       t        |      k(  dt        |        dt        |              t        | |      D ]L  \  }}t        j                  |      r||j                  |      |_        5|j                  ||      |_        N y)z
    Casts ``buffers`` to the dtypes given by ``buffer_dtypes`` and moves them
    to ``device``. If an element in ``buffer_dtypes`` is ``None``, then the
    corresponding buffer is only moved to ``device``.
    NzfExpects `buffers` and `buffer_dtypes` to have the same length if `buffer_dtypes` is specified but got z and r   )r^   r   )r!   r   r   r]   is_floating_pointr  r  )rY   rZ   r^   r   r  s        r0   rS   rS   R  s     CW]1C!C003G~U}
	  !$G] ; G&&v.,2F ))6)2FK ))6)FFK	Gr/   )}r  loggingenumr   r   typingr   r   r   r   r	   r
   r   r   r]   torch.distributedr   r%  'torch.distributed.fsdp._traversal_utilsfsdp_traversal_utilsrN   torch.nnnntorch.nn.functional
functionalr7  torch.autogradr   torch.autograd.graphr   (torch.distributed.algorithms._comm_hooksr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   r   "torch.distributed.fsdp._init_utilsr   torch.distributed.fsdp.apir   torch.distributed.utilsr   r    r!   r"   torch.utilsr#   r  	getLoggerr)   r  ro   r(   Moduler>   rA   boolr6   rD   rM   rX   rQ   r   r   r   r   r   rl  r   r   r   r   r   r   r  no_gradr  r  r	  r  r  r  r  r*  r  r+  rE  rF  floatr!  r?  r@  r  re  rb  rc  r   r{  r  r  r  r  r   r   r   r   r   r   r   rR   r  r^   rS   r.   r/   r0   <module>r     s
      Q Q Q    A A    # 9 H	 	 	  J 7  * 
		8	$ D 
/II/
4
T"))_,-/@")) Z0@ 
 
RYY 
4 
   D RYY 0 >>> 
> >B ""	" "J  LL 	
 
 @   $ ._%	_%	 ;;_%; ; II	;
 S/; cN; 5c?DcN*+; ;| ??_%? 
? ?4 ,,_%, , II	,
 , , 	, ,^ 777 
7 7" _E_EII_E
 
_E _ED $xx
38_ & BBIIB B
 B 	B BJ III 	I  IX... . 
	. @@@ @ 
	@"  
 & 7?
 7?O 7? 7? 7?t 33',||3
5<<%&3 3  ,, \\	 2 ? ?_ ? ? ?( 222 \\	2 2 RRR \\R R2 // / /: 5<<  U  t  
 

,,
 
 
:llll 
$ 3z 3d 3 3 *6*6II*6  *6Z ""	" "J $1$1	$1 $1N _- ! 
	 < 44#4 4 4n""" II 
 & KKIIK 
K K, II . +6+6II+6 +6 	+6
 
+6 +6\5G5G_%5G 
5Gp*>*>_%*> S/*> cN	*>
 
*>Z 

!yy
	
 
.77LL7 7(;/"; """ 4tHU[[$9::;" "@ s) 
%++ &G%,,G-.G LLG 
	Gr/   