
    sg                       d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlm	Z	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dl mc m!Z" d dlm#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z) d dl*m+Z+m,Z,m-Z-m.Z. d dl/m0Z0 d d	l1m2Z2 d
dl3m4Z4m5Z5m6Z6 g dZ7 ejp                  e9      Z:	 dZ;dZ<dZ=dZ>dZ?dZ@ G d de      ZAeAj                  eAj                  fZDeAj                  eAj                  fZG G d de      ZH G d de      ZI G d de      ZJ G d de      ZK G d de0      ZL G d d ej                  eL!      ZN G d" d#      ZOd$ej                  d%eQd&ej                  d'dfd(ZRd$ej                  d%eQd)e#d'dfd*ZSd$ej                  d%eQd+ee#ej                  f   fd,ZTd-eeejF                  ej                  f      d'eej                     fd.ZUd/eej                  e#f   d'e#fd0ZVd1ej                  fd2ZX ej                  d3      d4        ZZd5e[d6ej                  d7e\d8ej                  fd9Z^ ej                  d
      d:ej                  d;eQfd<       Z` ej                  d
      d:ej                  d;eQfd=       Za ej                  d
      d:ej                  d;eQfd>       Zbd? Zcd@ejF                  dAe[fdBZdd)e#fdCZey)D    N)autoEnum)
accumulatechain)AnyCallablecastDict	GeneratorIteratorList
NamedTupleno_type_checkOptionalSequenceSetTupleUnion)Tensor)_FSDPDeviceHandle!_named_parameters_with_duplicates_no_dispatch_record_stream_set_fsdp_flattenedHandleTrainingState)_alloc_storage_data_ptr_allocated_free_storage	_p_assert)_ParameterMeta)FakeProcessGroup   )_ext_post_unflatten_transform_ext_pre_flatten_transformFSDPExtensions)FlatParameterFlatParamHandleFlatParamShardMetadata	ParamInfoSharedParamInfoHandleShardingStrategyFSDP_USE_UNSAFE_SETATTRFSDP_SKIP_WRITEBACK_CHECKFSDP_USE_FULL_PREC_IN_EVAL*   FSDP_USE_FAKE_ALL_GATHERFSDP_USE_FAKE_REDUCEc                   R    e Zd Z e       Z e       Z e       Z e       Z e       Zy)r*   N)	__name__
__module____qualname__r   
FULL_SHARDSHARD_GRAD_OPNO_SHARDHYBRID_SHARD_HYBRID_SHARD_ZERO2     U/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/fsdp/_flat_param.pyr*   r*   v   s&    JFMvH6L&r;   r*   c                   D    e Zd ZU dZeed<   ej                  ed<   eed<   y)r(   z&Information for an original parameter.
param_namemodulemodule_nameNr2   r3   r4   __doc__str__annotations__nnModuler:   r;   r<   r(   r(      s    0OIIr;   r(   c                   v    e Zd ZU dZeed<   ej                  ed<   eed<   eed<   ej                  ed<   eed<   y)	r)   ai  
    Additional information for a shared parameter.

    For each shared parameter, we designate one module and its parameter
    variable to be the primary owner, determined as the first one encountered
    in the parameter walk. These are prefixed with "prim". The primary module
    and parameter do not have their own :class:`SharedParamInfo` instance.
    r>   r?   r@   prim_param_nameprim_moduleprim_module_nameNrA   r:   r;   r<   r)   r)      s4     OIIr;   r)   c                   \    e Zd ZU dZeed<   ee   ed<   ee   ed<   ee   ed<   ee   ed<   y)_ShardParamInfoz4Shard-related information for an original parameter.in_shardoffset_in_shardnumel_in_shardintra_param_start_idxintra_param_end_idxN)r2   r3   r4   rB   boolrD   r   intr:   r;   r<   rL   rL      s6    >N c]"SM! $C=(!#&r;   rL   c                       e Zd ZU dZeedf   ed<   eej                  df   ed<   ee	df   ed<   eee	e	f   df   ed<   y)r'   a  
    This holds metadata specific to this rank's shard of the flat parameter.

    Attributes:
        param_names (Tuple[str, ...]): Prefixed parameter names of this rank's
            shard of the parameters; see :class:`FlatParameter`.
        param_shapes (Tuple[torch.Size, ...]): Parameter shapes of this rank's
            shard of the parameters; see :class:`FlatParameter`.
        param_numels (Tuple[int, ...]): Parameter numels of this rank's shard
            of the parameters; see :class:`FlatParameter`.
        param_offsets (Tuple[Tuple[int, int], ...]): [start, end] offsets (in
            units of numels) giving this rank's part of each flattened
            original parameter.
    .param_namesparam_shapesparam_numelsparam_offsetsN)
r2   r3   r4   rB   r   rC   rD   torchSizerS   r:   r;   r<   r'   r'      sP     sCx 

C((S/!sCx#-..r;   r'   c                       e Zd Zd Zy)_FlatParameterMetac                 T    t        |t        j                        xr t        |dd      S )N_is_flat_paramF)
isinstancerY   r   getattr)selfinstances     r<   __instancecheck__z$_FlatParameterMeta.__instancecheck__   s)    (ELL1 
g&7
 	
r;   N)r2   r3   r4   rc   r:   r;   r<   r\   r\      s    
r;   r\   c                      e Zd ZU dZej
                  ed<   ej
                  ed<   ej
                  ed<   eed<   ee	df   ed<   eej
                  df   ed<   ee
df   ed	<   eee   df   ed
<   eedf   ed<   eedf   ed<   eedf   ed<   eedf   ed<   eej"                     ed<   eed<   eed<   eed<   eed<   eeef   ed<   eed<   eed<   eed<   eed<   eeej(                        ed<   eeej(                        ed<   eeee         ed<   eee      ed<   ee   ed<   d+dZed ee	   d!ee   d"eej
                     d#ee
   d$ee   d%eee      d&eeej(                        d'eeej(                        d(ee   d)dfd*       Zy),r%   a  
    This is the flat parameter used by :class:`FullyShardedDataParallel`.

    It is comprised of one or more original parameters, which are flattened and
    concatenated to construct the flat parameter.

    Under the current design, this parameter logically represents both the
    unsharded and sharded flat parameter, and its data changes storages
    dynamically.
        - In the :class:`FullyShardedDataParallel` constructor, the parameter
        is initialized as unsharded and then sharded in-place.
        - At runtime, the parameter is lazily (re)-initialized. The sharded
        parameter data is saved in ``self._local_shard``, and a new ``Tensor``
        ``self._full_param_padded`` is created, which is the all-gather
        destination and owns the unsharded parameter storage thereafter. (See
        :meth:`FlatParamHandle.init_flat_param_attributes`.)
        - Throughout runtime, the parameter data changes storages as needed,
        e.g. to the sharded flat parameter, low precision sharded flat
        parameter, or the unsharded flat parameter.

    NOTE: Since ``use_orig_params=True`` supports intra-``FlatParameter``
    padding, we have two versions of the per-parameter numels, one that
    includes the padding (``_numels_with_padding``) and one that does not
    (``_numels``). The former may have length longer than the other data
    structures, while the latter has the same length as the number of actual
    original parameters like the other per-parameter data structures.

    NOTE: This is not a real class; instead, you will always get a Parameter
    back out if you try to create one of these.  This is similar to the trick
    we implemented for Parameter to get it to work with subclasses; this
    is primarily so that FlatParameter supports combination with FakeTensor.

    Attributes:
        _unpadded_unsharded_size (torch.Size): Unsharded flat parameter's size
            without right-hand-side padding for divisibility by the world size.
            For ``use_orig_params=True``, this includes alignment padding.
        _padded_unsharded_size (torch.Size): Unsharded flat parameter's size
            with right-hand-side padding for divisibility by the world size.
            For ``use_orig_params=True``, this includes alignment padding. This
            is only set for sharded strategies since they require padding for
            the all-gather.
        _sharded_size (torch.Size): Sharded flat parameter's size with padding.
            This is also set for ``NO_SHARD``, in which case it is the same as
            the unsharded sizes. (We omit "padded" because there is no
            analogous unpadded one.)

        _num_params (int): Number of original parameters flattened into this
            flat parameter. This is the length of the per-parameter data
            structures.
        _param_infos (Tuple[ParamInfo, ...]): Each parameter's parameter info
            entry; see :class:`ParamInfo` for details.
        _shapes (Tuple[torch.Size, ...]): Each parameter's original shape.
        _fqns (Tuple[str, ...]): Each parameter's fully-qualified name (FQN)
            prefixed from the ``_fully_sharded_module``. The names are
            guaranteed to be unique in the subtree rooted at that module.
        _param_extensions (Tuple[Optional[Any], ...]): Each parameter's
            extension (i.e. some per-parameter state) used to customize
            pre-flatten and post-unflatten behavior or ``None``. This is
            experimental, and users should not depend on its existence in the
            future.
        _numels_with_padding (Tuple[int, ...]): Each parameter's numel
            including entries for the padding. This is used to construct views
            into the flat parameter via ``torch.split()``. This may have length
            longer than ``_num_params``.
        _numels (Tuple[int, ...]): Each parameter's numel excluding entries for
            padding. This has length equal to ``_num_params``.
        _shard_param_infos (Tuple[_ShardParamInfo, ...]): Each parameter's
            shard parameter info; see :class:`_ShardParamInfo` for details.
        _shared_param_infos (Tuple[SharedParamInfo, ...]): Shared parameter
            info entries; see :class:`SharedParamInfo` for details.
        _modules (Set[nn.Module]): Modules that contain some original parameter
            that is flattened into the flat parameter.

        _shard_numel_padded (int): Numel padded for this rank's sharded flat
            parameter.
        _local_shard (Tensor): Sharded flat parameter with padding if using a
            sharded strategy. If using ``NO_SHARD``, then this is the unpadded
            unsharded flat parameter, and there is no notion of a sharded flat
            parameter or padded unsharded flat parameter.
        _full_param_padded (Tensor): Unsharded flat parameter with padding.
            This is not defined for ``NO_SHARD``. When using mixed precision
            for parameters, this has the low precision.
        _full_prec_full_param_padded (Tensor): Full precision unsharded flat
            parameter with padding. This is used for unsharding outside of
            computation when using mixed precision for parameters. This is
            never defined for ``NO_SHARD``.
        _post_backward_hook_handle (RemovableHandle):
            Flat parameter's post-backward hook handle. (Compile only)
        _post_backward_hook_state (Tuple[AccumulateGrad, RemovableHandle]):
            Flat parameter's :class:`AccumulateGrad` object and post-backward
            hook handle. (Eager only)
        _mp_shard (Tensor): Low precision sharded flat parameter with padding.
            This is only defined when parameter mixed precision is enabled. For
            ``NO_SHARD``, this is used for computation.
        _cpu_grad (Tensor): Sharded gradient with padding stored on CPU.
            This is only defined when offloading parameters is enabled.
        _saved_grad_shard (Tensor): Sharded gradient with padding from previous
            iterations for gradient accumulation without :meth:`no_sync`.

        _params (Optional[List[nn.Parameter]]): If ``use_orig_params=True``,
            then each original parameter variable; otherwise, ``None``. This
            does not include any padding tensors.
        _shared_params (Optional[List[nn.Parameter]]): The original shared
            parameter variables if ``use_orig_params=True`` and ``None``
            otherwise.
        _tensors (Optional[List[Optional[Tensor]]]): This saves the ``Tensor``
            views created in the forward and tracked by autograd when
            ``use_orig_params=True`` and is ``None`` otherwise. This is to
            preserve those ``Tensor`` variables for the backward to ensure that
            the ``FlatParameter`` 's ``AccumulateGrad`` object does not change
            in which case the post-backward hook does not run. This is relevant
            for cases like reentrant activation checkpointing.
        _is_grad_none_mask (Optional[List[bool]]): If ``use_orig_params=True``,
            a mask over the original parameters' gradients indicating if it is
            logically ``None`` or not; otherwise, ``None``. This does not
            include entries for padding. This mask is needed because only some
            of the parameters may have ``None`` gradient, in which case the
            flat gradient must be non-``None`` and must use zeros to
            approximate those original ``None`` gradients. This mask informs
            FSDP to set the original parameter gradients to ``None`` (instead
            of zeros) as needed.
    _unpadded_unsharded_size_padded_unsharded_size_sharded_size_num_params._param_infos_shapes_fqns_param_extensions_numels_with_padding_numels_shard_param_infos_shared_param_infos_modules_shard_numel_padded_local_shard_full_param_padded_full_prec_full_param_padded_post_backward_hook_state_post_backward_hook_handle	_mp_shard	_cpu_grad_saved_grad_shard_params_shared_params_tensors_is_grad_none_mask_is_padding_maskNc                     | t         u sJ d       t        j                  j                  t        j                  ||      }d|_        |S )Nz&subclasses FlatParameter not supportedT)r%   rE   	Parameter__new__r^   )clsdatarequires_gradrs       r<   r   zFlatParameter.__new__l  s?    m#M%MM#LL  t]Cr;   param_infosnumelsshapesfqnsshared_param_infosparam_extensionsparamsshared_paramsis_padding_maskreturnc                 0   t        |      t        |      k(  sJ t        |      t        |      k(  sJ t        |      t        |      k(  sJ t        |      |_        ||_        ||_        ||_        ||_        |
|_        g }t        ||
      D ]  \  }}|r	|j                  |        t        |      |_
        t        |      |_        t        |j                        |j                  k(  sJ t        |      |_        |j                  D ch c]  }|j                   c}j                  |j                  D ch c]  }|j                   c}      |_        |du |	du k(  sJ ||	t        |	      t        |      k(  sJ g |_        t        ||
      D ]#  \  }}|r	|j                   j                  |       % |	|_        t%        |j                   |j"                        D ]  }t'        |        t)        |j                        D cg c]  }d c}|_        t)        |j                        D cg c]  }d c}|_        nd|_        d|_        d|_        d|_        |j/                         |_        t'        |       d|_        yc c}w c c}w c c}w c c}w )a  
        Initialize attributes holding metadata about the original parameters comprising the flat parameter.

        We expose this method separate from the constructor to keep the
        constructor only responsible for the flat parameter's tensor data. This
        method should only be called once per model, while the constructor may
        be called multiple times, e.g. when reloading from a checkpoint, in
        which case only the tensor data needs to be passed to the constructor.
        Since :meth:`load_state_dict` is implemented via :meth:`copy_`, the
        metadata is correctly assumed to be unchanged.

        Args:
            See the Attributes in the class docstring.
        NF)lenrh   ri   rj   rk   rl   r   zipappendtuplern   rm   rp   r?   unionrq   r{   r|   r   r   ranger~   r}   sizere   _post_backward_called)r   ra   r   r   r   r   r   r   r   r   r   numels_without_paddingnumel
is_paddingpispiparam_s                     r<   _init_metadatazFlatParameter._init_metadatau  ss   8 ;3v;...;3t9,,,;3'7#8888{+'
!1 /,.!$V_!= 	5E:&--e4	5 34$)&M!4<< D$4$4444#();#< -1->->?r?EE#'#;#;<CSZZ<
 $MT$9::: ,]1Cs"H 2   DL%(%A /!z!LL''./ #0D t||T-@-@A +#E*+6;D<L<L6M&Nu&ND#+01A1A+BCaTCDMDL"&D&*D# DM(,		%D! &+"9 @<  'OCs   J8J		J.	J)NT)r2   r3   r4   rB   rY   rZ   rD   rS   r   r(   rC   r   r   rL   r)   r   rE   rF   r   r   r   rR   r   classmethodr   r:   r;   r<   r%   r%      s   yv $jj(!JJ&::	3''5::s?##c?Xc]C/00S/)38_os233344"))n"(($S#X. ##d2<<())T",,/00tHV,-.. d,,4j  J+ )_J+ S		J+
 UZZ J+ 3iJ+ !1J+ x}-J+ bll+,J+  R\\ 23J+ dJ+ 
J+ J+r;   r%   )	metaclassc                       e Zd ZdZdddeeej                  ef      dej                  de
j                  deded	ee
j                     d
ee
j                     dedej"                  dedee   f fdZd ZdefdZdeeeej                  f      dej                  dededdf
dZdeeeej                  f      defdZdee   dedefdZdee   dededefdZd	ee
j                     d
ee
j                     ddfdZ e
j>                         d        Z deded eddfd!Z!ded edee"d"f   fd#Z#e$d$ed%ed&edeeef   fd'       Z%e$d$ed%ed&edeeef   fd(       Z&e$d$ed%ed&ede
jN                  fd)       Z(deeeef      fd*Z)e*de+fd+       Z,e* e
j>                         did,              Z-defd-Z.d. Z/d/ Z0defd0Z1d1 Z2de
j                  fd2Z3d3edefd4Z4d3e
j                  ddfd5Z5d6 Z6d7 Z7 e
j>                         d8        Z8d9 Z9d: Z:d; Z;e<jz                  d<        Z>d=efd>Z?d? Z@d@ ZAdidAZBe*	 djd$ee
j                     deCe   fdB       ZDe*	 djd$ee   dee   fdC       ZEe* e
j                         dDeddfdE              ZGe*didF       ZHe<jz                  deIfdG       ZJe* e
j>                         didH              ZKe* e
j>                         didI              ZLe* e
j>                         defdJ              ZMdKee   dLedMedNe
jN                  dOedPeddfdQZNdR ZOdS ZPdT ZQdeRej                     fdUZSd$edefdVZTdeCeeUeUf      fdWZVdeCeeUeUf      fdXZWeXdeeU   fdY       ZYeXdee   fdZ       ZZdid[Z[d\ Z\d$efd]Z]d$efd^Z^e$d$efd_       Z_e$d$efd`       Z`da Zad$efdbZbd$efdcZceXdefdd       ZdeXdefde       ZeeXdefdf       ZfeXdefdg       ZgeXdefdh       Zh xZiS )kr&   a  
    A handle that manages a flat parameter (:class:`FlatParameter`).

    This includes sharding and view management.

    Args:
        params (Sequence[nn.Parameter]): The parameters to flatten into the
            flat parameter.
        fully_sharded_module (nn.Module): See [Note: Fully Sharded Module].
        device (torch.device): The compute and communication device, which
            should be a non-CPU device. We refer to it as the compute device.
        sharding_strategy (ShardingStrategy): Sharding strategy to apply to
            this handle's ``FlatParameter``.
        offload_params (bool): Whether to offload the handle's
            ``FlatParameter`` to CPU.
        mp_param_dtype (Optional[torch.dtype]): Parameter mixed precision
            setting passed to the FSDP constructor.
        mp_reduce_dtype (Optional[torch.dtype]): Gradient reduction mixed
            precision setting passed to the FSDP constructor.
        keep_low_precision_grads (bool): Whether to keep gradients in low
            precision.
        use_orig_params (bool): If ``True``, then FSDP preserves the original
            parameter variables and returns them from ``named_parameters()``
            (e.g. to support different optimizer hyperparameters within one
            :class:`FlatParameter`). If ``False``, then FSDP reconstructs the
            parameters every iteration and returns the :class:`FlatParameter` s
            from ``named_parameters()``.
    N)fsdp_extensionr   fully_sharded_moduledevicesharding_strategyoffload_paramsmp_param_dtypemp_reduce_dtypekeep_low_precision_gradsprocess_groupuse_orig_paramsr   c                t   t         |           t        |      }t        |      dk(  r#t	        d| j
                  j                   d      | j                          t        j                  j                  t        d      dk(  | _        t        j                  j                  t        d      dk(  | _        t        j                  j                  t        d      dk(  | _        t        j                  j                  t"        d      dk(  | _        | j                  rt'        t(        dt         d       | j                   rt+        t(        dt         d       | j$                  rt-        t(        dt"         d	       |
}| j/                  |       || _        t3        j4                  | j0                        | _        |	| _        | j                   s| j$                  r.t;        |	j=                         |	j?                         
      | _         |	j=                         | _        |	j?                         | _!        || _"        || _#        |
| _$        || _%        tL        jN                  | _(        tS        jT                         | _+        || _,        d | _-        d | _.        d | _/        d | _0        d| _1        d| _2        d| _3        |d   jh                  | _5        | jm                  ||       | jn                  J |rtq        | jn                        nd| _9        || _:        | jw                  ||| jr                  |
       | jy                  d       y )Nr   zCannot construct a z with an empty parameter list 1zSince z=1, FSDP will not check for parameter or gradient writeback. Changing parameter or gradient storages may lead to silent correctness errors.z=1, FSDP will not execute all-gather ops. Your training will be incorrect, but can reveal how much time spent on all-gather ops.z=1, FSDP will not execute reduce-scatter ops. Your training will be incorrect, but can reveal how much time spent on reduce-scatter ops.)rank
world_sizeF)unsharded_dtype	as_params)=super__init__listr   
ValueError	__class__r2   _init_setattr_fnsosenvironget_FSDP_SKIP_WRITEBACK_CHECK_skip_writeback_check_FSDP_USE_FULL_PREC_IN_EVAL_use_full_prec_in_eval_FSDP_USE_FAKE_ALL_GATHER_use_fake_all_gather_FSDP_USE_FAKE_REDUCE_use_fake_reduce_warn_skip_writeback_checklogger_warn_use_fake_all_gather_warn_use_fake_reduce_init_get_unflat_views_fnr   r   from_device_device_handler   r    r   r   _fake_process_groupr   _sharding_strategy_offload_params_use_orig_params_keep_low_precision_gradsr   IDLE_training_statedistget_debug_level_debug_level_fully_sharded_module'_unsharded_flat_param_for_skipped_views_handle_index_pre_forward_order_index_post_forward_index_needs_pre_forward_unshard_needs_pre_backward_unshard_prefetcheddtype_orig_param_dtype_init_param_reduce_dtypes_fwd_bwd_param_dtype_get_aligned_numel_aligned_numel_fsdp_extension_init_flat_param_and_metadata_use_unsharded_views)ra   r   r   r   r   r   r   r   r   r   r   r   align_addressesr   s                r<   r   zFlatParamHandle.__init__  s    	fv;!%dnn&=&=%>>[\  	 JJNN5r:cA 	" JJNN6;sB 	# %'JJNN3Lb$QUX$X! "

/Db IS P%%&34 5K K $$%23 4D D   !./ 0H H *&&7/;;DKKH*$$(=(='7"'')m6H6H6J(D$ "&&(	',,."3- /)A&277 002%9" JN4 -17;%26 */'+0(  "(&&~G((444  t/H/HI 	
  .**($*=*=	
 	!!E!2r;   c                     t         j                  j                  t        d      dk(  }|  |  |rt        | _        t        | _        y t        | _        t        | _        y )Nr   r   )	r   r   r   _FSDP_USE_UNSAFE_SETATTR_unsafe_setattr_tensor_setattr_tensor_unsafe_setattr_param_setattr_param_safe_setattr_tensor_or_param)ra   use_unsafe_setattrs     r<   r   z!FlatParamHandle._init_setattr_fnsM  sF    ZZ^^,DbISP#9D "7D#@D "?Dr;   r   c                 N    |r| j                   | _        y | j                  | _        y N)_get_unflat_views_aligned_get_unflat_views_unaligned_get_unflat_views)ra   r   s     r<   r   z)FlatParamHandle._init_get_unflat_views_fnX  s-      ** 	 11 	r;   r?   aligned_numelr   c                 (   t        |      dk(  rt        d      |dk  rt        d|       | j                  |      \  }}}t        |      }g }	g }
g }g }g }i }g }g }g }g }dx}}|j	                  d      D ]  \  }}t        |d      D ]  \  }}||vr||v r:||   \  }}}|j                  |       |j                  t        ||||||             J|dkD  rX|||z  z
  }|dkD  rK||k  rFt        ||d|      }|j                  |       |j                  d       |
j                  |       ||z  }t        || j                        \  }}t        t        j                  |      }|j                  |       |||f||<   |j                  |       |j                  d       |	j                  t        |||             |
j                  |j                                |j                  |j                          |r|dz   |z   n|} |j                  |        ||j                         z  }||j                         z  }  t        |      dk(  rt        d	| d
|       | j"                  dk(  r%|dkD  r ||k7  rt$        j'                  d||z
  ||       |dkD  r| j(                  || j(                  z  z
  }|dkD  rz|| j(                  k  rk| j"                  dk(  rt$        j+                  d|       t        ||d|      }|j                  |       |j                  d       |
j                  |       ||z  }| j-                  |d|      | _        t0        j3                  | j.                  |	|
|||||rt5        |      nd|rt5        |      |
       yd|
       y)a  
        Initialize the ``FlatParameter`` and its metadata.

        NOTE: This should only be called once at construction time, after which
        the ``FlatParameter`` metadata is assumed to be static.

        NOTE: The elements of ``params`` should only be ``Tensor`` s when
        composing with ``DTensor`` -based tensor parallelism, in which case the
        elements may be ``DTensor`` local shards.
        r   zExpects non-empty `params`-Expects non-negative `aligned_numel` but got F)remove_duplicate)recurseT.z2`params` were not found in `module`'s treeparams: z	
module: zLFSDP FlatParameter address alignment created %s numel of padding (%s vs. %s)zFFSDP FlatParameter world size divisibility created %s numel of padding)r   r   N)r   r   _validate_tensors_to_flattensetnamed_modulesr   r   r)   _construct_padding_tensorr#   r   r	   rE   r   r(   r   shaper   r   debugr   infoflatten_tensors_into_flat_param
flat_paramr%   r   _convert_to_params)!ra   r   r?   r   r   r   flat_param_requires_gradr   
params_setr   r   r   r   r   shared_param_memoparams_to_flattenr   r   r   total_numeltotal_numel_without_paddingsubmodule_name	submoduler>   r   rI   rJ   rH   numel_to_padpadding_tensortransform_t	extensionfqns!                                    r<   r   z-FlatParamHandle._init_flat_param_and_metadata_  s6   " v;!9::1?O  --f5		
$[
 (*#%46  	 @B;=&(&(4551)/)=)=u)=)U 3	A%NI%F5& 2A!
E 
*--EVFBK!1? "((/&--'&%*+',	 %q('4m8S'T'!+}0L-F ,eUF.N .44^D+2248"MM,7'<7K-G,,.*K !{;E$++I609>:/V%e,%,,U3#**51&&yY'WXMM%++-0MM%++. * ',z9' 
 KK$5;;=0K/5;;=@/e2A3	Ah  !Q&!(*VH6 
 IIN!::LL299+ 1  ??kDOO.KLLaL4??$B99>KK.$
 "; %" "((8&&t,l+|+)-)M)M2 *N *

 	$$OO5D01$1@}-	
 GK	
r;   tensorsc                    d}d}d}|D ]  }t        |t              rt        d      ||j                         st        d      |*|j                  |k7  rt        d| d|j                         | j
                  s||j                  |k7  rt        d      |*|j                  |k7  rt        d| d|j                         |j                  }|xs |j                  }|j                  } |J d       |||fS )	zCValidate the tensors to flatten and returns any necessary metadata.Nz Cannot flatten a `FlatParameter`z$Cannot flatten integer dtype tensorsz0Must flatten tensors with uniform dtype but got z and zNMust flatten tensors with uniform `requires_grad` when `use_orig_params=False`z5Must flatten tensors on the same device but got both z!Requires non-empty `tensors` list)r_   r%   r   is_floating_pointr   r   r   r   )ra   r  r   r  r   tensors         r<   r   z,FlatParamHandle._validate_tensors_to_flatten  s>    (,37 )- 	#F&-0 !CDD}V%=%=%? !GHH V\\U%: Fug N!<<.* 
 )),8((,DD .  !fmmv&= KheFMM?4  LLE'?'W6CWCW$]]F5	#6 (3X5XX3.66r;   c                    t        |      dk(  rt        d      |dk  rt        d|       | j                  |      \  }}}g }|dkD  rd}|D ]x  }|||z  z
  }	|	dkD  r)|	|k  r$t        |	|d|      }
|j	                  |
       ||	z  }|j	                  t        j                  t        |                   ||j                         z  }z | j                  || j                  z  z
  }	|	dkD  r_|	| j                  k  rPt        |	|d|      }
|j	                  |
       ||	z  }n+|D cg c]   }t        j                  t        |            " }}t        j                  |d      S c c}w )a   
        Flatten ``tensors`` into a single flat tensor.

        The flattening optionally includes
        padding if ``aligned_numel`` is greater than 0, where ``aligned_numel``
        gives the numel required to have address alignment.

        NOTE: The padding alignment algorithm must be kept in sync with
        :meth:`_init_flat_param_metadata`. We separate the two methods because
        the initialization happens once, whereas this method may be called
        multiple times throughout training (e.g. for checkpointing).
        r   zExpects non-empty `tensors`r   Fdim)r   r   r   r  r   rY   flatten_detach_if_neededr   r   cat)ra   r  r   r   r   r   flat_tensorsr  r  r  r  s              r<   flatten_tensorszFlatParamHandle.flatten_tensors  s   " w<1:;;1?O   <<WEq&%'1K! 	.,m0KL!#}(D%>$eUF&N !''7</K##EMM2CF2K$LMv||~-	.  ??kDOO.KLLaL4??$B!: %" ##N3|+ HO=C/78L  yy1--s   !%Er   c                 @    | j                  ||      }t        ||      S )Nr   )r   r%   )ra   r  r   r   flat_param_datas        r<   r  z/FlatParamHandle.flatten_tensors_into_flat_paramN  s#     ..wF_MJJr;   c                    |du| _         |du| _        | j                   r%| j                  s|| _        | j                  | _        n*|xs | j                  | _        |xs | j                  | _        | j                  J | j                  J y)a]  
        Initialize param and reduce dtypes.

        Precondition: ``self.flat_param`` is set. This ensures that this
        handle's parameters have a single dtype.

        Postcondition: This sets ``self._fwd_bwd_param_dtype`` and
        ``self._reduce_dtype``. If ``mp_param_dtype`` or ``mp_reduce_dtype``
        is ``None``, then we assume the original parameter dtype. One special
        case is if ``mp_param_dtype`` is not ``None`` and ``mp_reduce_dtype``
        is ``None``, in which case we assume the gradient reduction dtype
        matches the forward/backward parameter dtype.
        N)_low_prec_param_dtype_specified _low_prec_reduce_dtype_specifiedr   _reduce_dtyper   )ra   r   r   s      r<   r   z)FlatParamHandle._init_param_reduce_dtypesW  s    ( 0>T/I,0?t0K-0099 )7D%!%!:!:D(6(P$:P:PD%!0!JD4J4JD((444!!---r;   c                    | j                   }| j                  s&| j                  dd|j                         dz
         nt	        |j                         dk(  d       t        j                  || j                  | j                        \  }}t        j                  j                  j                         sB|j                         j                         dkD  }|r|j                         j!                  d       |j#                  |       |j                         | j                  z  }|j                         | j                  dz   z  dz
  }| j                  |||       | j$                  r| j'                          yy)aM  
        Shard the handle's ``FlatParameter``.

        This allocates new memory for
        the sharded flat parameter and frees the unsharded flat parameter's
        storage.

        Postcondition: ``self.flat_param`` is the sharded flat parameter. Shard
        metadata attributes are set for all sharding strategies.
        r   r!   z;The `FlatParameter` is not the sole occupant of its storageN)r  uses_sharded_strategy_init_shard_metadatar   r   storage_offsetr&   
_get_shardr   r   rY   distributed_functional_collectivesis_torchdynamo_compiling_typed_storage_size_resize_set_r   _use_sharded_views)ra   r  sharded_flat_paramnumel_padded	allocated	start_idxend_idxs          r<   shardzFlatParamHandle.shard}  s9    __
))%%aJ,<,<,>,BC))+q0M 0?/I/IDIIt0, $$<<UUW&557==?!C	--/88;OO./*002TYY>I(..0DIIMBQFG%%lIwG  ##% !r;   r6  unsharded_start_idxunsharded_end_idxc                 x   | j                   }|j                         |_        |j                         }t	        |dk\  xr ||k  d| d|        t	        ||k  d| d|        | j                  ||      }t        |      |j                  k(  s J d|j                   dt        |              ||_        ||_	        y)	a  
        Initialize shard-related metadata for this rank's shard of the flat parameter.

        This includes ``_sharded_size``, ``_shard_param_infos``, and ``_shard_numel_padded``.

        Args:
            numel_padded (int): Numel padded for this rank's sharded flat
                parameter.
            unsharded_start_idx (int): Start index in the unsharded flat
            parameter assigned to this rank.
            unsharded_end_idx (int): End index (inclusive) in the unsharded
                flat parameter assigned to this rank.

        Precondition: ``self.flat_param`` 's data is the sharded flat
        parameter.
        r   zunsharded_start_idx: z unsharded_end_idx: znumel_padded: z sharded_flat_param_numel: zExpects length 	 but got N)
r  r   rg   r   r   _get_shard_metadatar   rh   ro   rr   )ra   r6  r;  r<  r  sharded_flat_param_numelshard_param_infoss          r<   r*  z$FlatParamHandle._init_shard_metadata  s    , __
#-??#4
 #-#3#3#5 1$Q)<@Q)Q#$7#88LM^L_`	
 	44\N +))A(BD	

 !44!2
 !"j&<&<<	WZ334IcBS>T=UV	W<(9
%)5
&r;   .c                 ~   | j                         }t        |      t        | j                  j                        k(  s3J dt        | j                  j                         dt        |              g }||z
  dz   }t	        t        || j                  j                              D ]  \  }\  \  }}}	|	r||k  xr ||k\  }
|
st        ddddd      }nR||k  rd}||z
  }n||z
  }d}|dk\  r||k  sJ d| d| d	       t        ||      |z
  }||z
  dz   }t        d
||||      }|j                  |        t        |      S )z
        Compute the shard metadata based on ``unsharded_start_idx`` and ``unsharded_end_idx`` (inclusive).

        ``unsharded_start_idx`` and ``unsharded_end_idx`` give the interval of the
        unsharded flat parameter specifying the shard.
        z	Expected r>  r!   FNr   zInvalid `offset_in_shard` of z! for sharded flat parameter with z numelT)_get_flat_param_offsetsr   r  rm   	enumerater   r   rL   minr   r   )ra   r;  r<  flat_param_offsetsrA  r@  iunsharded_param_start_idxunsharded_param_end_idxr   in_sharded_flat_paramshard_param_inforP   rN   rQ   rO   s                   r<   r?  z#FlatParamHandle._get_shard_metadata  s    "99;%&#OO00+
 
 	es4????@A3OaKbJcd	e 
 46#47J#JQ#N  s-t/O/OPQ)	7 
A 
@&(?#'>> C%)BB " )#25$dD#Q &*CC -.)&?BU&UO ,.GG * '(O#q(_?W-W 4O3D E33K2LFTW /1BC/0 $ "57L!Lq!P#2#")'$  $$%56S)	7T &''r;   r  r   r   c                    t        j                  |       j                  |      }t        |      |dz   k  r|d   j	                  d      }n||   }|d   j                         |j                         z
  }|dk\  sJ d       ||fS )a  
        Return the unpadded shard of ``tensor`` for the given ``rank`` and ``world_size``.

        The returned value is a tuple of the shard of ``tensor`` without any
        padding and the numel to pad for that shard.

        If ``tensor`` is already flattened or may be viewed in the flattened
        shape (which is true in the expected usage), then this method does not
        allocate any new tensor memory.
        r!   r   z5Chunk's size should be at most the first chunk's size)rY   r  chunkr   	new_emptyr   )r  r   r   chunksrM  r  s         r<   _get_unpadded_shardz#FlatParamHandle._get_unpadded_shard	  s      v&,,Z8v;$(# 1I''*E4LEay(5;;=8A	CB	Cl""r;   c                     t         j                  | ||      \  }}|j                         }|dkD  rt        j                  |d|g      }||fS )a  
        Return the shard of ``tensor`` with padding for the given ``rank`` and ``world_size`` and the numel padded for that shard.

        This method allocates new memory (via :meth:`clone`) since the
        unsharded ``tensor`` may be deallocated after this method returns.
        r   )r&   rP  cloneFpad)r  r   r   rM  r  r:  s         r<   r,  zFlatParamHandle._get_shard&  sU     .AAD*
| !EE%!\!23El""r;   c                    t        | j                        dk(  sJ | j                          t        j                  | ||      \  }}|j	                         }t        |      dk(  sJ |        t        j                  |d   |z   g      S )z
        Return the shape of ``tensor`` after sharding including padding.

        This requires ``tensor`` to have 1D shape and ensures that the returned
        shape is 1D.
        r!   r   )r   r  r&   rP  r   rY   rZ   )r  r   r   unpadded_sharded_tensorr  unpadded_sharded_sizes         r<   _get_sharded_sizez!FlatParamHandle._get_sharded_size:  s     6<< A%8&,,8%0?0S0SD*1
- !8 < < >()Q.J3H2IJ.zz03lBCDDr;   c                     t        t        | j                  j                              }dg|dd z   }|D cg c]  }|dz
  	 }}t        t	        ||            }|S c c}w )z
        Return [start, end] offsets of each original parameter's flattened data in the unsharded flat parameter (without padding).

        NOTE: The returned list includes elements for alignment padding.
        r   Nr!   )r   r   r  rm   r   )ra   cumulative_sumstartsendendsrX   s         r<   rC  z'FlatParamHandle._get_flat_param_offsetsJ  sc     j)M)MNO~cr**#12Ca22S./ 3s   Ac                    g }g }g }g }t        | j                  j                  | j                  j                  | j                  j                  | j                  j
                        D ]n  \  }}}}|j                  s|j                  |       |j                  |       |j                  |       |j                  |j                  |j                  f       p t        t        |      t        |      t        |      t        |            S )z
        Return the shard-related metadata specific to this rank's shard of the flat parameter.

        NOTE: The returned tuple does not include elements for alignment
        padding but does account for the padding.
        )r   r  rk   rj   rn   ro   rM   r   rP   rQ   r'   r   )	ra   	fqns_listshapes_listnumels_listshard_param_offsetsr  r  r   rK  s	            r<   shard_metadatazFlatParamHandle.shard_metadataV  s     	 36OO!!OO##OO##OO..	4
 	/C/ $,,S!u%u%&&$::$88	" &)++%&	
 	
r;   c                    | j                   }|j                  | j                  k7  rW| j                  s|j                  | _        | j
                  s| j                  s|j                  | _        |j                  | _        t        j                  d      }| j                  r't        |j                  |k(  d|j                          n| j                  | j                          |j                  |_        | j                  rk|j                  j                  | j                        |_        t        j                  |j                  |      j                  | j                        |_        | j"                  rPt        j$                  |j                  | j                  | j                        |_        t)        |j&                         | j*                  r| j"                  r| j                  n|j                  }|j-                         | j.                  z  }t        j0                  || j                  |      |_        |j2                  j5                         |_        t)        |j2                         | j"                  rGt        j0                  || j                  |j                        |_        t)        |j8                         yyy)a  
        This initializes some attributes on the handle's ``FlatParameter``.
        This should be called during lazy initialization since it requires the
        parameter to be on the compute device if not offloading to CPU and we
        want to give users the chance to move the parameter appropriately after
        the FSDP constructor.

        For each tensor attribute on the ``FlatParameter``, see the unshard and
        reshard methods in this class for the allocation and free pattern.
        cpuzWExpects the `FlatParameter` to be on CPU when parameter CPU offloading is enabled, not r   r   r   N)r  r   r   r%  r   r&  r'  rY   r   r   r   _check_on_compute_devicer   rs   
pin_memory
zeros_likery   _uses_param_mixed_precision
empty_likerx   r   r)  r   r   emptyrt   r   rf   ru   )ra   r  
cpu_deviceunsharded_param_dtypepadded_unsharded_numels        r<   init_flat_param_attributesz*FlatParamHandle.init_flat_param_attributes|  sO    __
t555
 77,6,<,<) 99<<%/%5%5"%/%5%5D"\\%(
!!Z/..8.?.?-@B ))$//:",//
&0&=&=&H&H{{ 'I 'J#
 $)#3#3''
$jj,   ++
 $)#3#3''{{//$J 
 *../%%
 33 ))%% "
 &0%5%5%7$//%I",1KK&{{+-J)
 1;0M0M0R0R0TJ-*778// ;@++*;;$**;
7
 jEEF 0# &r;   c                 F   | j                   t        j                  k(  r| j                  r| j	                          d}| j
                  r| j                  s| j                         }| j                  r| j                  s| j                         sny| j                  r| j                  s| j                          d}nN| j                  rB| j                  j                  | j                  k7  r| j!                  | j                  d       d}| j#                  | j                         |S )a"  
        Return ``False`` if this is a no-op and ``True`` otherwise.

        Postcondition: ``self.flat_param`` 's data is on the device for
        communication and is what should be all-gathered. This means that it
        matches the dtype of the expected unsharded parameter.
        FTnon_blocking)r   r   SUMMON_FULL_PARAMS_skipped_use_sharded_viewsr4  r   r   _writeback_orig_paramsr)  r   needs_unshardrl  _force_full_precision_use_low_precision_shardr  r   flat_param_tori  )ra   rets     r<   pre_unshardzFlatParamHandle.pre_unshard  s       $7$J$JJ//
 ##%  )C)C--/C&&((&&(--d6P6P))+C!!doo&<&<&Kt{{>C%%doo6
r;   c                 :   | j                          | j                  }t        |j                  |j                  j                                |j                  j                  |j                  j                  | j                  d             |j                  |_	        y)z\Allocate on the compute device and switch to using the low precision sharded flat parameter.Trt  N)
_check_low_precision_shardr  r   rx   rs   r   copy_tor   r   )ra   r  s     r<   r{  z(FlatParamHandle._use_low_precision_shard  s    '')__
  *"9"9">">"@	
 	""##&&$ ' 	
 %..
r;   c                     | j                         s:| j                  r| j                         n| j                  }| j	                  |       y| j                         }| j                  |      }| j	                  |       y)a  
        Run the unshard logic.

        This includes all-gathering the flat parameter
        and switching to using the unsharded flat parameter. If the handle does
        not need unsharding, then this only switches to using the unsharded
        flat parameter. For ``NO_SHARD``, this is a no-op.

        If FSDP is in :meth:`summon_full_params` and the handle uses parameter
        mixed precision, then the parameter is forced to full precision.
        N)ry  r)   _get_padded_unsharded_flat_paramr  _use_unsharded_flat_param"_alloc_padded_unsharded_flat_param_all_gather_flat_param)ra   unsharded_flat_parampadded_unsharded_flat_params      r<   unshardzFlatParamHandle.unshard	  sy     !!#
 -- 557__ !
 **+?@#FFH&*&A&ABV&W#&&'BCr;   c                 v    | j                   sy| j                         }t        ||j                               }| S )z<Return if the handle's flat parameter needs to be unsharded.F)r)  r  _same_storage_sizer   )ra   r  already_unshardeds      r<   ry  zFlatParamHandle.needs_unshard#  sB    ))#DDF. "6"<"<">
 %$$r;   c                     | j                          | j                  }| j                         }| j                  |       t	        ||j
                         |S )a(  
        Allocate the *padded* unsharded flat parameter.

        The unpadded unsharded
        flat parameter is always a view into the padded one. This padded
        parameter is saved to a different attribute on the ``FlatParameter``
        depending on if we force full precision.
        )_check_sharded_strategyr  r  _check_storage_freedr   rf   ra   r  r  s      r<   r  z2FlatParamHandle._alloc_padded_unsharded_flat_param-  sN     	$$&__
#DDF!!"67+Z-N-NO##r;   c                    | j                          | j                  }| j                  r| j                  r~|j                  }t        |j                  | j                  k7  d| j                          |j                  j                         j                         dkD  rt        |j                         |S |j                  }|S )z
        Return a reference to the padded unsharded flat parameter depending on the calling context.

        This should only be called if using a sharded strategy.
        zExpects full precision but got r   )r  r  rz  rl  ru   r   r   r   rt   untyped_storager   r   r  s      r<   r  z0FlatParamHandle._get_padded_unsharded_flat_param=  s     	$$&__
%%$*J*J $.#J#J $**d.G.GG1$2K2K1LM ,,<<>CCEIj;;< $# $.#@#@ ##r;   r  c                    t        t        | d      xr t        | d      d       | j                  j                  }|j	                         | j
                  z  }t        |j	                         |k(  d| d|j	                                 | j                  r| j                  n| j                  }|j                  rKt        t        j                  |t        j                  |                  }t        j                  |||       nt        j                   |||       | j"                  r$t%        || j&                  j)                                |S )z
        All-gather the handle's flat parameter to the destination ``padded_unsharded_flat_param``.

        Then switch to use the all-gathered tensor.
        r   r   zEExpects a process group and world size to have been set via `shard()`zExpects z numel but got group)r   hasattrr  r   r   r   r   r   r   is_cpur   rY   rM  r   get_world_size
all_gatherall_gather_into_tensorr   r   r   current_stream)ra   r  r5  expected_numelpgtensor_lists         r<   r  z&FlatParamHandle._all_gather_flat_paramZ  s2    	D/*Jwt\/JS	
 "__11+113dooE'--/>A~&o6Q6W6W6Y5Z[	
 (( $$## 	 $$79L9LR9PQK OOK);2F''+"  '"##224 +*r;   c                 ~   | j                   j                  }|d|j                          }|| j                   _        | j                  t
        j                  k(  }| j                  t
        j                  k(  }| j                  r(| j                  r|ry| j                  | xr |        y|r| j                  d       yy)z
        Switch to use the *unpadded* unsharded flat parameter.

        This is a view into the *padded* unsharded flat parameter.
        Nr   F)r  re   r   r   r   r   FORWARDBACKWARD_PREr   rw  r   )ra   r  unsharded_sizeflat_param_part
in_forwardin_pre_backwards         r<   r  z)FlatParamHandle._use_unsharded_flat_param  s     AA56N8L8L8NO.))-@-H-HH
..2E2R2RR  ..? 
 %%)>A/.A &  %%%6 r;   c                     | j                   r| j                  r| j                          | j                  | j                         y)zo
        Run the post-unshard logic.

        This includes freeing the low precision shard if needed.
        N)rl  r)  !_free_low_precision_sharded_paramri  r  ra   s    r<   post_unshardzFlatParamHandle.post_unshard  s3     ++0J0J224%%doo6r;   c                     | j                          t        | j                  j                  | j                  j                                t        | j                  j                         y)z/Frees the low precision sharded flat parameter.N)r  r   r  rx   r   r  r   r  s    r<   r  z1FlatParamHandle._free_low_precision_sharded_param  sJ    '') 	#OO%%t':':'I'I'K	
 	doo//0r;   c                 Z   | j                   s| j                          y| j                  }| j                  |       t	        j
                  dt        j                  | j                        }|j                  du |d<   t        j                  || j                         |d   | j                  k(  rd|_        | j                          y|j                  }| j                  t        j                  j                   k(  r#t#        j$                  d| j&                   d       d|_        t	        j
                  |j(                  | j                        }n8| j+                  |j                         |j                  |_        |j                  }t	        j,                  |j.                  | j                  |j0                  	      }t        j2                  ||| j                         | j                  j4                  }|d|j7                          j9                  |      |_        | j                          y)
a  
        Unshard the handle's ``FlatParameter``'s gradient.

        If all ranks have
        ``None`` gradient, then all original parameters will as well. This
        method performs an all-reduce and an all-gather. The additional
        all-reduce is tolerable since this method is not meant to be used on
        the computation critical path.

        Postcondition: ``_saved_grad_shard`` is defined and contains the value
        to set ``flat_param.grad`` after gradients are resharded.
        Nr!   )r   r   r   r  [Rank z] Only some but not all ranks have a `None` `FlatParameter` gradient, so FSDP is using zeros to approximate those ranks' sharded gradients being `None`rg  rh  )r)  _use_unsharded_grad_viewsr  _check_unshardedrY   zerosint32r   gradr   
all_reducer   r   rz   r   
DebugLevelINFOwarningswarnr   rg   _check_shardedrn  rf   r   r  re   r   view)ra   r  num_grad_nonesharded_gradpadded_unsharded_gradr  s         r<   unshard_gradzFlatParamHandle.unshard_grad  s    ))**,__
j) AU[[M%??d2aT-?-?@t.+/J(**,??"   DOO$8$88TYYK (N N
 ,0J( ;;z'?'?TL
0+5??J(%77L %--;;$$!

 	##!<1C1C	
 AA/0H.2F2F2HINN

 	&&(r;   c                     | j                   r| j                          | j                  sy | j                  j                  | j                  _        t        | j                  d       y )Nrz   )r   _use_sharded_grad_viewsr)  r  rz   r  delattrr  s    r<   reshard_gradzFlatParamHandle.reshard_grad  sH      ((*))#@@!45r;   c                 |   t        | j                  t        j                  t        j                  fv d       | j
                  }|j                  |j                  j                         |j                  k7  s$|j                  j                  |j                  k7  r| j                  | j
                         |j                  j                  | j                  k7  }t        | xs | j                  d| j                   d|j                  j                          |j                  j                         |j                  j                         k(  }|r|s(|j                  j                  |_        |j                  }n"t        t        |d      d       |j                   }|j                  j"                  }| j$                  rw|j"                  |k7  rh|j'                  |      |_        nQ|j(                  }t        |j                  j                         |k(  d| d|j                  j                                 d|_        yyy)	z
        Prepare the gradient for the backward computation.

        This is done by saving and clearing any existing sharded gradient
        in ``.grad`` to enable computing a new unsharded gradient.
        z:Expects to be in `BACKWARD_PRE` or `IDLE` (if prefetching)Nz&Expects the sharded gradient to be on r>  ry   z7`_cpu_grad` should be defined if the gradient is on CPUzFExpects `.grad` to be the unsharded gradient in `no_sync()` with size z but got size )r   r   r   r  r   r  r  r   re   r   ri  r   rs   r   rz   r  ry   r   r   r  rf   )ra   r  grad_offloadedprev_iter_synced_gradientsr  local_shard_dtypepadded_unsharded_sizes          r<   prepare_gradient_for_backwardz-FlatParamHandle.prepare_gradient_for_backward  s     	  #002E2J2JKLH	

 __
??&OO  "j&I&II%%):)::))$//:'__33t{{BN"":d&:&:8 F%??1124 $$&**//12 ' * &3=??3G3GJ0#-#?#?L
K8Q $.#7#7L %/$;$;$A$A!22$**.??(48I(JL%(2(I(I%OO((*.CC--B,C D$$.OO$8$8$:#;= #JOc ; 'r;   c                 D     fd} j                   }t        |d      r< j                  |        j                  |       |j                  |_         ||       nt        |d      r{ j                  |        j                  |       |j                   j                  |j                         |j                  rL|j                  |_        |j
                  / ||       n&t         j                   xs |j                   d       t        |d      rt        |d       yy)ziPrepare the gradient for optimizer computation by moving the sharded gradient to the ``.grad`` attribute.c                 T   j                   sj                  rt        | j                  d ud       | j                  j                  j
                  k7  rR| j                  j                  j
                        | j                  _        j                  rj                          y y y y y )NzUnexpected None grad!)
rz  r   r   r  r   r   r  r   r   r  )r  ra   s    r<   "cast_grad_to_param_dtype_if_neededzVFlatParamHandle.prepare_gradient_for_optim.<locals>.cast_grad_to_param_dtype_if_neededJ  s    --$2P2P*//57NO??((D,E,EE+5??+=+=d>W>W+XJOO(,,446 - F 3Q-r;   ry   rz   NzcAll sharded parameters that received a gradient in the post-backward should use `_saved_grad_shard`)r  r  r  _check_on_cpury   r  ri  rz   r   r   r)  r  )ra   r  r  s   `  r<   prepare_gradient_for_optimz*FlatParamHandle.prepare_gradient_for_optimG  s   	7 __
 :{+
+z*(22JO.z:Z!45
+))*5++7--j.J.JK //",">">
??.6zB... 8!777? :23J 34 4r;   c           
   #   $  K   | j                          t        | j                  j                         | j                  j                  k(  d| j                  j                   d| j                  j                                 | j                  | j                         t        t        | j                  | j                               d       | j                  t        j                  d             | j                          	 d t        | j                  j                         | j                  j                  k(  d| j                  j                   d| j                  j                                 | j                         }|d| j                  j                          j                  | j                         | j                  |       y# t        | j                  j                         | j                  j                  k(  d| j                  j                   d| j                  j                                 | j                         }|d| j                  j                          j                  | j                         | j                  |       w xY ww)aS  
        Move the unpadded unsharded flat parameter to CPU while in the context and moves it back to the previous device upon exit.

        For now, this assumes the ``FlatParameter`` is the unpadded unsharded flat parameter
        since (1) there is no reason to include the padding in the copy and (2)
        there is no use case for the sharded flat parameter.

        Precondition: ``self.flat_param`` 's data is the unpadded unsharded
        flat parameter on the compute device, and the handle uses a sharded
        strategy.
        Postcondition: Same as the precondition.
        zExpects size r>  zEExpects the unpadded parameter to be a view into the padded parameterrf  N)r  r   r  r   re   ri  _same_storager  r|  rY   r   _free_unsharded_flat_paramr  r   r  r  )ra   r  s     r<   to_cpuzFlatParamHandle.to_cpus  s     	$$&OO  "doo&N&NNDOODDEYtOcOcOeNfg	
 	%%doo6
 	$//4+P+P+RSS	
 	5<<./'')	H$$&$//*R*RR H HISWSbSbSgSgSiRjk +/*Q*Q*S''(A$//*?*?*ABHH **+FG $$&$//*R*RR H HISWSbSbSgSgSiRjk +/*Q*Q*S''(A$//*?*?*ABHH **+FGs    C;J>G CJCJJfree_unsharded_flat_paramc                 J    | j                          |r| j                          yy)a~  
        Run the reshard logic.

        This includes freeing the unsharded flat
        parameter if ``free_unsharded_flat_param`` and switching to using the
        sharded flat parameter. Note that this also implicitly offloads
        the sharded flat parameter (if CPU offload is enabled) by pointing
        it to the ``_local_shard`` attribute which resides on CPU.
        N)_use_sharded_flat_paramr  )ra   r  s     r<   reshardzFlatParamHandle.reshard  s$     	$$&$++- %r;   c                 r    | j                   r+| j                  s| j                  s| j                          yyyy)aC  
        Run the post-reshard logic.

        This includes freeing any memory that
        can now be freed given that the ``FlatParameter`` points to the full
        precision sharded flat parameter.

        Precondition: ``self.flat_param`` 's data points to the full precision
        sharded flat parameter.
        N)rl  r)  rz  r  r  s    r<   post_reshardzFlatParamHandle.post_reshard  s:     ,,....224 / / -r;   c                     | j                          | j                         }| j                  |       t        || j                  j                                t        |       y)a1  
        Free the padded unsharded flat parameter. We allow this
        function to be called even when storage is not allocated

        The tensor to free depends
        on the calling context since the unshard may have forced full
        precision, in which case a different tensor is used.
        N)r  r  ri  r   r   r  r   )ra   r  s     r<   r  z*FlatParamHandle._free_unsharded_flat_param  sU     	$$&#DDF%%&:;" $"5"5"D"D"F	
 	*+r;   c                    | j                   }| j                  rW| j                  t        j                  k(  }t        j                         xr |xr | j                  t        v }|r|j                  }| j                  r;|j                  j                  }t        |t        j                  d      k(  d|        |j                  |_	        | j                  rr| _        n| j                          rr| j                   se|j"                  duxr1 | j$                  xr# |j"                  j&                  |j(                  k(  }|r| j+                          y| j-                          yyyy)z-Switches to using the sharded flat parameter.rf  z-Expects the local shard to be on CPU but got N)r  r   r   r   r  rY   is_grad_enabledr   *NO_RESHARD_AFTER_FORWARD_HANDLE_STRATEGIESr   r   rs   r   r   r   r4  rw  r  r)  r  re   r  r  )ra   r  r  skip_use_sharded_viewsr  r   accumulated_grad_in_no_syncs          r<   r  z'FlatParamHandle._use_sharded_flat_param  sQ   __
  --1D1L1LLJ%%' >>++=> # &'1$,,33F%,,u--?xH %11
  %?S<'')  77
 OO4/ U22U"--1T1TT ,
 /224002 8	  !r;   c                       j                   }||} fdt        t        j                  ||j                  d      |j
                  |j                        D        }|S )a-  
        Return unflattened ``Tensor`` views into ``tensor``.

        If `tensor`` is ``None``,  ``flat_param`` is used. The unflattening is based
        on ``flat_param`` 's metadata.

        Examples for ``tensor`` include ``flat_param.grad`` or unsharded
        tensor optimizer state.
        c              3   r   K   | ].  \  }}}t        |j                  |      |j                         0 y wr   )r"   r  r   ).0	subtensorr  param_extensionra   s       r<   	<genexpr>z>FlatParamHandle._get_unflat_views_unaligned.<locals>.<genexpr>  s>      
 4E? *u%$$
s   47r   r  )r  r   rY   splitrn   rj   rl   )ra   r  r  viewss   `   r<   r   z+FlatParamHandle._get_unflat_views_unaligned  s]     __
>F
 8;FJ$6$6A>"",,8
 r;   c           	      \   | j                   }||}t        j                  ||j                  d      }d}g }t	        ||j
                        D ]\  \  }}|r	|j                  t        |j                  |j                  |         |j                  |   | j                               |dz  }^ |S )a=  
        Return unflattened ``Tensor`` views into ``tensor`` with handling for padding.

        This method has the same contract as :meth:`_get_unflat_views_unaligned`
        except it checks for ``None`` placeholders representing padding for
        alignment, which may incur slightly more CPU overhead.
        r   r  r!   )r  rY   r  rm   r   r   r   r"   r  rj   rl   r   )ra   r  r  splitsidxr  r  r   s           r<   r   z)FlatParamHandle._get_unflat_views_aligned+  s     __
>F${{J33 
  !$VZ-H-H!I 
	E:LL-JJz11#67005(( 1HC
	 r;   r   c           
         | j                   }| j                  |       | j                         }ddlm} t        t        ||j                              D ]  \  }\  }\  }}}	| j                  rv|rtt        |      |u r3| j                  ||t        j                  ||j                               \| j                   j                  |   }
| j                  |||
       ||
_        |r3| j                  ||t        j                  ||j                               |}| j                  rv| j                   t"        j$                  k(  r|| j                   j&                  |<   n?| j                   t"        j(                  k(  r"| j                   j&                  |   }||_        |}| j+                  |||       | j                  sj| j                   t"        j$                  k(  s||j,                  |<    t        | j                   j.                        D ]  \  }\  }}}	}}}	t1        ||      }t3        | xs t5        |t        j                        d| dt        |              | j                  r6|r4| j                   j6                  |   }| j                  |||       ||_        |r| j                  |||       | j+                  |||       | j                  s| j                   t"        j$                  k(  s||j,                  |<    y)a  
        Unflatten the unsharded flat parameter by setting the original parameter variables to be views into it.

        Args:
            as_params (bool): If ``True``, then registers the original
                parameters as ``nn.Parameter`` s; if ``False``, then registers
                the original parameters only as ``Tensor`` s. ``False`` should
                be used during forward/backward computation and when hiding the
                original parameters from :meth:`nn.Module.named_parameters`.

        Note:
            when prefetching for next forward, current forward may be
            annotated with `@torch.no_grad()`
            `@torch.enable_grad()` ensures non-empty `view.grad_fn`
            otherwise `_post_backward_hook` will not get called
        r   DTensorr"  z
as_params=z type(prim_param)=N)r  r  r   torch.distributed.tensorr  rD  r   ri   r   typer   rE   r   r   r{   r   r   r   r  r}   r  r   _parametersrp   r`   r   r_   r|   )ra   r   r  r  r  rG  r  r>   r?   r   r   	param_varr  rH   rI   
prim_paramshared_params                    r<   r   z$FlatParamHandle._use_unsharded_viewsL  s   & __
j)&&(42;z../3
 )	?.A.-z61 $$:( ''"T9Q9QR
 //2##FJ>!
##LLZ5M5MN %)	((++/B/J/JJ6:003--1D1Q1QQ "&!9!9!!<&*$*	$$VZC)),,0C0K0KK5>F&&z2S)	?b t::;	@ 
A 
6=_7J EJ!EYK'9$z:J9KL $$#==a@##FJE$.!##FJ
C$$VZD)),,0C0K0KK5?F&&z27	@r;   c                    | j                   j                  At        | j                   j                  | j                   j                        D ]	  }d|_         y| j                  | j                   j                         | j                  | j                   j                        }t        t        || j                   j                              D ]  \  }\  }\  }}}t        t        ||      | j                   j                  |    d       t        ||      }|j                  |j                  k7  s2|j                  |j                  k7  s|j                   |j                   k7  r8|j                  t#        j$                  |      |_        ||j                  _        ||_         t        | j                   j(                        D ]  \  }\  }}}}	}
}t        t        ||      |r|dz   |z   n| d       t        ||      }t        |
|	      }|j                  |j                  j                  k7  sF|j                  |j                  j                  k7  s#|j                   |j                  j                   k7  rB|j                  t#        j$                  |      |_        |j                  |j                  _        |j                  |_         y)z
        Unflatten the unsharded flat parameter's gradient.

        The original parameter variables' gradients are set to be views into
        the unsharded flat parameter's gradient.
        Nz is missingr   )r  r  r   r{   r|   r  r   rD  r   ri   r   r  rk   r`   r  r   r   rY   rm  r   rp   )ra   r   r  rG  r  r>   r?   r   r@   rH   rI   r  s               r<   r  z)FlatParamHandle._use_unsharded_grad_views  sL    ??'t668V8VW "!
"doo223&&t';';<2;t3343
 	".A.-z61 
+??((+,K8 FJ/Etzz);;$**,<<4;;. ::%!&!1!1%!8EJ"&

!
7	"F t::;	- 
A 

+5@;$z1jQQ\] FJ/E o>Jz444;;*//"7"77<<:??#9#99 ::%!&!1!1%!8EJ",//

'__
3	-r;   c              #      K   | j                  d       	 d | j                  d       y# | j                  d       w xY ww)a`  
        Unflatten the original parameters.

        The function assumes that the flat parameter is unsharded. When in the context,
        unflattens the original parameters as ``nn.Parameter`` views into the
        flat parameter, and after the context, restores the original parameters
        as ``Tensor`` views into the flat parameter.
        Tr   NF)r   r  s    r<   unflatten_as_paramsz#FlatParamHandle.unflatten_as_params  sC      	!!D!1	7%%%6D%%%6s   A- AAAc                    d| _         | j                  s| j                  d       y| j                  }| j	                  |       t        j                  d| j                  j                  | j                  j                  d      }t        |j                  |j                  |j                        D ]V  \  }}\  }}}| j                  |||       |j                  s||_        2|j                   }|j"                  }	||||	z    |_        X | j                  j$                  J t'        t        | j                  j$                  | j                  j(                              D ]5  \  }
\  }\  }}}}}}| j                  |||       t+        ||      }||_        7 | j,                  t.        j0                  k(  rGt3        t5        | j                  j6                              D ]  }
d| j                  j6                  |
<    yy)a,  
        Set the original parameter variables' data to be flattened views into the sharded flat parameter.

        The views are kept as flattened to simplify the case where a parameter
        is sharded across ranks. Parameters whose data is not present in the
        sharded flat parameter have their data set to a size-0 empty tensor. We
        do not delete them to ensure to preserve expected behaviors like model
        printability. Parameters whose data is present must preserve their
        variables to be passable to an optimizer.
        NTr   r   F)r   r   r   )r   r)  r   r  r  rY   rn  r   r   r   r{   ro   ri   r   rM   r   rN   rO   r|   rD  rp   r`   r   r   BACKWARD_POSTr   r   r}   )ra   r  size_0_empty_tensorr   rK  r>   r?   r   offsetrO   rG  rH   rI   r  s                 r<   r4  z"FlatParamHandle._use_sharded_views  s    8<4)) %%%5__
J'#kk//''??))	
 AD
 = =z?V?VA
 
	J<E#%<j&! 
E:#,,0
)99!1!@!@'.1HI

	J --999 ..0S0ST
	$ 
A 
DZO[! 
E: o>J#EJ	$ #6#D#DD3t7789 3.2((+3 Er;   c                    | j                   }| j                  |       | j                  }|-t        |j                  |j
                        D ]	  }d|_         y| j                  |       t        |j                  |j                  |j                        D ]  \  }}}|j                  sd|_        |j                  }|j                  r|s|j                  }| j                  s|j                  |j                  k7  rW|j                  t!        j"                  |      |_        ||||z    j%                  |j&                        |j                  _        ||||z    j%                  |j&                        |_        d|_         |j
                  J t+        t        |j
                  |j,                              D ]N  \  }\  }\  }	}	}	}
}}	t/        ||
      }|r*|j                  rt1        ||
      }|j                  |_        Hd|_        P y)a  
        Set the original parameter variables' gradients to be flattened views into the sharded flat parameter's gradient.

        This is a no-op if there is no gradient.

        Parameters whose data is not present in the sharded flat parameter and
        parameters with ``requires_grad=False`` have their gradients set to
        ``None``. Since the gradient variables do not need to be preserved,
        this method does not manipulate existing ``Tensor`` data directly and
        creates new ``Tensor`` variables instead.
        N)r  r  r  r   r{   r|   r  r   ro   r~   rM   rO   r   rN   r   r   rY   rm  reshaper  r   rD  rp   r  r`   )ra   r  r  r   rK  is_grad_nonerO   r  rG  r   rH   rI   rJ  r  s                 r<   r  z'FlatParamHandle._use_sharded_grad_views7  s    __
J'  <z11:3L3LM "!
"D!58))))6
 	&1E#\
 $,,!
!1!@!@&&|-==F55

9R !::-).)9)9%)@EJ*."Vn%<+!'%++. 

 &*&6N3J%K%S%S!KK&
 "&EJ5	&6 ((444FO
)):+I+IJG
 	"BABA1a+q %,K$I!$)<)<$[/B
'__
!
	"r;   c           	      N   | j                   r(| j                  | j                        s| j                  sy| j                  }d}| j                  r.| j                   r"| j                  }t        t        |      d       n|}| j                   s| j                  s|j                  n|j                  }t        t        |j                  |j                  |j                              D ]  \  }\  }\  }}}	}
}
\  }}}
|st        ||      s%| j                  r-|j                   |   }t        |dud|j"                  |           t%        ||      |u}|xs t'        ||       }| j                  r|s|rt)        d| j*                         |rt%        ||      }||j                  |<   |r.t-        j.                  |	g      }| j1                  |||||d       d}| j                  r|j                  D|j                  8t-        j.                  |	g      }| j1                  d|j                  |||d       O|j                  ]| j                   s| j                  rw|du xs t'        |j                  |       }|s|t-        j2                  |      }t-        j.                  |	g      }| j1                  |j                  ||||d       ||_        |j                  } t        |j4                        D ]/  \  }\  }}}
}}}
t%        ||      t%        ||      us&t7        d       |S )as  
        Write back any parameters that changed storage to the handle's ``FlatParameter``.

        Iterates over the original parameters and writes back any parameters
        that changed storages (due to a non-inplace operator) to the handle's
        ``FlatParameter``. This method preserves the ``FlatParameter` 's
        device even if an original parameter's device changes.

        Raises:
            RuntimeError: If an original parameter or gradient changes storages
            but no longer has the expected flattened shape.
        Returns: ``True`` if some writeback happened, and ``False`` otherwise.
        FzPIf skipped using sharded views, the unsharded flat parameter should be allocatedNz!Expects to have saved tensor for zOFSDP does not support changing the parameters between forward and backward for Tz/Changing shared parameters is not supported yet)r)  
is_shardedr  rw  r   r   r   r   r  ry   rD  r   r{   ro   ri   r  r}   rk   r`   r  AssertionErrorr   rY   rZ   _writeback_tensorrk  rp   NotImplementedError)ra   r  	wrotebackflat_param_tensorflat_param_gradrG  r   rM   rN   rO   r   r>   r?   param_changedneeds_param_writebackexpected_shapeneeds_grad_writebackrH   rI   s                      r<   rx  z&FlatParamHandle._writeback_orig_paramss  sx   " &&OODOO433 __
	**t/I/I
 !% L L#$56& !+ ))1E1E OO%% 	 ""--''
	U	6 
A 
=X1#Z 6:.  .."++A.%7
8H8H8K7LM $FJ7uDM ?$U,=>> " ..!6$0040G0G/HJ  
3(-
""1%$!&^,<!=&&:q./4 !	 .. zz!joo&A!&^,<!=&&*//1nou ' 11d6J6J
 '6$'> (mJJG C$ (&.*/*:*::*F%*ZZ0@%AN**

'&' '6JO&0ooOkU	6@ z556	 
A 
vz*'+2WW)E 	 r;   
src_tensor
dst_tensortensor_indexr  r  is_paramc                    t        t        |      dk(  d|        | j                  t        j                  j
                  k(  rt        | d      r| j                  nt        j                         }||j                  nd}||j                  nd}	t        j                  d| d|rdnd d	| j                   d
| d| d|j                   d|	        |1|j                  |k7  r"t        d|rdnd d| d|j                         |&||||j                         z    j!                  |       y||||j                         z    j#                          | j$                  j&                  J d| j$                  j&                  |<   y)a  
        Write back ``src_tensor`` to ``dst_tensor`` at offset ``offset``, where ``src_tensor`` should have shape ``expected_shape``.

        ``is_param`` indicates if the tensor is the parameter (if ``True``) or gradient (if
        ``False``). If ``src_tensor`` is ``None``, then the effect is zeroing
        instead of copying. ``tensor_index`` gives the index of ``src_tensor``
        in the metadata structures.

        Raises:
            RuntimeError: If the ``src_tensor`` does not have the expected
            shape.
        r!   z$Expects a 1D expected shape but got r   Nr  z] r   Gradientz needs writeback in z
expected shape=z shape=z expected device=z device=zCannot writeback when the 	parametergradientz shape changes
Expects r>  T)r   r   r   r   r  r  r  r   get_rankr  r   r  r  r   RuntimeErrorr   r  zero_r  r~   )
ra   r  r  r	  r  r  r
  r   	src_shape
src_devices
             r<   r  z!FlatParamHandle._writeback_tensor		  s   * 	1$2>2BC	
  4 44 'f 54994==?D,6,B
((I.8.D**$JMMbj I J  $ 4 45 6""0!1 D##-#4#4"5Xj\K !j&6&6.&H ,H[*,U V**8)9:CSCSBTV  !v)=)=)? ?@FFzRv)=)=)? ?@FFH??55AAA?CDOO..|<r;   c                     | j                   sy| j                  }|j                  J d}d}|j                  D ]"  }||j                  du z  }||j                  z  }$ |rd|_        ||_        y)a  
        Reset ``flat_param.grad`` if needed.

        When ``use_orig_params=True``:
        (1) sets the underlying ``flat_param.grad`` to ``None`` if *all* of the
        original parameters' ``.grad`` are ``None``, and
        (2) sets ``flat_param.requires_grad=False`` if *none* of the original
        parameters require gradient.
        For (1), this is targeting ``optim.zero_grad(set_to_none=True)``, in
        which case we want to free the gradients as soon after the
        ``zero_grad()`` call as possible.
        NTF)r   r  r{   r  r   )ra   r  all_grad_noner   r   s        r<   %_reset_flat_param_grad_info_if_neededz5FlatParamHandle._reset_flat_param_grad_info_if_needed;	  s     $$__
!!---'' 	1EUZZ4//MU000M	1 "JO $1
 r;   c                     | j                   j                  D ]!  }|\  }}}t        ||      st        ||       # | j                   j                  D ]"  \  }}}}}}t        ||      st        ||       $ y r   )r  ri   r  r  rp   )ra   
param_infor>   r?   r   s        r<   _deregister_orig_paramsz'FlatParamHandle._deregister_orig_paramsW	  sw    //66 	,J$.!Jvz*
+	, /3oo.Q.Q 	,*J1avz*
+	,r;   c                      | j                   j                  |i || j                   _        | j                  r?| j	                  | j                         r| j                          y| j                  d       yy)z;Wrap an in-place call to ``.to()`` for ``self.flat_param``.Tr   N)r  r  r   r   r  r4  r   )ra   argskwargss      r<   r|  zFlatParamHandle.flat_param_toc	  s_    1t114B6B  t/'')))D)9 !r;   c                     | j                   j                  D ch c]  }|j                   c}j                  | j                   j                  D ch c]  }|j                   c}      S c c}w c c}w )zcReturn a :class:`set` of the modules whose parameters are included in this handle's flat parameter.)r  ri   r?   r   rp   )ra   r   r   s      r<   _get_moduleszFlatParamHandle._get_modulesm	  sP    $(OO$@$@Ab		AGG#'??#F#FGCSZZG
 	
AGs   A*A/c                     t        | j                  d      r| j                  sy| j                  j                  }|j	                         |k(  S )z
        Return whether ``tensor`` is *currently* sharded.

        For ``NO_SHARD``, we choose to have this always return ``False`` for clarity.
        rg   F)r  r  r)  rg   r   )ra   r  sharded_sizes      r<   r  zFlatParamHandle.is_shardeds	  s@     9-- 44{{},,r;   c              #      K   | j                   j                  D cg c]  \  }}}}}}t        |||       }}}}}t        | j                   j                  |      D ]  }|\  }}}||f  y c c}}}}w wr   )r  rp   r(   r   ri   )ra   r>   r?   r@   r   r   r  s          r<   param_module_namesz"FlatParamHandle.param_module_names	  s      44

 

 j&+6

 

   < <>PQ 	,J)3&J;{++	,

s   A8A0
?A8c              #      K   | j                   j                  D cg c]  \  }}}}}}t        |||       c}}}}D ]  \  }}}||f  y c c}}}}w wr   )r  rp   r(   )ra   r>   r?   r@   r   s        r<   shared_param_module_namesz)FlatParamHandle.shared_param_module_names	  so      44
+
 
+
 j&+6
+
 	,&J; {++	, 
+
s   AA
Ac                     g }t        | j                  j                  | j                  j                        D ]#  \  }}|j                  s|j                  |       % |S )z?Return the FQNs of the parameters present in this rank's shard.)r   r  rk   ro   rM   r   )ra   fqns_in_shardr  rK  s       r<   _fqns_in_shardzFlatParamHandle._fqns_in_shard	  s]     $&%(OO!!4??#E#E&
 	*!C!  (($$S)		*
 r;   c                 H   | j                   }t        |d      r|j                  }|S t        |d      r|j                  }|S t	        |j
                  du xs; | j                   xs, | j                  t        j                  t        j                  fv d       |j
                  }|S )z%Return the handle's sharded gradient.ry   rz   NzZSharded strategies should use `_cpu_grad` or `_saved_grad_shard` unless in IDLE or FORWARD)r  r  ry   rz   r   r  r)  r   r   r  r   )ra   r  r  s      r<   r  zFlatParamHandle.sharded_grad	  s     __
 :{+''D& % Z!45 //D  4' K111K'''//1D1I1IJK, ??Dr;   c                 .   | j                   syt        | j                  t        j                  k(  d       | j
                  }|j                  J t        |j                        D ]/  \  }}|j                  s|j                  J d|j                  |<   1 y)a2  
        Reset ``_is_grad_none_mask`` as needed.

        This method should only be
        called in the post-backward after gradient computation, in which case
        if a parameter requires gradient, then it will surely receive a
        gradient and we may reset its mask entry to ``False``.
        NzIExpects to only be called in the post-backward after gradient computationF)
r   r   r   r   r  r  r{   rD  r   r~   )ra   r  rG  r   s       r<   _reset_is_grad_nonez#FlatParamHandle._reset_is_grad_none	  s     $$  $7$E$EEW	
 __
!!---!*"4"45 	9HAu ""!44@@@38
--a0	9r;   c                 0    t        | j                  d       y )NzExpects sharded strategy)r   r)  r  s    r<   r  z'FlatParamHandle._check_sharded_strategy	  s    $,,.HIr;   c                 ~    t        |j                  | j                  k(  d| j                   d|j                          y )Nz+Expects tensor to be on the compute device z	, was on )r   r   ra   r  s     r<   ri  z(FlatParamHandle._check_on_compute_device	  s5    MMT[[(9$++iPVP]P]_	
r;   c                 v    t        |j                  t        j                  d      k(  d|j                          y )Nrf  z$Expects tensor to be on CPU but got )r   r   rY   r-  s     r<   r  zFlatParamHandle._check_on_cpu	  s-    MMU\\%0026==/B	
r;   c                     t         j                  j                  j                         st	        t        | d      d       y y )Nr   z9Expects storage to be freed but got storage with size > 0)rY   r-  r.  r/  r   r  r  s    r<   r  z$FlatParamHandle._check_storage_freed	  s7       88QQS"61-K Tr;   c                 .    t        t        |       d       y )NzExpects storage to be allocated)r   _storage_size_allocatedr0  s    r<   _check_storage_allocatedz(FlatParamHandle._check_storage_allocated	  s    )&13TUr;   c                    t        | j                  d       t        t        | j                  dd       d ud       | j                  j                  j
                  }t        || j
                  k(  d| j
                   d|        y )Nz&Not using low precision for parametersrx   zExpects `_mp_shard` to existz)Expects the low precision shard to be on r>  )r   rl  r`   r  rx   r   )ra   r   s     r<   r  z*FlatParamHandle._check_low_precision_shard
  sw    ,,4	
 	DOO[$7tC*	
 **11dkk!7}IfXV	
r;   c           	          d}t        |d u|dz          | j                  j                  }t        |j                         |k(  |d| d|j                          z          y )NzExpects tensor to be unsharded but got `None`
with size r>  )r   r  re   r   )ra   r  
msg_prefixr  s       r<   r  z FlatParamHandle._check_unsharded
  s]    6
&$j3C&CDAAKKM^+:n%5Yv{{}oNN	
r;   c           	          d}t        |d u|dz          | j                  j                  }t        |j                         |k(  |d| d|j                          z          y )NzExpects tensor to be sharded r6  r7  r>  )r   r  rg   r   )ra   r  r8  r   s       r<   r  zFlatParamHandle._check_sharded
  s\    4
&$j3C&CD44KKM\):l^9V[[]OLL	
r;   c                 <    | j                   t        j                  k7  S r   )r   r*   r7   r  s    r<   r)  z%FlatParamHandle.uses_sharded_strategy%
  s    &&*@*I*IIIr;   c                 4    | j                   | j                  k7  S r   )r   r   r  s    r<   rl  z+FlatParamHandle._uses_param_mixed_precision)
  s    ((D,B,BBBr;   c                 4    | j                   | j                  k7  S r   )r'  r   r  s    r<   _uses_reduce_mixed_precisionz,FlatParamHandle._uses_reduce_mixed_precision-
  s    !!T%;%;;;r;   c                     | j                   xs | j                  xrD | j                  t        j                  k(  xs% | j
                  j                   xr | j                  S r   )rl  r=  r   r   rv  r   trainingr   r  s    r<   rz  z%FlatParamHandle._force_full_precision1
  s^     ,,Q0Q0Q
   $7$J$JJ V ++444T9T9T	
r;   c                     | j                   duS )a?  
        This property is used for sharding strategies that do not free after forward with ``use_orig_params=True``.

        This returns if this handle is
        currently in a state where it has skipped using sharded views, in which
        case it can restore view invariants via ``_use_sharded_views()``.
        N)r   r  s    r<   rw  z*FlatParamHandle._skipped_use_sharded_views<
  s     ;;4GGr;   )r   Nr   )jr2   r3   r4   rB   r   r   rE   r   r   rF   rY   r   r*   rR   r   r   r   ProcessGroupr$   r   r   r   r   rS   r   r   r   r   r%   r  r   no_gradr:  r*  rL   r?  staticmethodrP  r,  rZ   rX  rC  r   r'   rd  rr  r~  r{  r  ry  r  r  r  r  r  r  r  r  r  r  
contextlibcontextmanagerr  r  r  r  r  r   r   r   enable_gradr   r  r   r  r4  r  rx  r  r  r  r|  r   r  r  rC   r"  r$  propertyr'  r  r*  r  ri  r  r  r3  r  r  r  r)  rl  r=  rz  rw  __classcell__)r   s   @r<   r&   r&     s   Z 48g3r||V345g3 !iig3 	g3
 2g3 g3 !-g3 "%++.g3 #'g3 ((g3 g3 !0g3R	@
 
T
U62<</01T
 		T
 	T

 T
 
T
l%7E&",,"678%7	%7N0.f0. 0. 
	0.dKfK K 	K
 
K!. -!. "%++.!. 
	!.L U]]_& &B)6)6 !)6 	)6
 
)6V=( =( =( 
#	$	=(~ ### # 
vs{		# #8 ### # 
vs{		# #& E& E E E E E
eCHo)> 
 #
	#
 #
J U]]_SG  SGp!T !F/ D4%t %$ $%,, $:/+%+/+ 
/+b7%*\\7 
7>71 U]]_7) 7)r6@#D*5X )H )HV. .$5*,$/3h  *.& 
&	 <  $(  
f @ U[@d [@t [@  [@z C- C-J 7Y 7 7 U]]_23  23h U]]_8"  8"t U]]_R R  Rh0DV$0D 0D 	0D
 

0D 0D 0D 
0Dd18,:
c"))n 
- -D -,HU38_$= , ,8E#s(O+D , S	   hv.  <96J
v 

F 
 V   V V V

v 

V 
 Jt J J CT C C <d < < 
t 
 
 HD H Hr;   r&   r?   r>   r   r   c                 ^    || j                   |<   t        t        j                  |   ||       y r   )r  r   rE   rF   __setattr__)r?   r>   r   s      r<   r   r   I
  s+     &+Fz" 
"))V(U;r;   r  c                 x    | j                   j                  |d        t        t        j                  |   ||       y r   )r  popr   rE   rF   rJ  )r?   r>   r  s      r<   r   r   R
  s/    
:t, 
"))V(V<r;   tensor_or_paramc                 N    t        | |      rt        | |       t        | ||       y r   )r  r  setattr)r?   r>   rM  s      r<   r   r   Y
  s$     vz"
#FJ0r;   r  c                     | D cg c]3  }t        |t        j                        r|nt        j                  |      5 c}S c c}w r   )r_   rE   r   )r  ts     r<   r  r  b
  s4     LSSaAr||,A",,q/ASSSs   8A param_or_tensorc                 Z    t        | t        j                        r| j                         S | S r   )r_   rE   r   detach)rR  s    r<   r  r  h
  s/     or||4 	  r;   r   c                 *    d}t        |       }||z  }|S )N   )_get_dtype_size)r   	ALIGNMENTunsharded_dtype_sizer   s       r<   r   r   p
  s#    I*?;!55Mr;      c                 L    t        j                  d|       j                         S )Nr:   r   )rY   rn  element_sizer\  s    r<   rW  rW  x
  s    ;;r'4466r;   padding_numelr   r   r   c                 D    t        j                  | f|||      t        z  S )N)r   r   r   )rY   ones_FLAT_PARAM_PADDING_VALUE)r^  r   r   r   s       r<   r  r  }
  s+     	

Ev	
 $	$r;   logwarningc                 .    t         j                  |       y r   r   rc  rb  rc  s     r<   r   r   
      
NN7r;   c                 .    t         j                  |       y r   re  rf  s     r<   r   r   
  rg  r;   c                 .    t         j                  |       y r   re  rf  s     r<   r   r   
  rg  r;   c                     ddl m} t        | |      r| j                  } t        ||      r|j                  }| j	                         j                         |j	                         j                         k(  S )Nr   r  )r  r  r_   _local_tensorr  data_ptr)abr  s      r<   r  r  
  s\     1!WOO!WOO'')Q->->-@-I-I-KKKr;   rm  rn  c                 f    | j                         j                         | j                         z  |k(  S r   )r  r   r]  )rm  rn  s     r<   r  r  
  s+    ##%)99Q>>r;   c                 H    | j                         j                         }|dkD  S )Nr   )r  r   )r  storage_sizes     r<   r2  r2  
  s$    ..0557L!r;   )frD  	functoolsloggingr   r  enumr   r   	itertoolsr   r   typingr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   rY   torch.distributedr-  r   torch.nnrE   torch.nn.functional
functionalrS  r   $torch.distributed.fsdp._common_utilsr   r   r   r   r   torch.distributed.utilsr   r   r   r   torch.nn.parameterr   +torch.testing._internal.distributed.fake_pgr    _fsdp_extensionsr"   r#   r$   __all__	getLoggerr2   r   r   r   r   ra  r   r   r*   r5   r8   'RESHARD_AFTER_FORWARD_HANDLE_STRATEGIESr6   r9   r  r(   r)   rL   r'   r\   r   r%   r&   rF   rC   r   r   r   r  r  r   r   	lru_cacherW  rS   rR   r   r  Loggerr   r   r   r  r  r2  r:   r;   r<   <module>r     sW      	   '   "          . H  
		8	$4 5  9  ;    7 . !T ! %%''+ '
 ((... *
 j $'j '/Z /,
 
o+BLL,> o+dB"H B"HLD<II<#&</1||<	<=299 =# =v =RV =1II1#&19>vr||?S9T1T%bll234T	",,TuR\\6-A'B v   Q7 7

${{
;?
IN
 QGNN S  
 Q7>> C  
 Qw~~   	L?%,, ?3 ?F r;   