
import copy
import functools
import logging
import warnings
from contextlib import ExitStack
from dataclasses import dataclass, field
from typing import (
    Any,
    cast,
    Dict,
    Iterable,
    Iterator,
    List,
    NamedTuple,
    no_type_check,
    Optional,
    Sequence,
    Set,
    Tuple,
    TYPE_CHECKING,
    Union,
)

import torch
import torch.distributed as dist
import torch.distributed.fsdp._traversal_utils as traversal_utils
import torch.nn as nn
from torch.distributed._state_dict_utils import _gather_state_dict
from torch.distributed.distributed_c10d import _get_pg_default_device
from torch.distributed.fsdp._common_utils import (
    _apply_to_modules,
    _FSDPState,
    _get_module_fsdp_state_if_fully_sharded_module,
    _get_param_to_fqns,
    _module_handle,
    _named_parameters_with_duplicates,
    clean_tensor_name,
)
from torch.distributed.fsdp._debug_utils import SimpleProfiler
from torch.distributed.fsdp._flat_param import FlatParameter, FlatParamHandle
from torch.distributed.fsdp._fsdp_extensions import (
    _ext_chunk_dtensor,
    _ext_chunk_tensor,
)
from torch.distributed.fsdp._runtime_utils import (
    _lazy_init,
    _reset_flat_param_grad_info_if_needed,
)
from torch.distributed.fsdp.api import (
    ShardingStrategy,
    StateDictSettings,
    StateDictType,
)
from torch.distributed.tensor import DTensor, Replicate
from torch.utils._pytree import tree_map_only


if TYPE_CHECKING:
    from torch.distributed._shard.sharded_tensor import ShardedTensor


logger = logging.getLogger(__name__)


@dataclass
class FSDPParamInfo:
    state: _FSDPState
    handle: FlatParamHandle
    param_indices: Dict[str, int]
    param_requires_grad: List[bool]


def sorted_items(dictionary: Dict[str, Any]) -> Iterator[Tuple[str, Any]]:
    keys = sorted(dictionary.keys())
    for k in keys:
        yield k, dictionary[k]


@dataclass
class _ConsolidatedOptimState:
    """
    This holds the consolidated optimizer state on the target rank. Positive-
    dimension tensor state is communicated across ranks, while zero-dimension
    tensor state and non-tensor state is taken directly from the target rank.

    PyTorch version 1.12 moved to using zero-dimension tensors for scalar
    values, but user implemented optimizers may still use float (i.e. a
    non-tensor). Thus, we support both and handle them identically.

    Attributes:
        tensor_state (Dict[str, torch.Tensor]): Mapping from positive-dimension
            tensor state name to the unsharded flat tensor representing the
            state.
        zero_dim_tensor_state (Dict[str, torch.Tensor]): Mapping from zero-
            dimension tensor state name to its value.
        non_tensor_state (Dict[str, Any]): Mapping from non-tensor state
            name to its value.
    """

    tensor_state: Dict[str, torch.Tensor] = field(default_factory=dict)
    zero_dim_tensor_state: Dict[str, torch.Tensor] = field(default_factory=dict)
    non_tensor_state: Dict[str, Any] = field(default_factory=dict)


class _PosDimTensorInfo(NamedTuple):
    """
    Metadata for positive-dimension tensors used internally for
    :meth:`scatter_full_optim_state_dict`.

    Attributes:
        shape (torch.Size): Sharded tensor shape (which is equal to the
            unsharded tensor shape if the tensor is optimizer state for a
            non-FSDP parameter and is hence not sharded).
        dtype (torch.dtype): Data type of the tensor.
    """

    shape: torch.Size
    dtype: torch.dtype


class _OptimStateKey(NamedTuple):
    """
    This represents an optimizer state key that may be used commonly across
    ranks. It is based on the unflattened parameter names rather than parameter
    IDs to make it independent of each rank's own optimizer construction.
    """

    unflat_param_names: Tuple[str, ...]
    is_fsdp_managed: bool
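
# Illustrative example (hypothetical FQNs): a flat parameter that flattens
# ``lin.weight`` and ``lin.bias`` is identified across ranks by
#
#   _OptimStateKey(unflat_param_names=("lin.weight", "lin.bias"), is_fsdp_managed=True)
#
# so that the mapping does not depend on any rank's local parameter IDs.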


def _unflatten_optim_state(
    fsdp_param_info: FSDPParamInfo,
    flat_param_state: Dict[str, Any],
    to_save: bool,
    shard_state: bool,
    cpu_offload: bool,
) -> List[Dict[str, Any]]:
    """
    Unflattens the optimizer state, consisting of the "state" part and the
    "param_groups" part. Unflattening the "state" part involves consolidating
    the state on the target rank and remapping from flattened to unflattened
    parameter IDs, and the "param_groups" part only involves remapping from
    flattened to unflattened parameter IDs.

    Args:
        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
            mapping from FQN to original parameter index.
        flat_param_state (Dict[str, Any]): Entry for the flat parameter in the
            "state" part of the optimizer state dict.
        to_save (bool): Whether to save the state on this rank.

    Returns:
        List[Dict[str, Any]]: A :class:`list` holding the entries in the
        "state" part of the optimizer state dict corresponding to the
        unflattened parameters comprising the flat parameter if on the target
        rank or an empty :class:`list` otherwise. The final optimizer state
        dict will need to map these entries using the proper unflattened
        parameter IDs.
    """
    assert (
        not shard_state or to_save
    ), "If ``shard_state`` is True, ``to_save`` has to be True."
    consolidated_state = _communicate_optim_state(
        fsdp_param_info,
        flat_param_state,
    )
    if to_save:
        unflat_param_state = _unflatten_communicated_optim_state(
            fsdp_param_info,
            consolidated_state,
            shard_state,
        )
        for optim_state in unflat_param_state:
            if cpu_offload:
                # Iterate over a copy of the keys since the values are
                # replaced in place
                for key in list(optim_state.keys()):
                    state = optim_state[key]
                    if not isinstance(state, torch.Tensor):
                        continue
                    optim_state[key] = state.cpu()
        return unflat_param_state
    else:
        return []


def _is_zero_dim_tensor(x: Any) -> bool:
    return torch.is_tensor(x) and x.dim() == 0


def _communicate_optim_state(
    fsdp_param_info: FSDPParamInfo,
    flat_param_state: Dict[str, Any],
) -> _ConsolidatedOptimState:
    """
    Communicates the optimizer state for a flat parameter across ranks. All
    ranks will hold the entire non-sharded optimizer state on GPU.

    If ``N`` is the number of tensor optimizer states in the optimizer state
    dict, then the communication complexity is 0 if ``N = 0`` and ``N + 1``
    otherwise (where the plus 1 comes from all-gathering the padding per rank).

    Args:
        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
            mapping from FQN to original parameter index.
        flat_param_state (Dict[str, Any]): The entry in the "state" part of the
            optimizer state dict corresponding to the flat parameter.

    Returns:
        ConsolidatedOptimState: Consolidated optimizer state for the target
        flat parameter.
    """
    fsdp_state = fsdp_param_info.state
    flat_param = fsdp_param_info.handle.flat_param
    state = _ConsolidatedOptimState()
    tensor_state, zero_dim_tensor_state, non_tensor_state = (
        state.tensor_state,
        state.zero_dim_tensor_state,
        state.non_tensor_state,
    )

    for state_name, value in sorted_items(flat_param_state):
        # Positive-dimension tensor state: communicate across ranks
        if torch.is_tensor(value) and value.dim() > 0:
            # If the parameter is not sharded, then neither is the
            # positive-dimension tensor state, so take this rank's value
            if (
                fsdp_state.world_size == 1
                or fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD
            ):
                tensor_state[state_name] = value
                continue
            assert (
                fsdp_state.compute_device is not None
            ), "compute_device has not been initialized"
            if value.device.type != fsdp_state.compute_device.type:
                value = value.to(fsdp_state.compute_device)
            # Assume that positive-dimension tensor optimizer state has the
            # same (padded) shape as the sharded flat parameter
            buffer_size = flat_param._full_param_padded.size()  # type: ignore[attr-defined]
            tensor_buffer = value.new_zeros(*buffer_size)
            dist.all_gather_into_tensor(
                tensor_buffer, value, group=fsdp_state.process_group
            )
            fsdp_state._device_handle.synchronize()
            unpadded_numel = cast(
                nn.Parameter, flat_param._unpadded_unsharded_size
            ).numel()
            tensor_state[state_name] = tensor_buffer[:unpadded_numel]
        # Zero-dimension tensor state and non-tensor state: take this rank's
        # value directly
        else:
            if _is_zero_dim_tensor(value):
                zero_dim_tensor_state[state_name] = value.detach().clone()
            else:
                non_tensor_state[state_name] = value
    return state


def _unflatten_communicated_optim_state(
    fsdp_param_info: FSDPParamInfo,
    state: _ConsolidatedOptimState,
    shard_state: bool,
) -> List[Dict[str, Any]]:
    """
    Unflattens the communicated optimizer state (given by ``tensor_state``,
    ``non_tensor_state``, and ``zero_dim_tensor_state``) for a single flat
    parameter. This should only be called on the target rank.

    Args:
        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
            mapping from FQN to original parameter index.
        state (_ConsolidatedOptimState): Consolidated optimizer state.

    Returns:
        List[Dict[str, Any]]: A :class:`list` holding the entries in the
        "state" part of the optimizer state dict corresponding to the
        unflattened parameters comprising the flat parameter. The final
        optimizer state dict will need to map these entries using the proper
        unflattened parameter IDs.
    _use_dtensorF)r-   r.   rp   _num_paramsrF   rG   rH   rangerB   _get_unflat_viewsnext_optim_state_dict_configgetattr_device_meshr    rank_fsdp_extensionr}   r!   rq   r~   device_countappend)rV   r-   rY   r   r.   rp   rb   flat_param_viewsnum_unflat_paramsrF   rG   rH   _unflat_state_paramr   flat_tensorviews_generatedviewsrc   
osd_configzero_dim_tensor
non_tensors                         r:   r]   r]     s   , !&&J##F""J/1,."..## *:'L $% '6'3L'A 	9#J(,<<O"00=/4 ,(4GKE{K'@@
:~u=%22>>>"4#""//"22	#K &33???"3#""--"11>>@"00"22#K .9z*9	9> ,88M+N 	='J-<z*	= '33C&D 	8"J
-7z*	8!!"45O'6P r9   r   rc   ro   c                     d g}t        j                  |      dk(  rt        t        j                  d |      |d<   t        j
                  |d|       t        j                  |      dk(  r|S |d   S )Nr   c                     | j                         dk(  r| j                         S t        | j                  | j                        S rh   )rj   r`   rN   rO   rP   )vs    r:   <lambda>z,_broadcast_processed_state.<locals>.<lambda>\  s,    Aaeeg 3DQWWagg3V r9   srcro   )r{   get_rankr)   rK   rL   broadcast_object_list)r   rc   ro   objectss       r:   _broadcast_processed_stater   S  sl    
 G}}Uq "LLV


 	wAU;}}Uq qzr9   c                    t        j                  |      dk(  rKt        |t        j                        r|j                         dk(  r|S |j                  | j                        }n~t        |t        j                        r|j                         dk(  sJ d       |S t        |t              s|S t        j                  |j                  |j                  | j                        }t        j                  |d|       |S )Nr   zlFor non-zero ranks, a tensor state should have zero dimension, but got the state with shape {state.shape()}.rP   ru   r   )r{   r   r_   rK   rL   rj   rw   rt   rN   zerosrO   rP   	broadcast)r   r-   ro   tensors       r:   _broadcast_stater   f  s     }}Uq %.%))+2BL*334eU\\*99;!# @# LE#45LKKu{{:3L3L
 	NN6q.Mr9   fqnc                 .   |si S | j                   }| j                  j                  }| j                  |   }|j                  |   }t        ||j                  |j                        }|j                  si S i }|j                  }|j                  }	|j                         D ]s  \  }
}t        j                  |      rT|j                         dkD  rA|j                  t         j"                  k7  r$|j%                         ||	dz    j'                         }|||
<   u |S )z
    Shard the optimizer state for the original parameter with the name ``fqn``.
    This API should only be used when ``use_orig_params`` is True.
    pgru   r   rm   )r-   r.   rp   r/   _shard_param_infosr   r}   rt   in_shardintra_param_start_idxintra_param_end_idxitemsrK   ri   rj   rr   r$   rs   flattenr   )rV   r   rc   r   rp   	param_idxshard_param_infonew_optim_stater   r   r   r   s               r:   _shard_orig_param_stater   }  s    	 &&J ''22J--c2I!44Y?$
009R9RK $$	&(O,BB*>>(..0 ,
EOOE"		a,,0@0I0IIMMO$9<ORS<STZZ\E&+
#, r9   optim_state_dictmodeluse_orig_paramsoptim
rank0_onlyc                    t        j                          | }d|vr|st        d      t        |      }t	        |      }t        t        |j                                     j                  }	|rt        |	||      }i }
|d   }t        |j                               }|j                         D ]  \  }}|d   }||vr|j                  |       |rD|D ]:  }||   s	||   j                         D ]  }t        |	||   |   |      ||   |<    < |d   }||v r||   }|rGt        j                  t         j                   j"                        5  t%        ||||         }ddd       nt'        |||      }t)        t+        |      d      }r||
|<   n|ryt-        |      dk(  sJ d| d	       ||j                  j/                  |d      }|t1        j2                  |      |
|<   nvt5        j6                  d
| d|	j8                   d	       nOt;        d| d      t-        |      dk(  sJ t)        t+        |      d      }t1        j0                  ||         |
|<   |s|D ]^  }||   s	t=        ||   j                               D ]7  \  }}|	j8                  dkD  r||   |= ||   |   j?                         ||   |<   9 `  |D ]K  }||   }tA        |tB        jD                        r|r|rt        |	||      }t1        j0                  |      |
|<   M t        jF                  d       d|v rt1        j2                  |d         }|
|dS d|
iS # 1 sw Y   xY w)a  
    Flattens the full optimizer state dict, still keying by unflattened parameter
    names.

    If ``use_orig_params`` is True, each rank will have all FSDP-managed
    parameters but some of these parameters may be empty due to the sharding.
    For a regular optim.Optimizer, states for those empty parameters will
    not be initialized. So, when aggregating the FQNs across ranks, no assert
    will be raised on a rank even if it does not have all the states -- it is
    valid and FSDP knows how to aggregate them. However, FSDP has to ignore
    handling those parameters that are not managed by FSDP and do not exist on
    the local rank -- it is managed by other parallelism and FSDP does not
    know how to handle/aggregate them.

    Note that ``_flatten_tensor_optim_state`` does not need ``optim`` to
    flatten/shard the state. However, NamedOptimizer and KeyedOptimizer require
    all the states even if the corresponding parameters are empty. To this end,
    ``optim`` will be used to get the initial state of the empty parameters.
    ``optim`` should only be non-None if the ``optim`` is KeyedOptimizer or
    NamedOptimizer.

    Returns:
        Dict[str, Any]: The flattened optimizer state dict.
    r-   zO`optim_state_dict` must have the keys "state"to be a valid optimizer state dictrn   r   NTrm   z5use_orig_params is True but there are multiple FQNs, .zoptim_state[z] is not on rankzThe state of z8 is empty. This should happen when use_orig_params=True.Fz,FSDP _flatten_optim_state_dict() profiling: param_groupsr-   r   )$r   reset
ValueErrorr   _get_fqn_to_fsdp_param_infor   itervaluesr-   r   setr@   r   difference_updater   profileType
RESHARDINGr   _flatten_optim_staterS   tuplelengetcopydeepcopywarningswarnr   RuntimeErrorr^   r`   r_   rK   rL   dump_and_reset)r   r   r   r   r   ro   
unflat_osdparam_to_fqnsfqn_to_fsdp_param_infor   flat_osd_stateunflat_osd_stateall_state_keysparamfqnsr   r   rV   
flat_staterd   r-   param_state
user_stateflat_osd_param_groupss                           r:   _flatten_optim_state_dictr     s   @ !Jj 1
 	
 'u-M8?d188:;<BBJ /
JeT
 =?N!'*)..01N$**, H t1g&&((. ',"23"7"<"<"> J8H"$4S$9*$EU9$S)*5 q'C((4S9O#++N,?,?,J,JK !8'(-"J  2#$

 !td3C &0s# INSJ4&PQRS"$!KKOOE48E(.2mmE.Bs+ *3%/?
?PPQR
 ##C5 ), , 
 t9>!> te4C"&)),<S,A"BN3  ',/34DS4I4O4O4Q/R 	 +J!*,S1*= =MS<Q&=#% )-j9	  yH V  4%c*
j%,,/J?)*jNJ"ii
3s	4 !!"PQ # $j.H I'9NOO((O s   MM	r   rT   c           	      \   | j                   }| j                  }|j                  }t        |      }|dkD  sJ d       |j                  }t        |      }||k(  sJ d| d|        |D 	cg c]  }	t        |	|v        }
}	t        |
      si S |D 	cg c]-  }	|	|v r%t        ||	   |j                  |j                        nd/ }}	d}|D ]K  }||t        |j                               }"|t        |j                               k7  s?t        d|        |J i }|D ]q  }|D cg c]  }|||   nd }}|D cg c]  }||	 }}|sd||<   4dx}x}}|D ]V  }|t        j                  |      xr |j                         dkD  z  }|t!        |      z  }|t        j                  |       z  }X |D ch c]  }t#        |       }}t        |      d	k7  s|s|s|st        d
| d| d|       |rst%        |||||      }|j&                  d	k7  rL|j(                  t*        j,                  k7  r/t/        j0                  ||j2                  |j&                        \  }}n|}|||<   J|rt5        |||      ||<   ^|sJ t7        |||      ||<   t |S c c}	w c c}	w c c}w c c}w c c}w )ao  
    Flattens the optimizer state in ``full_optim_state_dict`` for a single
    flat parameter in ``fsdp_param_info`` corresponding to the unflattened
    parameter names in ``unflat_param_names``.

    Args:
        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
            mapping from FQN to original parameter index.
        unflat_osd_state (Dict[str, Dict[str, Any]]): The "state" part of the
            optimizer state dict corresponding to the unflattened parameters.
        unflat_param_names (List[str]): A :class:`list` of unflattened
            parameter names corresponding to the flat parameter ``flat_param``.

    Returns:
        Dict[str, Any]: A :class:`dict` mapping state names to their values for
        a particular flat parameter. The sharded optimizer state dict's "state"
        part will map a key to this returned value.
    r   zNExpects at least one unflattened parameter corresponding to the flat parameterzExpects z shapes but got r   Nz@Differing optimizer state names for the unflattened parameters: Trm   z*Differing optimizer state types for state z	, values z", and unflattened parameter names )r-   r.   rp   r   _shapesr7   anyr   r}   rt   r   r@   r   rK   ri   rj   rk   rv   _flatten_tensor_optim_staterq   rr   r$   rs   r   
_get_shardr   $_flatten_zero_dim_tensor_optim_state_flatten_non_tensor_optim_state)rV   r   rT   r   r.   rp   r   unflat_param_shapesnum_unflat_param_shapesunflat_param_name	has_stateunflat_param_statesstate_namesrb   r   r   state_valuesr   non_none_state_valuesare_pos_dim_tensorsare_zero_dim_tensorsare_non_tensorstypesr   sharded_flat_tensorr   s                             r:   r   r   1  s   . !&&J##F""J./q  	  %,,!"5644O	#$$45L4MNO4 "4 	"223I  y>	 "4	   00 	./'',,	
 		 	 K1 
%05578Kc"4"9"9";<< ##5"68 
 """ "$J! :
 ':
" /A.Lz*RVV
 
 -9 JqAM J J$%)Jz"GKKK2_& 	6A5??1#5#E!%%'A+E $7$:: 5??1#555O	6 #88Qa88u:?#7?<ZL I/0 1+,. 
 5"#K %%*004D4M4MM)8)C)COO))*&#Q '2#%8Jz"!%I"&Jz" #"?%D"&Jz"m:x C	8
 !K 9s$   )J2J&J<J$J$7J)r   pos_dim_tensorsr   r.   c                 X   |j                   }|D cg c]  }||	 }}|D ch c]  }|j                   }}t        |      dk7  rt        d| d|  d|       t	        t        |            }	t        ||      D ]M  \  }
}|
t        |      dk(  rt        d      |
$|
j                  |k7  s4t        d|
j                   d|        t        j                  d	      }t        ||      D cg c]V  \  }}|$t        j                  |j                  |            n*t        j                  t        j                  ||	|
            X }}}|j                  ||j                        }|j                  }|j                  |k(  sJ d|j                   d|        |S c c}w c c}w c c}}w )aA  
    Flattens the positive-dimension tensor optimizer state given by the values
    ``tensors`` for the state ``state_name`` for a single flat parameter
    from ``handle`` corresponding to the unflattened parameter names
    ``unflat_param_names`` and unflatted parameter shapes
    ``unflat_param_shapes``. This flattens each unflattened parameter's tensor
    state into one tensor.

    NOTE: We use zero tensors for any unflattened parameters without state
    since some value is required to fill those entries. This assumes that the
    zero tensor is mathematically equivalent to having no state, which is true
    for Adam's "exp_avg" and "exp_avg_sq" but may not be true for all
    optimizers.

    Args:
        state_name (str): Optimizer state name.
        pos_dim_tensors (List[torch.Tensor]): Positive-dimension tensor
            optimizer state values for the unflattened parameters corresponding
            to the single flat parameter.
        unflat_param_names (List[str]): A :class:`list` of unflattened
            parameter names corresponding to the single flat parameter.
        unflat_param_shapes (List[torch.Size]): Unflattened parameter shapes
            corresponding to the single flat parameter.
        handle (FlatParamHandle): The flat parameter's handle.

    Returns:
        torch.Tensor: A flat tensor containing the optimizer state
        corresponding to ``state_name`` constructed by concatenating the
        unflattened parameter tensor states in ``pos_dim_tensors`` (using zero
        tensors for any unflattened parameters without the state).
    rm   zAll unflattened parameters comprising a single flat parameter must have positive-dimension tensor state with the same dtype but got dtypes  for state ! and unflattened parameter names r   z6Flattening a zero-dimension parameter is not supportedzBTensor optimizer state does not have same shape as its parameter:  r`   )ry   rP   ru   ztensor optim state: z flat parameter: )rp   rP   r   r   r   r   ziprO   rK   ru   r   rw   r   flatten_tensors_aligned_numelr   )r   r   rT   r   r.   rp   tnon_none_tensorsdtypesrP   r   rO   
cpu_devicestate_valuetensors_to_flattenr   flat_param_shapes                    r:   r   r     s   L ""J#2DaamDD/0!agg0F0
6{a))/J< H++=*>@
 	
 fE_.AB >c%jAoUVVFLLE$9$ll^1UG5 	 e$J #&o7J"K K " 	knnZ01]]KK!
	
  (();V=R=RSK!:: 00 
{001 2+,	.0 W E0.s   FFF!.AF&zero_dim_tensorsc           
         |D cg c]  }||	 }}|D ch c]  }||j                         nd }}|D ch c]  }||j                  nd }}t        |      t        |      k7  st        |      dk7  st        |      dk7  rt        d| d| d|  d|       t	        t        |            }t	        t        |            }t        j                  ||t        j                  d            S c c}w c c}w c c}w )	a  
    Flattens the zero-dimension tensor optimizer state given by the values
    ``zero_dim_tensors`` for the state ``state_name`` for a single flat
    parameter corresponding to the unflattened parameter names
    ``unflat_param_names`` by enforcing that all tensors are the same and using
    that common value.

    NOTE: The requirement that the tensors are the same across all unflattened
    parameters comprising the flat parameter is needed to maintain the
    invariant that FSDP performs the same computation as its non-sharded
    equivalent. This means that none of the unflattened parameters can be
    missing this state since imposing a value may differ from having no value.
    For example, for Adam's "step", no value means maximum bias correction,
    while having some positive value means less bias correction.

    Args:
        state_name (str): Optimizer state name.
        zero_dim_tensors (List[torch.Tensor]): Zero-dimension optimizer state
            for the unflattened parameters corresponding to the single
            flat parameter.
        unflat_param_names (List[str]): A :class:`list` of unflattened
            parameter names corresponding to the single flat parameter.

    Returns:
        torch.Tensor: A zero-dimensional tensor giving the value of the state
        ``state_name`` for all unflattened parameters corresponding to the
        names ``unflat_param_names``.
    Nrm   All unflattened parameters comprising a single flat parameter must have scalar state with the same value and dtype but got values z and dtypes r  r  r`   r   )	itemrP   r   r   r   r   rK   r   ru   )	r   r  rT   r  r  
values_setr	  r   rP   s	            r:   r   r     s   B $4Eaq}EE?OP!am!&&(5PJP:JKQaggD0KFK%5!66z?av;!(\fX[l;!"	$
 	
 j!"EfE<<U5<<3FGG% FPKs   C.C.C3C8non_tensorsc                     |D cg c]  }||	 }}t        |      }t        |      t        |      k7  st        |      dk7  rt        d| d|  d|       t        t	        |            }|S c c}w )a  
    Flattens the non-tensor optimizer state given by the values ``non_tensors``
    for the state ``state_name`` for a single flat parameter corresponding
    to the unflattened parameter names ``unflat_param_names`` by enforcing that
    all values are the same and using that common value.

    See the note in :func:`_flatten_zero_dim_tensor_optim_state`.

    Args:
        state_name (str): Optimizer state name.
        non_tensors (List[Any]): Non-tensor optimizer state for the unflattened
            parameters corresponding to the single flat parameter.
        unflat_param_names (List[str]): A :class:`list` of unflattened
            parameter names corresponding to the single flat parameter.

    Returns:
        Any: A non-tensor giving the value of the state ``state_name`` for all
        unflattened parameters corresponding to the names
        ``unflat_param_names``.
    rm   r  r  z" and  unflattened parameter names )r   r   r   r   r   )r   r  rT   ntnon_none_non_tensorsnon_tensor_setr   s          r:   r   r   F  s    2 *5G2BGG%N
 C$44N8Kq8P,-[ E++=*>@
 	
 d>*+J Hs
   A-A-sharded_osdoptim_inputusing_optim_inputis_named_optimizerc           
         t        |      }t        |      }t        t        t        j
                  t        t        t        f   f   |rt        ||      nt        |||||            }t        |      t        |      k  sJ i }	i }
|j                         D ])  \  }}||vr||   }||	t        |      <   |D ]  }||
|<   	 + | d   }i }|j                         D ]F  \  }}t        |t              r|||<   |	j                  |j                   |j                         }|||<   H d| v r[g }| d   D ]L  }t#        j$                  |      }t'        |d   D ch c]  }|
|   	 c}      }||d<   |j)                  |       N ||dS d|iS c c}w )a  
    Rekeys the optimizer state dict from unflattened parameter names to flat
    parameter IDs according to the calling rank's ``optim``, which may be
    different across ranks. In particular, the unflattened parameter names are
    represented as :class:`_OptimStateKey` s.
    r-   r   paramsr   )r   _get_flat_param_to_fqnr   r   r   r   r   r6   r5   '_get_param_to_param_id_from_optim_input_get_param_to_param_keyr   r   r   r_   r   rT   r   r   r?   r   )r  r   r   r  r  r  r   flat_param_to_fqnparam_to_param_key$unflat_param_names_to_flat_param_key#unflat_param_name_to_flat_param_keyr   rT   flat_param_keyr   sharded_osd_staterekeyed_osd_staterd   r   rekeyed_osd_param_groupsunflat_param_groupflat_param_groupflat_param_keyss                          r:   _rekey_sharded_optim_state_dictr,  m  s   & 'u-M.u5>BR\\5c?*+ ! 4E;G(u0-AR	? !"c-&8888 	 )
 	 ( &3%8%8%: T!!**+E2JX,U3E-FG!3 	TES/0AB	TT $G,46-335 8[c3%0c"=AA""C$:$:
 -8.)8 $9; "-n"= 		>#}}-?@$ .@-I) 88IJO *9X&$++,<=		> +<TUU*++s   F
c                 V   |"t        t        | j                                     S 	 t        t        t
        j                     t        |            }t        |      dk(  rt        d      d}d}|D ]2  }|t        |t        j                        z  }|t        |t               z  }4 |s|st        d      |rt        t        |            S |sJ g }|D ]#  }d|v }	|	sJ d       |j                  |d          % t        t        |            S # t        $ r}t        d|       |d}~ww xY w)	a]  
    Constructs a mapping from parameter IDs to parameters. This may be used
    both for models with ``FlatParameter`` s and without.

    NOTE: This method is only preserved for backward compatibility. The method
    :meth:`_get_param_key_to_param` is the preferred code path that does not
    rely on ``optim_input``.

    NOTE: We critically assume that, whether the optimizer input is a list of
    parameters or a list of parameter groups, :class:`torch.optim.Optimizer`
    enumerates the parameter IDs in order. In other words, for a parameter list
    input, the parameter IDs should be in that list order, and for a parameter
    groups input, the parameter IDs should be in order within each parameter
    group and in order across parameter groups.

    Args:
        model (nn.Module): Model whose parameters are passed into the
            optimizer.
        optim_input (Optional[Union[List[Dict[str, Any]],
        Iterable[nn.Parameter]]]): Input passed into the optimizer
            representing either a :class:`list` of parameter groups or an
            iterable of parameters; if ``None``, then this method assumes the
            input was ``model.parameters()``. (Default: ``None``)

    Returns:
        List[nn.Parameter]: Mapping from parameter IDs to parameters,
        where the parameter ID is implicitly the index in the :class:`list`.
    NzCOptimizer input should be an iterable of Tensors or dicts, but got r   z#Optimizer input should not be emptyTz9Optimizer input should be an iterable of Tensors or dictsr  zNA parameter group should map "params" to a list of the parameters in the group)rJ   	enumerate
parametersr   r   r   r   r^   	TypeErrorr   r   r_   rK   rL   extend)
r   r  r  eall_tensors	all_dictsr   param_id_to_paramparam_grouphas_params_keys
             r:   '_get_param_id_to_param_from_optim_inputr8    sZ   N Ie..0122d2<<(${*;< 6{a>?? KI -z%66Zt,,	- ySTTIf%&&9,. 8![0 	
&	
~ 	  X!678 	+,--;  "m%
 	s   *D 	D(D##D(c                 t    d }d }i }t        | ||t        |       D cg c]  \  }}|	 c}}|      S c c}}w )a  
    Constructs a mapping from ``FlatParameter`` to a cleaned (devoid of prefixes
    from wrappers) fully qualified name (FQN). Note that this FQN is "non-canonical"
    because ``FlatParameter``  s do not come from the original module but are
    registered only after FSDP has been applied. This function returns the FSDP-given
    name for the ``FlatParameter`` (usually module._flat_param) as opposed to the
    canonical FQNs returned for ``FlatParameter`` s in ``_common_utils._get_param_to_fqns(...)``).

    Consequently, this function will only return a non-empty mapping if FSDP was
    applied with ``use_orig_params=False`` as, otherwise, the original parameters
    are used within the module and there would be no ``FlatParameter`` s in the module.

    c                 v    t        | d      D ])  \  }}t        |t              st        ||z         }|||<   + y )NF)recurse)r   r_   r   r   )moduleprefix
tree_levelr!  
param_namer   r   s          r:   	module_fnz)_get_flat_param_to_fqn.<locals>.module_fn  sI    !BE"
 	+J e]3#FZ$78C'*e$	+r9   c                     | S r>   r8   )r!  s    r:   	return_fnz)_get_flat_param_to_fqn.<locals>.return_fn         r9   r   r   )r   r@  rB  flat_param_to_fqn_retr   r   s         r:   r  r    sJ    +! 79<UCDaD  	E   4r   r!  c                    i }|r0||J d       |J t        |      D ]  \  }}||t        |      <    i }d}	| j                  D ][  }
|rC|
d   D ]:  }|J ||v r||   }n|J t        ||         dk(  sJ ||   d   }	 ||   }|||<   < H|
d   D ]  }|||	<   |	dz  }	 ] |S # t        $ r/}t	        d| dt        |j                                d      |d}~ww xY w)	z
    Constructs a mapping from parameter keys to parameters. For the regular
    optimizers, the keys are parameter IDs. For NamedOptimizer, the keys
    are FQNs. This API may be used both for models with ``FlatParameter`` s and
    without.
    NzDThe optimizer is a NamedOptimizer, `param_to_fqns` must not be None.r   r  rm   zCan't find z from r   )r   r   r   r   KeyErrorr^   r@   )r   r   r  r   r!  clean_fqn_to_curr_fqnrd   r   param_key_to_parampidr6  r   r2  s                r:   _get_param_key_to_paramrL  -  s    -/%*;*G	RQ	RG   7> 	@FC<?!"3C"89	@ ?A
C)) $X. 0(444--+E2C(444}U34999'.q1C/4C
 +0"3'!0$ %X. */"3'q)0    "%cU&6K6P6P6R1S0TTUVs    B%%	C.*CCc                 t    t        | ||||      }|j                         D ci c]  \  }}||
 c}}S c c}}w )z
    Constructs the inverse mapping of :func:`_get_param_key_to_param`. This API
    only supports the case where `optim` is a regular optimizer, not NamedOptimizer.
    So the parameter keys will be parameter ids.
    )rL  r   )r   r   r  r   r!  r5  param_idr   s           r:   r   r   `  sD     0u(-9J 4E3J3J3LM%E8OMMMs   4c                 n    t        | |      }|j                         D ci c]  \  }}||
 c}}S c c}}w )zRConstructs the inverse mapping of :func:`_get_param_id_to_param_from_optim_input`.)r8  r   )r   r  r5  rN  r   s        r:   r  r  r  s7     @{S3D3J3J3LM%E8OMMMs   1r0_optim_state_keysoptim_state_key_to_param_keyrJ  c           	         g }| D ]H  }||vr|j                  |       ||   }t        |t              s/|dk\  r|t        |      k  rCJ d        t	        |      }t        j                  t        |      gt
        j                  |      }t        j                  ||       |j                         dkD  rt        t        j                  |            D 	cg c]  }	d  }
}	t        j                  |
||       d}t        |
      D ]N  \  }}t        t         t"           |      }t        |      dkD  s,|d| d|D cg c]  }|j$                   c} z  }P t'        |      y c c}	w c c}w )Nr   z+Check the `param_key_to_param` constructionr   rn   zFSDP currently requires each rank to have at least the optimizer states needed by rank 0's optimizer but some ranks are missing some of those statesz
Rank z' is missing states for the parameters: )r   r_   r6   r   r   rK   r   int32r{   
all_reducer  r   get_world_sizeall_gather_objectr.  r   r   rS   rT   r   )rP  rQ  rJ  ro   missing_keysr0_optim_state_key	param_keyru   num_missingr   obj_list	error_msgr   r@   rd   s                  r:   _check_missing_keys_on_rankr]    s    *,L1 
=%AA  2301CD	i%>i#"3 ' =<= 
= $E*F,,L 12%++fUKOOKu-A"'(;(;E(B"CDQDDDxUC/ 	
 $H- 	JD$^,d3D4y1}dV#J:>?3..?@B		 9%% D @s   	E":E'r   
merge_keysc                    t        j                  |      }i }g }|j                         D ]  \  }	}
|	| d   vr||
   }t        |
t              }|r*|d   |v s#J |d   t        |j                               f       |d   |v }t        t        |      |      }|dk(  s|r|j                  |       |	||<    |rst        t        j                  |            D cg c]  }g  }}t        j                  |||       |D cg c]  }|D ]  }|  }}}t        t        |            }||fS |dk(  r|gndg}t        j                  |d|       |d   J |d   }t!        ||||       ||fS c c}w c c}}w )a@  
    Construct the local mapping between the ``_OptimStateKey`` and parameter keys
    and all the ``_OptimStateKey`` across ranks. If ``merge_keys`` is False, rank0
    must contain all the ``_OptimStateKey``, an exception will be raised otherwise.
    Note that ``merge_keys`` should equal to ``use_orig_params``.
    r-   r   )rT   rU   rn   Nr   )r{   r   r   r_   r   r^   r@   rS   r   r   r   rU  rV  r?   r   r   r]  )r   ro   rJ  r   r   r^  r   rQ  all_optim_state_keysrY  r   r   rU   optim_state_keyr   all_keys
local_keysrd   merge_all_optim_state_keyskey_obj_lists                       r:   _map_param_key_to_optim_keysrf    s    ==DJL 13.446 B	5 ,W55U#$UM:744 Q+00237 4 q'%;;($T{+
 19
 ''88A$_5'B* d11%890
B0
 0
 	x)=UK"*&
j&
/2C&
&
" &
  &c*D&EF  !=== '+ai!"dV 	 	""<QeDA***+A# (		
  !===-0
&
s   	E+?E0
state_dictc                    g }| d   D ]k  }t        j                  |      }|d   D cg c]  }||   	 }}|D cg c]  }||   	 }	}|	D 
cg c]  }
|
D ]  }|  c}}
|d<   |j                  |       m |S c c}w c c}w c c}}
w )Nr   r  )r   r   r   )rg  rJ  r   r   r*  r)  r%  param_group_paramsr   nested_unflat_param_namesrT   r   s               r:   _unflatten_param_groupsrk    s    
 *,L&~6 0!]]+;< #38"<
 ~.
 

 /A%
%*M% %
! %

 '@(
"%7(
 " (
(
8$
 	./0 
%
(
s   A8A=Bc                     | j                  dd      }|sy	 t        t        |j                                     }t        |t              S # t        $ r}t	        |       |d}~ww xY w)a
  
    Returns whether the state_dict is from a NamedOptimizer.
    This function checks that the keys in the state_dict['state'] are strings
    (which usually are FQNs) versus integers (which usually refer to param_ids
    from a vanilla torch.optim.Optimizer).
    r-   NF)r   r   r   r@   	Exceptionr_   r5   )r   r-   rd   r2  s       r:   _is_named_optimizerrn    sg       $/E 14

%& c3  1()q01s   "A	 		A#AA#c                   ^    e Zd ZU eeef   ed<   eeej                  f   ed<   eee	f   ed<   y)	StateInfotensorsscalar_tensorsr  N)
r1   r2   r3   r   r5   rN   r4   rK   rL   r   r8   r9   r:   rp  rp    s7     #(())ell*++c3hr9   rp  input_statesc                    i }t        | j                        D cg c]  }i  }}|j                         D ]  \  }}t        i i i       }t	        |      D ]  \  }}	t        j                  |	      r_|	j                         dk(  r|	j                         |j                  |<   Lt        |	j                  |	j                        |j                  |<   z|	|j                  |<    |||<    t        j                   ||| j"                         |S c c}w )z
    Given the ``input_states``, allgather StateInfo for each state. The function
    uses all_gather_object to gather StateInfo so no GPU tensors are sent.
    r   rn   )r   rq   r   rp  rB   rK   ri   rj   r`   rr  rN   rO   rP   rq  r  r{   rV  r}   )
r   rs  processed_state_dictr   gathered_state_infor   rc   processed_stater   r   s
             r:   _allgather_state_inforx    s    24*//077 7 )..0 4[#BB/!-k!: 
	@Ju%99;!#AFO22:>:KU[[;O++J7 ;@++J7
	@ %4S!4 	&&
 17s   	D	rv  output_statesc                    i }|j                         D ]`  \  }}|D cg c]  }||   	 }}t        |D 	
ch c]#  }	|	j                  j                         D ]  }
|
 % c}
}	      }t	               }d}|D ]%  }g }t	               }t        |      D ]  \  }}|j                  d       |j                  j                  |d      }|=|j                  j                         |d<   |s|j                  }n||j                  k(  sJ |d   dk(  s{|j                  |        |r||k(  sJ |}||vr| j                  D cg c]  }d c}||<   ||   j                  |d      }|%|j                  | j                  j                        }|||   | j                  |   <   ( t        |      D ]  \  }}||v r|j                   j                         D ]:  \  }}|j                  |d      }|||k(  sJ d| d| d| dd| z          |||<   < |j"                  j                         D ]K  \  }}|j                  |d      }|-t%        j&                  ||      sJ d| d| d| dd| z          |||<   M  c |fS c c}w c c}
}	w c c}w )	a1  
    Given the ``gathered_state_info`` and ``input_states``, the API converted
    the StateInfo into the original state if the state is not a non-scalar
    tensor. For a multi-dimensional tensor, the local state will be stored in
    ``state_buffer`` in a correct order for later allgather purpose.
    Nr   zRank z has different values for z: r   z Other ranks: )r   r?   rq  r@   r   r.  r   r   rO   r   rP   addr/   rw   r-   rt   r  rr  rK   equal)rV   rv  rs  ry  state_buffersr   gathered_states
state_infor-   nall_tensor_statesempty_ranksrP   r   numels_empty_ranksr   object_stateinfor   local_statenamenon_tensor_valuecurr_non_tensor_valuescalar_tensor_valuecurr_scalar_tensor_values                              r:   _convert_all_state_infor  C  s1    >@M,224 ?;^&9:af:
:"&E50B0B0DE1QEQE
 !$'+ , 	XJF%(UL&/
&; 
+"la #++//
DA#!%!1!1!3F2J  $

$

222":? $$T*
+ #k\&AAA&K."1"?"?-D-j) 's+//
DAK &)nn_-B-B-Q-QRLWM*%o&C&CC&HI7	X@ #,J"7 	;D,{"*6*B*B*H*H*J 	8&&(6(:(:4(F%)1,0@@ D6!;D6DTCUUVW&'<&=>?	A (8t$	8 .:-H-H-N-N-P ;))+9+=+=dD+I(/75;;')A<  D6!;D6DWCXXYZ&'?&@AB  (;t$;	;S?;B -A ;E0-s   I;(J :	Jc           	      D   |sy| j                   j                  }| j                  }|j                         D ]O  \  }}	|	|   }
| j                  |   }t        |
t              r|
j                  d   }|t               k7  r|j                  }|
j                  t               f      }t        |j                  |         }||xx   |
j                  j                  d      z  cc<   t        j                   |      }|
j#                  |      }
n=|
j#                  |j                  |         }
n|
j#                  |j                  |         }
|r|j$                  }t'        |dd      r;|j(                  J t+        |
|j,                  |j(                  |j.                        }
n|j0                  J t3        |
|j,                  |j4                  |j6                  j9                         |j0                  |j.                        }
n>|s<t;        j<                  d      5  |
j?                         jA                         }
ddd       |rFt;        j<                  t:        jB                  jD                        5  |
jG                         }
ddd       |
|	|<   R y# 1 sw Y   ZxY w# 1 sw Y   xY w)a,  
    Given an output state dict, ``output_states``, whose keys are FQNs to the
    original parameters (not FlatParameters nor parameter IDs) and whose values
    are gathered states, unflatten the states to the original dimensions.

    This function performs the unflattening process in-place.
    Nr   )
placementsr   Fr   )$r.   rp   r-   r   r/   r_   r'   r  r(   rj   redistributer^   r   device_meshry   rK   rQ   reshaper   r   r   r    r   r   r}   r!   rq   r~   r   r   r   r   r   r   D2Hr`   )rV   ry  r   rY   rX   rZ   rp   r   r   r  r   r   	placementplacement_dimvalue_localreshape_sizer   s                    r:   _unflatten_orig_param_statesr    s[     ''22J &&J,224 2+^z*#11#6	 eW%((+I IK' )#00Y[N0K#J$6$6y$AB]+u/@/@/E/Ea/HH+$zz,7l3 j&8&8&CD MM*"4"4Y"?@E#<<Jz>59!..:::*OO++..	 "//;;;)OO))--::<,,.. ''0 /,,./ ''(;(;(?(?@ $		$%*z"e2+X/ /$ $s   J
(J
J	J	c                 
   | j                   }|j                  dk(  r]t        j                         t        j                  j
                  k(  r.t        j                  d|j                  j                                |j                         D ci c]  }|i  }}t        | |||      \  }	}
t        |
      dk(  r|S | j                  j                         D cg c]  \  }}||v rdnd }}}| j                  j                   }t#        j$                  t&        j(                  |	|j*                        } ||j,                        }|j                  j/                          |
def _allgather_orig_param_states(
    fsdp_param_info: FSDPParamInfo,
    gathered_state_info: List[Dict[str, StateInfo]],
    input_states: Dict[str, Any],
    shard_state: bool,
    to_save: bool,
    cpu_offload: bool,
) -> Dict[str, Dict[str, Any]]:
    """
    Given the ``gathered_state_info`` and ``input_states``, the API allgathers
    all tensor states and restores non-tensor states from ``gathered_state_info``.
    """
    fsdp_state = fsdp_param_info.state
    if dist.get_debug_level() == dist.DebugLevel.DETAIL:
        logger.warning(
            "Memory Summary before calling to _allgather_orig_param_states %s",
            fsdp_state._device_handle.memory_summary(),
        )

    output_states: Dict[str, Dict[str, Any]] = {fqn: {} for fqn in input_states.keys()}

    dtype, state_buffers = _convert_all_state_info(
        fsdp_param_info, gathered_state_info, input_states, output_states
    )
    if len(state_buffers) == 0:
        return output_states

    has_state_params: List[bool] = [
        True if fqn in output_states else False
        for fqn, idx in fsdp_param_info.param_indices.items()
    ]

    flat_param = fsdp_param_info.handle.flat_param
    empty_func = functools.partial(
        torch.empty, dtype=dtype, device=fsdp_state.compute_device
    )
    gathered_tensor = empty_func(flat_param._padded_unsharded_size.numel())
    fsdp_state._device_handle.synchronize()

    # For each optimizer state name: build this rank's shard of the flattened
    # state (inserting placeholder buffers for padding and for frozen params
    # without state), allgather the full flat state, and split it back into the
    # original (unflattened) parameter views.
    for state_name, buffers in state_buffers.items():
        local_buffers: List[torch.Tensor] = []
        begin = fsdp_state.rank * flat_param._sharded_size.numel()
        # ``end`` is inclusive.
        end = begin + flat_param._sharded_size.numel() - 1
        # ``param_idx`` indexes the parameters packed into this FlatParameter.
        mem_offset, param_idx = 0, 0
        for numel, is_padding in zip(
            flat_param._numels_with_padding, flat_param._is_padding_mask
        ):
            frozen_and_no_state = not is_padding and (
                not fsdp_param_info.param_requires_grad[param_idx]
                and not has_state_params[param_idx]
            )

            if is_padding or frozen_and_no_state:
                # This memory range is padding, or the parameter is frozen and
                # has no optimizer state; fill its overlap with ``[begin, end]``
                # with an uninitialized placeholder buffer.
                padding_begin, padding_end = mem_offset, mem_offset + numel - 1
                if padding_begin <= begin <= padding_end:
                    # The padding overlaps ``[begin, end]`` and starts before ``begin``.
                    padding_len = (
                        padding_end - begin + 1 if end >= padding_end else end - begin + 1
                    )
                elif begin <= padding_begin <= end:
                    # The padding overlaps ``[begin, end]`` and starts after ``begin``.
                    padding_len = (
                        end - padding_begin + 1
                        if end <= padding_end
                        else padding_end - padding_begin + 1
                    )
                else:
                    padding_len = 0
                if padding_len:
                    local_buffers.append(empty_func(padding_len))

            if not is_padding:
                # This memory range is a parameter in the FlatParameter, so there
                # should be a corresponding optimizer state unless the parameter
                # is frozen, which is treated as padding above.
                if not frozen_and_no_state and buffers[param_idx] is not None:
                    local_buffers.append(buffers[param_idx])
                param_idx += 1

            mem_offset += numel

        shard_numel_padded = flat_param._sharded_size.numel() - sum(
            t.numel() for t in local_buffers
        )
        assert flat_param._shard_numel_padded == shard_numel_padded, (
            "Manually calculated _sharded_numel_padded is incorrect. "
            f"_shard_numel_padded={flat_param._shard_numel_padded}, "
            f"shard_numel_padded={shard_numel_padded}, "
            f"_sharded_size.numel={flat_param._sharded_size.numel()}, "
            f"_numels_with_padding={flat_param._numels_with_padding}, "
            f"begin={begin}, end={end},"
        )
        if shard_numel_padded > 0:
            # Add a padding tensor to the end of local_buffers.
            local_buffers.append(empty_func(shard_numel_padded))
        local_shard = torch.cat(local_buffers)
        assert local_shard.numel() * fsdp_state.world_size == gathered_tensor.numel(), (
            "The size of local shard times the world size should equal to the "
            "gathered tensor size. The inconsistency may be from a bug of "
            "FlatParameter's metadata or the reconstruction logic in optimizer "
            "state dict."
        )
        fsdp_state._device_handle.synchronize()
        with SimpleProfiler.profile(SimpleProfiler.Type.ALLGATHER):
            dist.all_gather_into_tensor(
                gathered_tensor, local_shard, group=fsdp_state.process_group
            )
            # Synchronizing is slow but makes failures easier to debug.
            fsdp_state._device_handle.synchronize()

        unpadded_tensor = gathered_tensor[: flat_param._unpadded_unsharded_size.numel()]
        flat_param_handle = fsdp_param_info.handle
        orig_states = flat_param_handle._get_unflat_views_aligned(unpadded_tensor)
        assert len(orig_states) == len(fsdp_param_info.param_indices), (
            "The number of parameters from FlatParameter is not consistent to "
            "the number of states used by optimizer state dict reconstruction "
            "logic."
        )
        for fqn, idx in fsdp_param_info.param_indices.items():
            if fsdp_param_info.param_requires_grad[idx] or fqn in output_states:
                output_states[fqn][state_name] = orig_states[idx]

        _unflatten_orig_param_states(
            fsdp_param_info, output_states, state_name, shard_state, to_save, cpu_offload
        )
    del gathered_tensor
    return output_states


def _gather_all_orig_param_state(
    fsdp_param_info: FSDPParamInfo,
    input_states: Dict[str, Any],
    shard_state: bool,
    to_save: bool,
    cpu_offload: bool,
) -> Dict[str, Any]:
    """
    Given an optimizer state dict, ``input_states``, whose keys are FQNs of the
    original parameters (not FlatParameters nor parameter IDs), gather all the
    states and unflatten them to the original dimensions. Note that all the
    params referred to by ``input_states`` must be managed by FSDP.
    """
    fsdp_state = fsdp_param_info.state
    if (
        fsdp_state.world_size == 1
        or fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD
    ):
        return input_states if to_save else {}

    with SimpleProfiler.profile(SimpleProfiler.Type.RESHARDING):
        with SimpleProfiler.profile(SimpleProfiler.Type.ALLGATHER_OBJ):
            gathered_state_info = _allgather_state_info(fsdp_state, input_states)
        output_states = _allgather_orig_param_states(
            fsdp_param_info,
            gathered_state_info,
            input_states,
            shard_state,
            to_save,
            cpu_offload,
        )
    if to_save:
        for key, idx in fsdp_param_info.param_indices.items():
            if key in output_states:
                continue
            if not fsdp_param_info.param_requires_grad[idx]:
                continue
            raise RuntimeError(
                f"{key} is not in the output state. "
                "The FSDPParamInfo has the param keys "
                f"{sorted(fsdp_param_info.param_indices.keys())} while "
                "the output_states has the param keys "
                f"{sorted(output_states.keys())}."
            )
        return output_states
    else:
        return {}


def _convert_state_with_orig_params(
    all_optim_state_keys: List[_OptimStateKey],
    optim_state_key_to_param_key: Dict[_OptimStateKey, Union[int, str]],
    fqn_to_fsdp_param_info: Dict[str, FSDPParamInfo],
    optim_state_dict: Dict[Union[str, int], Any],
    to_save: bool,
    shard_state: bool,
    cpu_offload: bool = True,
) -> Dict[str, Any]:
    fsdp_osd_state: Dict[str, Any] = {}
    # ``FSDPParamInfo`` is not hashable, so deduplicate the per-FlatParameter
    # states by ``id()`` and gather them all at once below.
    all_states: Dict[int, Dict[str, Any]] = {}
    # Iterate in rank 0's flat parameter ID order to ensure aligned allgathers
    # across ranks.
    for optim_state_key in all_optim_state_keys:
        param_key: Union[str, int, None] = optim_state_key_to_param_key.get(
            optim_state_key, None
        )

        if param_key is None and not optim_state_key.is_fsdp_managed:
            continue

        if optim_state_key.is_fsdp_managed:
            fqn = optim_state_key.unflat_param_names[0]
            fsdp_param_info = fqn_to_fsdp_param_info.get(fqn, None)
            if fsdp_param_info is None:
                # This can happen if not all FSDP instances own all the
                # parameters, e.g. when composing FSDP with some MPMD-style
                # parallelism.
                continue
            state = {} if param_key is None else optim_state_dict[param_key]
            if id(fsdp_param_info) not in all_states:
                all_states[id(fsdp_param_info)] = {}
            all_states[id(fsdp_param_info)][fqn] = state
        elif to_save:
            assert len(optim_state_key.unflat_param_names) == 1
            unflat_param_name = optim_state_key.unflat_param_names[0]
            with SimpleProfiler.profile("none_fsdp_managed_copy"):
                param_key = cast(Union[str, int], param_key)
                fsdp_osd_state[unflat_param_name] = copy.copy(
                    optim_state_dict[param_key]
                )
                if cpu_offload:
                    for state_name, value in sorted_items(
                        fsdp_osd_state[unflat_param_name]
                    ):
                        if not torch.is_tensor(value):
                            continue
                        fsdp_osd_state[unflat_param_name][state_name] = value.cpu()

    # Instead of gathering the state of each parameter individually, gather all
    # the states of each FlatParameter at once to speed up the process.
    for _all_states in all_states.values():
        fqn = next(iter(_all_states.keys()))
        fsdp_param_info = fqn_to_fsdp_param_info[fqn]
        assert len(fsdp_param_info.param_requires_grad) > 0, (
            "With use_orig_params, FSDPParamInfo should have requires_grad "
            "information. However, the length is zero."
        )
        for key, idx in fsdp_param_info.param_indices.items():
            if key in _all_states:
                continue
            if not fsdp_param_info.param_requires_grad[idx]:
                continue
            raise RuntimeError(
                f"{key} is not in the optimizer state. "
                "The FSDPParamInfo has the param keys "
                f"{sorted(fsdp_param_info.param_indices.keys())} while "
                "the optimizer has the param keys "
                f"{sorted(_all_states.keys())}."
            )
        fsdp_osd_state.update(
            _gather_all_orig_param_state(
                fsdp_param_info, _all_states, shard_state, to_save, cpu_offload
            )
        )

    return fsdp_osd_state


def _convert_state_with_flat_params(
    all_optim_state_keys: List[_OptimStateKey],
    optim_state_key_to_param_key: Dict[_OptimStateKey, Union[int, str]],
    fqn_to_fsdp_param_info: Dict[str, FSDPParamInfo],
    optim_state_dict: Dict[Union[str, int], Any],
    to_save: bool,
    shard_state: bool,
    cpu_offload: bool = True,
) -> Dict[str, Any]:
    fsdp_osd_state: Dict[str, Any] = {}
    # Iterate in rank 0's flat parameter ID order to ensure aligned allgathers
    # across ranks.
    for optim_state_key in all_optim_state_keys:
        param_key: Union[str, int, None] = optim_state_key_to_param_key.get(
            optim_state_key, None
        )

        assert param_key is not None, (
            "If use_orig_params is False, we must be able to find the "
            f"corresponding param id. {optim_state_key} {param_key}"
        )

        if optim_state_key.is_fsdp_managed:
            # If there are multiple unflat_param_names, they share the same
            # FSDPParamInfo, so the first name is sufficient to fetch it.
            fqn = optim_state_key.unflat_param_names[0]
            fsdp_param_info = fqn_to_fsdp_param_info[fqn]
            unflat_state = _unflatten_optim_state(
                fsdp_param_info,
                optim_state_dict[param_key],
                to_save,
                shard_state,
                cpu_offload,
            )
            if to_save:
                assert len(unflat_state) == len(optim_state_key.unflat_param_names)
                for unflat_param_name, unflat_param_state in zip(
                    optim_state_key.unflat_param_names, unflat_state
                ):
                    fsdp_osd_state[unflat_param_name] = unflat_param_state
        elif to_save:
            assert len(optim_state_key.unflat_param_names) == 1
            unflat_param_name = optim_state_key.unflat_param_names[0]
            fsdp_osd_state[unflat_param_name] = copy.copy(optim_state_dict[param_key])
            if cpu_offload:
                for state_name, value in sorted_items(
                    fsdp_osd_state[unflat_param_name]
                ):
                    if not torch.is_tensor(value):
                        continue
                    fsdp_osd_state[unflat_param_name][state_name] = value.cpu()

    return fsdp_osd_state

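# Illustrative note (example data only, not from the upstream implementation):
# both converters above return a ``state`` mapping keyed by original-parameter
# FQNs. For a FlatParameter that packs ``lin.weight`` (8x8) and ``lin.bias``
# (8,), the flat-param path conceptually rewrites
#
#     {0: {"step": 10, "exp_avg": <flat tensor with numel 72>}}
#
# into
#
#     {
#         "lin.weight": {"step": 10, "exp_avg": <8x8 tensor>},
#         "lin.bias": {"step": 10, "exp_avg": <8-element tensor>},
#     }
#
# i.e. zero-dim and non-tensor states are copied per FQN, while positive-dim
# states are unflattened back to each original parameter's shape.
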
@torch.no_grad()
def _optim_state_dict(
    model: nn.Module,
    optim: torch.optim.Optimizer,
    optim_state_dict: Dict[str, Any],
    optim_input: Optional[Union[List[Dict[str, Any]], Iterable[nn.Parameter]]],
    rank0_only: bool,
    shard_state: bool,
    group: Optional[dist.ProcessGroup],
    using_optim_input: bool,
    use_orig_params: bool = False,
    cpu_offload: bool = True,
) -> Dict[str, Any]:
    """
    Consolidates the optimizer state and returns it as a :class:`dict`
    following the convention of :meth:`torch.optim.Optimizer.state_dict`,
    i.e. with keys ``"state"`` and ``"param_groups"``.
    The flat parameters in ``FSDP`` modules contained in ``model`` are mapped
    back to their unflattened parameters.

    Parameter keys are not well-defined. For a regular optimizer, the optimizer
    state_dict contains a mapping from parameter IDs to parameter states.
    Parameter IDs are the order of parameters in ``optim.param_groups()`` across
    all the groups. This API also allows the user to pass ``optim_input`` for the
    mapping between parameters and parameter IDs. Using ``optim_input`` is being
    deprecated.

    If the optimizer is a ``NamedOptimizer``, the optimizer state_dict does not
    contain a parameter-ID mapping but a mapping from parameter FQNs to parameter
    states. This API finds the mapping from FQNs to parameters if the optimizer
    is a ``NamedOptimizer``.

    If ``use_orig_params`` is True, each rank will have all FSDP-managed
    parameters but some of these parameters may be empty due to the sharding.
    For a regular optim.Optimizer, states for those empty parameters will
    not be initialized. So, when aggregating the FQNs across ranks, no assert
    will be raised on a rank even if it does not have all the states -- this is
    valid and FSDP knows how to aggregate them. However, FSDP has to ignore
    parameters that are not managed by FSDP and do not exist on the local
    rank -- those are managed by other parallelisms and FSDP does not know how
    to handle/aggregate them.

    Args:
        model (nn.Module): Root module (which may or may not be a
            :class:`FullyShardedDataParallel` instance) whose parameters
            were passed into the optimizer ``optim``.
        optim (torch.optim.Optimizer): Optimizer for ``model`` 's
            parameters.
        rank0_only (bool): If ``True``, saves the populated :class:`dict`
            only on rank 0; if ``False``, saves it on all ranks. (Default:
            ``True``)
        shard_state (bool): If ``True``, shard and distribute all
            non-zero-dimension states.

    Returns:
        Dict[str, Any]: A :class:`dict` containing the optimizer state for
        ``model`` 's original unflattened parameters and including keys
        "state" and "param_groups" following the convention of
        :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=True``,
        then nonzero ranks return an empty :class:`dict`.
    """
    SimpleProfiler.reset()
    cm = ExitStack()
    cm.enter_context(SimpleProfiler.profile(SimpleProfiler.Type.ALL))
    _reset_flat_param_grad_info_if_needed(traversal_utils._get_fsdp_handles(model))
    to_save = not rank0_only or dist.get_rank(group) == 0 or shard_state

    with SimpleProfiler.profile("preprocessing"):
        param_to_fqns = _get_param_to_fqns(model)
        flat_param_to_fqn = _get_flat_param_to_fqn(model)
        is_named_optimizer = _is_named_optimizer(optim_state_dict)

        param_key_to_param = cast(
            Dict[Union[int, str], nn.Parameter],
            (
                _get_param_id_to_param_from_optim_input(model, optim_input)
                if using_optim_input
                else _get_param_key_to_param(
                    optim, model, is_named_optimizer, param_to_fqns, flat_param_to_fqn
                )
            ),
        )
        fqn_to_fsdp_param_info = _get_fqn_to_fsdp_param_info(model)

    with SimpleProfiler.profile("preprocessing_with_comm"):
        all_optim_state_keys, optim_state_key_to_param_key = _map_param_key_to_optim_keys(
            optim_state_dict,
            group,
            param_key_to_param,
            param_to_fqns,
            fqn_to_fsdp_param_info,
            merge_keys=use_orig_params,
        )

    with SimpleProfiler.profile("state_converting"):
        convert_fn = (
            _convert_state_with_orig_params
            if use_orig_params
            else _convert_state_with_flat_params
        )
        fsdp_osd_state = convert_fn(
            all_optim_state_keys,
            optim_state_key_to_param_key,
            fqn_to_fsdp_param_info,
            optim_state_dict["state"],
            to_save,
            shard_state,
            cpu_offload,
        )

    # At this point, communication is complete, and ranks that save nothing can
    # return early.
    if not to_save:
        return {}

    fsdp_osd: Dict[str, Any] = {"state": fsdp_osd_state}

    flat_param_fqns = set(flat_param_to_fqn.values())
    for key, value in optim_state_dict["state"].items():
        if key in fsdp_osd_state:
            continue
        if key in flat_param_fqns:
            continue
        if key in param_key_to_param:
            continue
        # This key is not recognized by FSDP. It may be a user-defined state or
        # some parameter state that FSDP is unable to map from ``optim.param_groups``.
        warnings.warn(
            f"Found a optim state, {key}, that FSDP cannot process. FSDP "
            "will directly copy everything to the returned state_dict. In "
            "most cases, this is a user-defined state that is not "
            "associated with any particular parameter. Another possible "
            "case is this state is managed by TorchRec. Otherwise, there may "
            " be a mismatched assumption of optim_state_dict of this mode."
        )
        fsdp_osd_state[key] = value

    if "param_groups" in optim_state_dict:
        fsdp_osd["param_groups"] = _unflatten_param_groups(
            optim_state_dict, param_key_to_param, param_to_fqns
        )

    cm.close()
    SimpleProfiler.dump_and_reset("FSDP _optim_state_dict() profiling: ")

    return fsdp_osd

def _get_fqn_to_fsdp_param_info(model: nn.Module) -> Dict[str, FSDPParamInfo]:
    """
    Construct the mapping from a param's fqn to its corresponding ``FSDPParamInfo``
    if the param is managed by FSDP. Shared parameters, or original parameters that
    are shared across multiple nn.Modules, are required to belong to one and only
    one FSDP instance and thus correspond to one ``FlatParameter``. Within the one
    ``FlatParameter``, ``FlatParameter._fqns`` only stores the first FQN of a shared
    parameter. Thus, the keys in the mapping are guaranteed to map to unique parameters.
    """

    def module_fn(module, prefix, tree_level, fqn_to_param_info):
        fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
        if fsdp_state is None:
            return
        _lazy_init(fsdp_state, module)
        handle = _module_handle(fsdp_state, module)
        if not handle:
            return
        flat_param = handle.flat_param
        fsdp_param_info = FSDPParamInfo(fsdp_state, handle, {}, [])
        # ``FlatParameter._fqns`` stores FQNs local to the wrapped module;
        # ``idx`` indexes the parameters packed into this FlatParameter.
        for idx, local_fqn in enumerate(flat_param._fqns):
            fqn = clean_tensor_name(prefix + local_fqn)
            if fqn in fqn_to_param_info:
                assert fqn_to_param_info[fqn].handle.flat_param is flat_param, fqn
            fqn_to_param_info[fqn] = fsdp_param_info
            fsdp_param_info.param_indices[fqn] = idx
            if flat_param._params is not None:
                fsdp_param_info.param_requires_grad.append(
                    flat_param._params[idx].requires_grad
                )

    def return_fn(fqn_to_param_info):
        return fqn_to_param_info

    fqn_to_param_info: Dict[str, FSDPParamInfo] = {}
    return _apply_to_modules(
        model,
        module_fn,
        return_fn,
        [fqn for fqn, _ in _named_parameters_with_duplicates(model)],
        fqn_to_param_info,
    )


def _set_optim_use_dtensor(
    fsdp_state: _FSDPState,
    state_dict_settings: StateDictSettings,
) -> None:
    # If a device_mesh was passed in when initializing FSDP, automatically turn
    # on the ``_use_dtensor`` flag for the optimizer state dict config, which
    # requires a sharded (not local) state_dict_type.
    if getattr(fsdp_state, "_device_mesh", None):
        state_dict_type = state_dict_settings.state_dict_type
        if state_dict_type == StateDictType.LOCAL_STATE_DICT:
            raise RuntimeError(
                "Found state_dict_type LOCAL_STATE_DICT. "
                "DeviceMesh is not compatible with LOCAL_STATE_DICT. "
                "Please set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict."
            )
        else:
            state_dict_settings.optim_state_dict_config._use_dtensor = True

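# Hedged sketch of where ``_set_optim_use_dtensor`` comes into play: when FSDP
# is initialized with a ``device_mesh`` and a sharded state-dict type is
# selected, optimizer states are returned as DTensor/ShardedTensor shards
# rather than full tensors. Illustrative only; ``model``/``optim`` as above.
#
#     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
#
#     with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
#         sharded_osd = FSDP.optim_state_dict(model, optim)
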
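# Illustrative round-trip sketch (not part of this module): a consolidated
# optimizer state dict produced by the helpers above can be mapped back to the
# flattened, sharded form expected by the local optimizer before loading.
# Note that the argument order of ``optim_state_dict_to_load`` has shifted
# across PyTorch releases, so check the installed version's signature.
#
#     osd = FSDP.optim_state_dict(model, optim)            # save side
#     # ... checkpoint to disk and reload ...
#     flattened_osd = FSDP.optim_state_dict_to_load(model, optim, osd)
#     optim.load_state_dict(flattened_osd)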