
    sgl                     n   U d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d d	l2m3Z3m4Z4 d d
l5m6Z6 d dl7m8Z8 d dl9m:Z; d dl<m=Z= g dZ>dZ?dZ@dZAdZBeeC   ZDee6eej                  eFeGeCf   ZHeeHeeH   eeH   eeCdf   f   ZIeeCeIf   ZJeeJ   ZKeeCeeJeKf   f   ZL eM       ZNee   eOd<   e j                  d        ZQe G d d             ZRe G d deR             ZS ej                  d      	 	 dHdej                  deCdeVdeVdeDf
d        ZW G d! d"      ZXd# ZYddd$dej                  d%eej                  j                  d&f   d'eVd(eeej                        d)eeR   deSfd*Z\d+eeCeIf   d,eLd-eSddfd.Z]d/eej                  ej                  j                  f   d0eCdefd1Z^d2eeCef   d-eSdeeCef   fd3Z_ ej                         dej                  d-eSdeeCeIf   fd4       Za ej                         dej                  d2eeCeIf   d-eSde8fd5       Zbd6ej                  j                  ddfd7Zcd2eLdeeCeIf   fd8Zdd6ej                  j                  d2eeCeIf   d-eSdeLfd9Ze ej                         dej                  d:eej                  j                  d&f   d-eSdeLfd;       Zfdej                  d6ej                  j                  d,eLd-eSdeLf
d<Zg ej                         dej                  d:eej                  j                  d&f   d2eLd-eSddf
d=       Zhddd$dej                  d(eeej                        d)eeR   deeCeIf   fd>Ziddd$dej                  d:eej                  j                  eej                  j                     f   d(eeej                        d)eeR   deLf
d?Zjddd$dej                  d:eej                  j                  eej                  j                     f   d(eeej                        d)eeR   deeeCeIf   eLf   f
d@Zkdej                  d2eeej                  eeCeIf   f   eeCeIf   f   deeCeIf   fdAZlddBdej                  d+eeCeIf   d)eeR   de8fdCZmddBdej                  d:eej                  j                  eej                  j                     f   d,eLd)eeR   ddf
dDZnddBdej                  d:eej                  j                  eej                  j                     f   d+eeCeIf   d,eLd)eeR   de8fdEZoeddBdej                  d)eeR   ddfdF       ZpeddBdej                  d:eej                  j                  d&f   d)eeR   ddfdG       Zqy)I    N)asdict	dataclassfield)chain)AnyCallablecastDict	GeneratorIterableListno_type_checkOptionalSetTupleUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater+   _patched_state_dictc               #      K   t        j                         } t        j                          	 d  | rt        j                          y y # | rt        j                          w w xY wwN)gc	isenableddisableenable)
is_enableds    Z/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextrC   ]   sD     JJJLIIK :IIK s   )A$A A$A!!A$c                   t    e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	eed<   dZ
eed<   dZeed	<   dZeed
<   y)r/   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dictN)__name__
__module____qualname____doc__rE   bool__annotations__rF   rG   rH   rI   rJ   rK        rB   r/   r/   h   sT    "H "OT!K!&$&$(T(FD!&$&). $.rS   r/   c                   h   e Zd ZU  ee      Zeeee	j                  f   eee	j                  f   f   ed<    ee      Zeeee	j                  f   eee	j                  f   f   ed<    ee      Zee   ed<   dZeed<   dZeed<   ej*                  Zeed<    ee      Zeej8                     ed	<   y
)_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rL   rM   rN   r   dictrW   r
   r   strtorchTensorr)   rQ   rX   setrY   r   rZ   rP   r[   
contextlibnullcontextr\   r   listr]   r   nnModulerR   rS   rB   rU   rU      s     	d# tc5<< %(<"== $
 	d# 4c5<< %(<"== $ $)#=C=L$L$'33L(3$)$$?L$ryy/?rS   rU   )maxsizemodelnameskip_ddp_prefixskip_compiler_prefixreturnc                 *   |j                  t        d      }d|vr|hS |j                  d      }g }| }t        |      D ]  \  }}t	        |t
              r(|dk(  sJ |j                  }|r-|j                  |       ?t	        |t              r|t        |      dz
  k  rW||dz      t        k(  rHdj                  |      }	t        |t              }
|	r|	 d}	|
j                  D ch c]  }|	 | 
 c}c S t        |t              }|t        k7  s|j                  |       t        ||      }t	        |t        j                   j"                  j$                        r*|dk(  sJ |j&                  }|r4|j                  |       G|j                  |       |t(        j*                  j                  j,                  k(  r|t        |      dz
  k7  st/        d      t        ||      } dj                  |      j                  t        d      hS c c}w )a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `Set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .module   	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPrq   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr$   r`   _dynamo
eval_frameOptimizedModulers   rf   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)ri   rj   rk   rl   	obj_namesfqn_obj_namescurr_objicurr_obj_nameprefix
flat_paramfqns               rB   	_get_fqnsr      s   . <<*B/D
$v

3IMH%i0 <=h$ H,,,H"$$]3$'3y>A%%)AE*:k*I-0$X{;
 &xq\F4>4D4DES6(3%(EEx)<=H 33$$]3"8];%--":":"J"JK K///))H'$$]3  /

 1 1 I III**&'VWW"8];9<< HH]#++,>CDD% Fs   #Hc                       e Zd Zy)_EXTRA_STATEN)rL   rM   rN   rR   rS   rB   r   r      s    rS   r   c              #      K   t               dt        j                  dt        dt        ffd | d      E d {    y 7 w)Nrq   curr_fqnrm   c              3   N  K   j                  |        |r| dnd}| j                         D ]!  \  }}|v r| | } ||      E d {    # t        | j                  d      | j	                  d            D ]   \  }}|| j
                  v r| | }||f " t        | j                  dt        j                  j                        t        j                  j                  k7  r7| t        j                  j                  j                   }|t               f y y 7 ׭w)Nrp   ro   F)recurseget_extra_state)addnamed_childrenr   named_buffersnamed_parameters_non_persistent_buffers_setr~   	__class__rf   rg   r   r   rq   r   r   )rq   r   rj   	submodulenew_fqnobjr   visited_moduless         rB   r   z+_iterate_valid_model_state.<locals>.recurse   s9    F#%-hZq>2%446 	3OD)O+!
4&)Gy'222		3    /1H1HQV1H1W
 	ID# v999!
4&)G3,	 F$$&79R9RSyy(() "
2::#4#4#L#L"MNG<>))	) 3s   AD%D#CD%ro   )rb   rf   rg   r_   r   )ri   r   r   s    @@rB   _iterate_valid_model_stater      s>     &)eO*		 *S *Y *2 ub!!!s   :AAA)
submodulesoptionsoptims.
optim_onlyr   r   c                   |rt        j                  dt               |r|st        d      |xs
 t	               }i }i }t        |       D ]  \  }}t        |t              rt        | |      }	|j                  |d      }
|
2t        t        t           ||         j                  |	       ||   ||<   n|	j                         ||<   |	D ]  }
t        |t              r|||
<     t        |j!                               D ])  \  }}|D ]  }
t        t"        j$                  |      ||
<   ! + t'               }|rat'        |      }| j)                         D ]C  \  }}||vrt        | |      }	t+        |	      dk(  sJ d       |j                  d |	D               E |j,                  r|j.                  st1        d      t3        j4                  |       }|r|j.                  rat7        |j8                  |j8                        }t;        |j8                  |j8                  xs |j,                        }t<        j>                  }n<tA        |j8                  	      }tC        |j8                  	      }t<        jD                  }tF        jH                  d
        }tK        jL                  || |||      }ntF        jN                  }tQ        di tS        |      ||||t        tT        tV        jX                     |      | t+        |      dkD  dS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrr   z)Submodule FQN should only have 1 instancec              3   &   K   | ]	  }| d   yw)rp   NrR   ).0r   s     rB   	<genexpr>z"_verify_options.<locals>.<genexpr>C  s     %@CQi%@s   z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              3      K   t        j                         5  t        j                  ddt               t	        j
                  | |||      5  d  d d d        d d d        y # 1 sw Y   xY w# 1 sw Y   y xY ww)NignorezFSDP.state_dict_type)messagecategoryrq   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningrz   r   r   s       rB   $fsdp_state_dict_type_without_warningz=_verify_options.<locals>.fsdp_state_dict_type_without_warninga  sy      ((* 
''&<} ))!$3&7,C	  	
 
 	
 
s4   A;6A/A#A/	A;#A,	(A//A84A;r   r   )rW   rX   rY   r\   r]   rZ   r[   rR   )-r   warnr   r   r/   r   rw   r   r   getr	   r   r_   updatecopyre   itemsr`   ra   rb   named_modulesr{   rJ   rE   
ValueErrorrz   r]   r   rF   r   r"   FULL_STATE_DICTr    r   SHARDED_STATE_DICTrc   contextmanager	functoolspartialrd   rU   r   r   rf   rg   )ri   r   r   r   r   rW   rX   rj   paramfqnsr   param_fqns_rY   rq   r]   r   r   r   r   r\   s                        rB   _verify_optionsr     s+    I 		
 &I
 	
 +)+G 	 
 	  2%8 /ee\*%##E40?S,U34;;DA+<U+C!%( (,yy{e$ 	/Ce\2).!#&	//  399;< D 	DC)-ellF)C!#&	DD $'5_
!//1 	ALD&Z'UD)Dt9>N#NN>%%%@4%@@	A ##G,C,CM
 	
 $$U+L "" 3&22w?R?R! '?&22#//O73O3O'# ,;;O 6&22! 'B&22'# ,>>O		"	"	 
#	$ !((0+/$;
 "-- 	
/	+3-!$ryy/<8#^&kAo	 	rS   model_state_dictoptim_state_dictinfoc                     |j                   D ]  }t        |      }|J d        |j                  rk| si|j                  s]|j                  sQ|j
                  r|j                  s9|j                  r-|j                  s!t        dt        j                         d      |j                  r4|s2|j
                  r|j                  s|j                  st        d|       | j                         D ]  }t        |v st        | dt         d       y )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rp   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)r]   r#   rZ   rY   rG   rF   rE   rI   rJ   r   distget_rankr[   keysr|   )r   r   r   rq   
fsdp_statekeys         rB   _verify_state_dictr     s'   
 ## SCFK
%R'RR%S 	 ''))!!d&:&:KK))'mmo'q*
 	
  %%$*>*>..::J9KM 
  $$& #%z+ /* * rS   r   apic                     t        | |      }|t        v r+t        j                  t        | j                  |      |       }|S )N)self)r~   r:   r   r   r   )r   r   calls      rB   _state_dict_fnr     s9    3D""  !<3GKrS   
state_dictc                     |j                   rF|j                  rt        j                  j	                         sdnd}t        | |j                  |      S |j                  rt        |       S | S )NrR   )r   )rF   
ranks_only)rE   rF   r`   distributedis_initializedr   r   )r   r   r   s      rB   _maybe_full_or_cpu_state_dictr     sn      $$E,=,=,L,L,N  	
 "D$4$4
 	
 
		)*55rS   c                    |j                   si S |j                         5   t        | d             }d d d        t        j	                               D ]w  }t        | |      }t        |      dk(  s	J ||f       t        t        |            }||k7  s@dt        fd} |||      st        d| d|       |j                  |      ||<   y |j                  rgi }|j	                         D ]P  }|j                  D ]?  }|j                  |      s|j                  r	||   ||<   *|t        |      d  }	||   ||	<   A R |}|j                  rI| j!                         D ]6  \  }}
|
j"                  rt        | |      }|D ]  }|j                  |        8 t        |j%                               D ]9  \  }}t'        j(                  |      s|j*                  s)|j                  |       ; t-        ||      S # 1 sw Y   xY w)Nr   rr   rm   c                    t        |      t        |       k\  ry|j                  d      }| j                  d      }d}t        |      D ]:  \  }}|||   k(  r'|dz  }|t        |      k(  s"|t        |      dz
  k(  c S |dv r: y y)NFrp   r   rr   )rq   rs   T)r{   ru   rv   )r   r   	fqn_split	key_splitfqn_idxkey_idxkey_names          rB   verifyz%_get_model_state_dict.<locals>.verify  s    s8s3x' IIcN	IIcN	)29)= %%GX9W#551"c)n4#*c)nq.@#@@!%<< $% rS   zAn unexpected key, z, exists. FQN is )rZ   r\   r   re   r   r   r{   nextiterrP   r   poprY   
startswithrH   rG   r   requires_gradr   r`   	is_tensoris_metar   )ri   r   r   r   r   r   r   new_state_dictr   r   r   ps               rB   _get_model_state_dictr     s    					 ;8^E<8:
; JOO%& 2$4yA~*T{*~4:#:D " #s#"%8=Nse#TUU(nnS1JsO72: /1??$ 	>C11 >~~f-//*4S/N3'!#f+-0G.8oN7+>	> $
  002 	$JC""UC(D $s#$		$ z'')*  Q??1!))NN3  )T::u; ;s   G55G?c           	      ,   |j                   r|s|j                  st        i i       S i }t        |       D ]u  \  }}t	        | |      }t	        | |dd      }t        ||      D ]F  \  }}	|j                  rt        j                         dk(  r||	k7  r|j                  |      ||	<   |||	<   H w d}
|j                  s|j                  rd }|j                         D ]O  \  }}t        j                  |      s|j                         dkD  s0||j                  }?||j                  k(  rOJ  |J |t        j                  d      k(  r t        j                  j!                         }d}
|j                  rt#        ||||j$                         n|j                  rt'        |||       |j                         D ]
  \  }}|||<    |j)                         5  t+        t         t-        | d      ||j$                  |
	            cd d d        S # 1 sw Y   y xY w)
NF)rk   rl   r   metaT)devicerI   r   load_state_dict)r   rI   assign)rZ   rJ   r&   r   r   zipr   r   r   rE   r   r`   r   dimr   distributed_c10d_get_pg_default_devicer   rI   r   r\   r	   r   )ri   r   r   local_state_dictr   valuer   fqns_with_prefixr   fqn_with_prefixr   r   local_states                rB   _load_model_state_dictr     s    Z8Q8Q R((07 6
U$$3E
 %(.>$? 	6 C--A1E(.8nnS.A
?+05_-	66 F  D$8$8*002 	2JCu%%))+/>"\\F!U\\111	2 !!!U\\&))**AACFF$$!,VDKK !!":/?O 0 6 6 8 	*C)JsO	* 
			 
4N5"34%dkk&

 
 
s   -H

Hoptimc                    | j                   ry| j                  D ]  }|t           D ]  }|j                    y ! | j                  D ]7  }|t           D ])  }|j                  st        j                  |      |_        + 9 g }| j                  D ]   }d|v s|j                  |d          d|d<   " | j                  d       | j                  D ]  }d|v s|j                  d      |d<    | j                  d       y)zH
    Initialize optim states by calling the step() with zero grads.
    Nlrg        )closurer   T)set_to_none)r9   r7   _PARAMSgradr   r`   
zeros_likery   stepr   	zero_grad)r   param_groupr   lrss       rB   _init_optim_stater  J  s    {{ ))  ) 	Ezz%	
 )) 5 ) 	5E"""--e4
	55 C)) $;JJ{4() #K$ 
JJtJ )) +; #
K+ 
OOO%rS   c           
         d }i }t        t        | t                 j                         D ]D  \  }}t        t        |      j                         D ]  \  }} ||       ||t         d| d| <     F t        t        | t
                 D ]\  }|j                  t              }t        t        t           |      D ]+  }|j                         D ]  \  }}||t
         d| d| <    - ^ |S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 ~    t        | t        j                  t        t        f      st        dt        |        d      y )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rp   )rw   r`   ra   intfloatNotImplementedErrortype)vs    rB   _raise_if_type_not_supportedz?_flatten_optim_state_dict.<locals>._raise_if_type_not_supported  s>    !ellC78%7)1&  9rS   rp   )
r	   r,   _STATEr   r-   _PGr   r  r   r_   )	r   r  retr   r9   kr  r
  r   s	            rB   _flatten_optim_state_dictr  p  s    T !#C=*V*<=CCE +
U.446 	+DAq(+)*C6(!C5!%&	++
 -z#? ,w'S	4( 	,C#))+ ,1*+se1SE1#&',	,,
 JrS   c                    i }g }t         |t        |i}| j                  D ]Q  }|j                  t        g i       |t           D ]  }|j
                  |   D ]  }|d   t           }	t        |	t              sJ |	j                  |       |j                  s?i ||<   | j                  |   j                         D ]'  }
|t          d| d|
    t        t        ||         |
<   )   t        t        t           |d   t                 d   }|j                         D ]V  }|t        k(  r|t         d| d|    }||d   vr	||d   |<   .|d   |   |k7  s:t        d| d| d| d|d   |    d	       T |S )z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    rp   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r  r  r7   ry   r  rW   rw   re   r   r9   r   r	   r,   r   r_   r   )r   r   r   r9   pg_state
return_osdr
  r   r   r8   
state_namefirst_param_fqnr  r   s                 rB   _unflatten_optim_state_dictr     s    E"$H&,eS(%CJ)) "& ) 	E--e4 
!"g.!&$///c"**c
"'++e"4"9"9"; JBL!(!C5*6CDc
3J?
	 tCy(2,w*?@C!!# 	AG|#a'8!=>E$"'Q"aE)"==L<MQqc R 3HRLO3DAG 	!: rS   
optimizersc                    |j                   si S t        i t        g i}|D ]  }t        |        t	        |d             }|j
                  r|j                         5  t        j                  | ||      }d d d        |s_t        |t           j                               D ]9  }d|v s|t           j                  |      |t           |j                  dd      <   ; |t           D ]1  }|t           D cg c]  }|j                  dd       }}||t        <   3 n/t        t        j                  d |j                   D                    }t#        t%        |t'        t)        |                        }	i }
| j+                         D ]I  \  }}t-        | |      }t)        |      dk(  sJ t/        t1        |            }||	vr;|	|   }||
|<   ||
|<   K t        |t           j                               D ])  }|
|   }|t           j                  |      |t           |<   + |t           D ]#  }|t           D cg c]  }|
|   	 c}|t        <   % |s-t3        t4        |t                 j7                  |t                  t3        t8        |t                 j;                  |t                   |j<                  rt3        t>        tA        |            }tC        ||      S # 1 sw Y   ixY wc c}w c c}w )Nr   rs   z
_orig_mod.ro   c              3   .   K   | ]  }|t              y wr<   )r  )r   gs     rB   r   z(_get_optim_state_dict.<locals>.<genexpr>  s     -UQaj-Us   rr   )"r[   r  r  r  r   r]   r\   rz   r   re   r   r   rt   r  r   from_iterabler7   r^   r   ranger{   r   r   r   r   r	   r,   r   r-   extendrK   r.   r  r   )ri   r!  r   r   r   osdr  r$  r8   param_pid_mappingfqn_pid_mappingr   r   r   r   pidgroups                    rB   _get_optim_state_dictr-    s    	,2BR+@ ,H% 1nUL13""$ ?++E5#>? #f+**,- R!#?B6{q?QCK		, ;<R X $?@zJ!!))L"5JJ#'
$ %---U%BTBT-UUVF $Ss6{1C%D E O#446 +
U ,4yA~%~4:& 11'.'*$'*$+ CK,,./ 8%c*#&v;??3#7FC 8 S RBG.!Q3/#"6!QgR ],V45<<S[I 0 56==c#hGY,H\ (( 9:J K
 ))94@@_? ? K* "Rs   K1K'0K,K$	c           
      "   i }g }t         |t        |i}i }t        d t        t        |t                  j                         D              r|S |j                  D ]L  }|j                  t        g i       |t           D ]%  }	|j                  |	   D ]  }
|
|j                  v rCd}t        t        |t                 D ]&  }|
t        t        t           |t                 v s$d} n nd}|sZ|d   t           }t        |t              sJ |j                  |
       |	j                   rt        t        |t                  |
   ||
<   t        t        |t                 D ]D  }|
t        t        t           |t                 v s$t#        |t                 dz
  |t%        |      <   F  ( O t        t        |t                 D ]M  }|j'                  t%        |      d      }|dk(  r$|j)                         D ]  \  }}|t        k(  r|||   |<    O |S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c              3   <   K   | ]  }t        |t                y wr<   )rw   r  )r   r  s     rB   r   z*_split_optim_state_dict.<locals>.<genexpr>7  s       
1cs   FTr  rr   )r  r  allr	   r,   r   r7   ry   r  rW   rX   r-   r   r_   rw   re   r   r{   idr   r   )ri   r   r   r   r9   r  r  
pg_mappingr
  r   r   	in_paramsloaded_param_groupr8   idxr   r   s                    rB   _split_optim_state_dictr6    s3   * E"$H&,eS(%CJ!#J
 $(8H8P$Q$V$V$X   )) V"& ) 	VE--e4 V$444 %I.2)+;C+@/ "* $tCy2DW2M"NN(,I!" !%I !"g.!&$///c"&&!%m5Ef5M!Ns!SE#J*.%'7'<+ V& d49.@.IJJ=@C=QTU=U
2&8#9:	V'V	VV8 -/?/DE 'nnR_b1"9%++- 	'JCg~!&HSM#		'	' rS   c           
      v   |j                   sy |D ]  }t        |       |r@t        |v rt        | |||      }n+t	        |t        t        t        t        f   |      |      }ni }|j                  rS| j                         D ]  \  }}t        | |      }t        | |d      }	||	k(  r't        |      dk(  sJ |j                         }
|	j                         }|t           D ]N  }t        t        t        t        f   |      }|t            D cg c]  }|j#                  |
|       }}||t         <   P t        t$        |t                 }t'        |j)                               D ]+  }|
|v s|j                  |      ||j#                  |
|      <   -  |j+                         5  t-        j.                  | ||      }d d d        n|j0                  rd|_        t3        | |f|      }d|_        d fd}t5        t6        j8                  ||      }J t;        |      \  }}t;        |      \  }}|j<                  rt?        ||       ntA        ||       |j)                         D ]  }||vs||v sJ ||   ||<   ||   ||<    tC        ||      } tE        |d      |        y c c}w # 1 sw Y   %xY w)	NF)rl   rr   Tc                     | j                         dkD  r*| j                  | S | j                  k7  rt        d      | S )Nr   zDevice mismatch)r   r   r   )tr   s    rB   _devicez'_load_optim_state_dict.<locals>._device  sD    557Q;~!"   188+():;;rS   r   r   )r   )#r[   r  r  r6  r   r	   r
   r_   r+   r]   r   r   r{   r   r  r   r  rt   r,   re   r   r\   rz   optim_state_dict_to_loadrE   r-  r(   r`   ra   r   rJ   r   r   r   r   )ri   r!  r   r   r   r   original_fqn_r   fqns_with_compilerr   fqn_with_compilerr$  valr   r8   	osd_stater  r   r:  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keyr   s                            @rB   _load_optim_state_dictrG  e  s	     QN% ##:5*d$  $?4S)^ 4jA4$   " $)#9#9#; Xa 5%.<e&" --4yA~%~hhj$6$:$:$<!)#. *AtCH~q1CGJ7|@CC):;F  $*CL* !0@0HI	inn./ XAaxGP}}UVGW	!))C1B"CDX%X, ""$ #'#@#@5"2$   !!#(D 4UUHdK#'D F ellG5EFA%%%':;K'L$K3FGW3X00((%k3DVT&{4EfU
 )--/ J	$55$3333>y3I%i03>y3I%i0	J
  5!#4  	1u/0<LMcQN: s   J*
#J//J8	c                    t               5  t        | dd||      }t        | |      }t        |i |       |cddd       S # 1 sw Y   yxY w)aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    rR   Fr   r   r   N)rC   r   r   r   )ri   r   r   r   r   s        rB   r0   r0     sV    0 
 
 !
 1=+R6
  
  
 s   +A  A	c                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | ||      }t        i ||       |cddd       S # 1 sw Y   yxY w)a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    TrI  N)	rC   rw   r`   r   	Optimizertupler   r-  r   )ri   r!  r   r   r   r   s         rB   r1   r1     s    6 
   *ekk&;&;< Mz" 	
 !
 1
DI2/6     s   AA33A<c                   t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d||      }t        | |      }t        | ||      }t        |||       ||fcddd       S # 1 sw Y   yxY w)a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    FrI  N)
rC   rw   r`   r   rK  rL  r   r   r-  r   )ri   r!  r   r   r   r   r   s          rB   r2   r2     s    L 
 2 *ekk&;&;< Mz" 	
 !
 1=0
DI+-=tD!11!2 2 2s   A,BB
c           
         |si S t        t        t        |j                                     t        j
                        rt        j                  dt               t        t        t        j
                  t        t        t        f   f   |      }i }|j                         D ]  \  }}| j                         D ]y  \  }}||k7  rt        | |      }t!        |      dk(  sJ d       t        t        |             d}	|j#                  |j                         D 
ci c]  \  }
}|	|
z   | c}}
       {  |S t        t        t        t        f   |      S c c}}
w )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rr   z/FQNs for a submodule should only have 1 elementrp   )rw   r   r   r   rf   rg   r   r   r   r	   r
   r_   r+   r   r   r   r{   r   )ri   r   cast_state_dictr   r   sub_state_dictrj   mr   r   subfqnr   s               rB   _unflatten_model_state_dictrS  n  s<    	$tJOO-./;" 	
 tBIItCN/C$CDjQ/1)8)>)>)@ 
	%I~ ..0 	a	> -4yA~X'XX~ d,-Q/%%AOAUAUAWXVf_e+X	
	 Di(*55	 Ys   E)r   c                    t        | |      }t               5  t        | dd|      }t        |i |       t	        | ||      cddd       S # 1 sw Y   yxY w)a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    rR   Fr   r   N)rS  rC   r   r   r   )ri   r   r   r   s       rB   r3   r3     s`    : .I. 
 EubUGL+R6%e-=tD	E E Es   )A

Ac                    t               5  t        |t        j                  j                        r|fn
t        |      }t        | |d|      }t        i ||       t        | |||       ddd       y# 1 sw Y   yxY w)a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    TrU  N)	rC   rw   r`   r   rK  rL  r   r   rG  )ri   r!  r   r   r   s        rB   r4   r4     sz    6 
 	J *ekk&;&;< Mz" 	
 ujT7S2/6uj2BDI	J 	J 	Js   AA11A:c                .   t        | |      }t               5  t        |t        j                  j
                        r|fn
t        |      }t        | || |      }t        |||       t        | |||       t        | ||      cddd       S # 1 sw Y   yxY w)a4  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    rU  N)rS  rC   rw   r`   r   rK  rL  r   r   rG  r   )ri   r!  r   r   r   r   s         rB   r5   r5     s    T .I. 
 E *ekk&;&;< Mz" 	
 :.>*>
 	+-=tDuj2BDI%e-=tDE E Es   A*BBc                $   t        j                  t        | |      fd}|| _        t        j                  t        | |      dt
        t        t        f   ffd}|| _        t        j                  |       t        j                  |       y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )ri   r   c                               S r<   rR   _state_dict_calls   rB   state_dict_callz0_patch_model_state_dict.<locals>.state_dict_call<      !!rS   r   c                      |        y )N)r   rR   r   _load_state_dict_calls    rB   load_state_dict_callz5_patch_model_state_dict.<locals>.load_state_dict_callG      z:rS   N)r   r   r0   r   r3   r
   r_   r   r   r:   r   )ri   r   r\  ra  r`  r[  s       @@rB   _patch_model_state_dictrc    s    6 !((" 'E%--;c3h ; 1EO,01rS   c                   t        j                  t        | ||      fd}t        j                  t        | ||      dt        t
        t        f   ffd}t        j                  |       t        j                  |       t        |t        j                  j                        r|fn
t        |      }|D ]  }||_        ||_         y)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )ri   r!  r   c                               S r<   rR   rZ  s   rB   r\  z4_patch_optimizer_state_dict.<locals>.state_dict_callx  r]  rS   r   c                      |        y )N)r   rR   r_  s    rB   ra  z9_patch_optimizer_state_dict.<locals>.load_state_dict_call  rb  rS   N)r   r   r1   r4   r
   r_   r   r:   r   rw   r`   r   rK  rL  r   r   )ri   r!  r   r\  ra  r   r`  r[  s         @@rB   _patch_optimizer_state_dictrg  R  s    > !(( 	" &-- 	;c3h ; O,01 j%++"7"78 
: 
  5* 45rS   )TT)rrc   r   r=   r   dataclassesr   r   r   	itertoolsr   typingr   r   r	   r
   r   r   r   r   r   r   r   r   r`   torch.distributedr   r   torch.nnrf   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   rz   r   r   r    r!   r"   $torch.distributed.fsdp._common_utilsr#   r$   torch.distributed.tensorr%   torch.nn.modules.moduler&   torch.nn.parallelr'   rx   torch.utils._pytreer(   __all__r|   r  r  r  r_   r)   ra   r  r  r*   r+   r,   r-   r.   rb   r:   rQ   r   rC   r/   rU   	lru_cacherg   rP   r   r   r   r   rK  r   r   r   r   no_gradr   r   r  r  r   r-  r6  rG  r0   r1   r2   rS  r3   r4   r5   rc  rg  rR   rS   rB   <module>ry     s     	  0 0         A 	 	 	 - 5 < -" 
		Sg}ellCKL4&m(<d3CS>TT	 S)^$' #u]4E%EFFG  &)U S] *   +/ +/ +/\ @% @ @ T" !!%	;E99;E
;E ;E 	;E
 ;E #;E|	 	"H ,0*.99%++'',- 
 RYY( &' D*3	>**(* * 
	*Zbii)>)>>? c h S#X&4	#s(^$ @;99@;*@;	#y.@; @;F 2
992
S)^$2
 2
 	2
 2
j#&U[[22 #&t #&L=*< =c9nAU =@*;;  *S)^$* * 	*Z <A99<Aekk++S01<A <A 	<A <A~E99E;;  E )E 	E
 EP ZN99ZNekk++S01ZN #ZN 	ZN
 
ZN ZN@ ,0*.	" 99"  RYY("  &'	" 
 
#y." R ,0*.* 99* ekk++Xekk6K6K-LLM*  RYY(	* 
 &'*  * b ,0*.V299V2ekk++Xekk6K6K-LLMV2 RYY(	V2
 &'V2 4Y!334V2r6996d299d3	>&::;T#y.=QQR6 
#y.6J +/	$E99$E3	>*$E &'	$E
 $EX +/$J99$Jekk++Xekk6K6K-LLM$J )$J
 &'$J 
$JZ +/9E999Eekk++Xekk6K6K-LLM9E 3	>*	9E
 )9E &'9E 9E|  +/129912 &'12 
	12 12l 
 +/	;599;5 ekk++S01;5 &'	;5
 
;5 ;5rS   