
    sg´                     N   d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmc mc mZ d dlmc mc mZ d dlmc mc mZ  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5m6Z6 d dl7m8Z8 d d	l9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA d d
lBmCZC d dlDmEZE d dlFmGZG d dlHmIZI erd dlJmKZK dZL	 d dlMmNZNmOZO  eQd      ZRdZSeej                  ej                  f   ZUeeej                  eUf      ZVe?j                  e6j                  e?j                  e6j                  e?j                  e6j                  e?j                  e6j                  e?j                  e6j                  iZ\e?j                  e?j                  gZ]e?j                  e?j                  fZ^e	 d^de,deVde?deeC   dee'   de,fd       Z_ede,deVde'de,fd       Z`ededeafd       Zbede'deafd       ZcedeQdej                  fd       Zded ej                  deQdej                  fd!       Zed ej                  deQdeej                  ej                  f   fd"Zfe	 d^de,d#e"j                  d$ee
ejD                  j                        d%eee
ejD                  j                        ee
ejD                  j                        f   de,f
d&       Zid%ee   d'eaddfd(Zjede,d#e"j                  d)ee"j                     d*eeeQej                  f      de,f
d+       Zlede,d#e"j                  de,fd,       Zmede,dee?   d-ee>   d.ee;   d/ead0ead1eQd2eQde,fd3       Znede,de,fd4       Zoede,d5e:d6eade,fd7       Zped^de,de'de,fd8       Zqede,de,fd9       Zrd#e"j                  d:ee"j                     ddfd;Zsede,d<e"j                  d*eeeQej                  f      d=eee"j                  gdf      d>eade,fd?       Ztede,d:ee"j                     d<e"j                  fd@       ZudAe"j                  dBee
ejD                  j                        dee"j                     fdCZv	 d^dAejD                  j                  d$eejD                  j                     dDee
ejD                  j                        deejD                  j                     fdEZwdAejD                  j                  d$eejD                  j                     deex   fdFZydAe"j                  deex   fdGZzd#e"j                  d)ee"j                     d*eeeQej                  f      ddfdHZ{d*eeeQej                  f      dIeQdJe+deej                     fdKZ|d#e"j                  d)ee"j                     d$ee"j                     deeaeaf   fdLZ}dAe"j                  d=ee"j                  gdf   d$ee"j                     ddfdMZ~dAe"j                  dNeej                     d$ee"j                     dJe+fdOZdAe"j                  d$ee"j                     dee"j                     fdPZd#e"j                  d)ee"j                     dQeej                     dNeej                     ddf
dRZd:ee"j                     dSeej                     dNeej                     ddfdTZdU Zd#e"j                  d)ee"j                     dNeej                     dIeQdJe+dej                  fdVZd#e"j                  d:ee"j                     dej                  ddfdWZdXeej                     ddfdYZd#e"j                  d)ee"j                     dee"j                     fdZZd)ee"j                     ddfd[Zde?fd\Zdej                  de$j                  fd]Zy# eP$ r dZLY w xY w)_    N)AnyCallableDequeDict	GeneratorIterableIteratorListno_type_checkOptionalSetTupleTYPE_CHECKINGUnion)default_hooks)_mesh_resources
DeviceMesh)_get_default_group)_FSDPDeviceHandle
_FSDPState_get_module_fsdp_state_is_fsdp_flattened!_named_parameters_with_duplicatesclean_tensor_nameTrainingState)_FSDP_USE_FULL_PREC_IN_EVALFlatParameterFlatParamHandleHandleShardingStrategy)_FreeEventQueue)BackwardPrefetch
CPUOffloadFullOptimStateDictConfigFullStateDictConfigMixedPrecisionShardingStrategyStateDictConfigStateDictType)_Policy)DTensorExtensions)_sync_params_and_buffers)is_traceable_wrapper_subclass)RemovableHandleT)deferred_initfakeFi  _fsdp_syncedstateprocess_groupsharding_strategypolicydevice_meshreturnc                 (   ||t        d      |t        v }|r#|||t        d| d      t        | ||      } n4|r|| _        |j	                  d      | _        n||n	t               | _        | j
                  j                         | _        | j
                  j                         | _	        | j                  }|r|| j                  j                         z  }t        j                  j                  |      | _        || j                  z  | _        | S )NzcCannot pass both process_group and device_mesh at the same time. Please just pass only one of them.zManual wrapping with zA requires explicit specification of process group or device_mesh.r   mesh_dim)
ValueErrorHYBRID_SHARDING_STRATEGIES*_init_process_group_state_for_hybrid_shard_device_mesh	get_groupr2   r   ranksize
world_size_inter_node_pgr   DefaultState_get_gradient_predivide_factor_gradient_predivide_factor_gradient_postdivide_factor)r1   r2   r3   r4   r5   is_hybrid_strategydata_parallel_world_sizes          U/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py_init_process_group_staterJ   g   sI     [%<<
 	
 +.HH V^8K '(9': ;S S 
 ?}kE !,E"-"7"7"7"CE "/!:@R@T  $$))+EJ**//1E$// E$8$8$=$=$?? ""AA$	
 
$ 	!5#C#CC 
% L    c                    |rYt        |      r6|| _        |j                  d      | _        |j                  d      | _        nt        d|j                         |@t               }t        || j                  j                               \  }}|| _        || _        n2t        |      r|\  | _        | _        nt        dt        |             t        | j                        | _        | S )Nr   r8      z,Expected device_mesh to have ndim=2 but got zmExpected process_group to be passed in as either None or Tuple[dist.ProcessGroup, dist.ProcessGroup] but got r2   )"_is_valid_hybrid_shard_device_meshr=   r>   rB   r2   r:   ndimr   !_init_intra_and_inter_node_groups_device_handledevice_count_is_valid_hybrid_shard_pg_typetype_get_default_comm_hook_state_inter_node_state)r1   r2   r5   default_groupintra_node_groupinter_node_groups         rI   r<   r<      s
    -k:!,E $/#8#8!#8#DE "-"7"7"7"CE>{?O?O>PQ  
	*,-N5//<<>.
** // *-8 9F5E!5GGKMGZF[] 
 ;**E LrK   c                 j    t        | t              xr" t        |       dk(  xr t        d | D              S )N   c              3   P   K   | ]  }t        |t        j                           y wN)
isinstancedistProcessGroup).0pgs     rI   	<genexpr>z1_is_valid_hybrid_shard_pg_type.<locals>.<genexpr>   s     Jb
2t001J   $&)r_   tuplelenallrN   s    rI   rT   rT      s:     	=%( 	K!#	KJMJJrK   c                 D    t        | t              xr | j                  dk(  S )Nr\   )r_   r   rP   )r5   s    rI   rO   rO      s    k:.H;3C3Cq3HHrK   num_devices_per_nodec                 6    t        j                  |       \  }}|S )aU  
    Return a process group across the current node.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return an intra-node subgroup across
    [0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
    For example, rank 3 would get [0, 1, ..., 7].
    )r`   new_subgroups)rj   intra_node_subgroup_s      rI   _init_intra_node_process_groupro      s!     "//0DErK   global_process_groupc                 T   d}t        j                  |       }t        j                  |       }||z  }t        j                  |       |z  }t	        |      D ]?  }t	        |      D cg c]
  }|||z  z    }	}t        j
                  |	|      }
||k(  s>|
}A |
J | d       |S c c}w )a  
    Return an inter-node process group where each contained rank has the same local rank.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
    depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
    would get [5, 13].
    N)ranksbackendz. expected to assign inter-node pg, but did not)r`   get_backendget_world_sizeget_rankrange	new_group)rp   rj   inter_node_pgsharding_backendrA   	num_nodesmy_local_rank
local_rankiranks_for_inter_groupgrps              rI   _init_inter_node_process_groupr      s      M''(<=$$%9:J22IMM"67:NNM01  
=B9=M!
89J!223!
 !
 nn#8BRS&M  	!H
FGH!!
s   %B%c                 0    t        |      t        | |      fS )a  
    Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
    ``_HYBRID_SHARD_ZERO2`` in FSDP.
    This function assumes each node has an equal number of CUDA-enabled devices.
    Returns:
        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
    )ro   r   )rp   rj   s     rI   rQ   rQ   
  s#     	'';<&';=QR rK   moduleignored_modulesignored_statesc                    ||t        d      d }|d u}|rt        |      }t        |d       ng }t        |t        |      ng d       t        |      dkD  r"t	        |d   t
        j                        r|}n|}t        ||      | _        t        || j                  |      | _
        t        || j                        | _        | S )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r:   list_check_ignored_statesrg   r_   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r1   r   r   r   ignored_parameterspassed_as_ignored_statesignored_states_lists          rI   _init_ignored_module_statesr     s     "~'A:
 	
 -T9">2148 %4%@D!b%	
 !#)!,bll;!41O1&/JE/E
 #<#E LrK   r   c                    t        |       dk(  ry|r`t        d | D              }t        d | D              }|s9|s6t        | D ch c]  }t        |       c}t              }t        d|       yyt        d | D              s6t        | D ch c]  }t        |       c}t              }t        d|       yc c}w c c}w )	z
    Check that the ignored states are uniformly parameters or uniformly modules.

    We may remove this check in the future if we permit mixing.
    r   Nc              3   P   K   | ]  }t        |t        j                           y wr^   )r_   r   r   rb   r1   s     rI   rd   z(_check_ignored_states.<locals>.<genexpr>W  s     UUE2<<8Ure   c              3   P   K   | ]  }t        |t        j                           y wr^   r_   r   Moduler   s     rI   rd   z(_check_ignored_states.<locals>.<genexpr>X  s     S5*UBII6Sre   )keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c              3   P   K   | ]  }t        |t        j                           y wr^   r   r   s     rI   rd   z(_check_ignored_states.<locals>.<genexpr>a  s     LE:eRYY/Lre   z>ignored_modules expects nn.Module list elements but got types )rg   rh   sortedrU   reprr:   )r   r   
all_paramsall_modulesr1   sorted_typess         rI   r   r   L  s     >aUnUU
SNSS+!N"K54;"KQUVL**69  #.z L^LL!N"K54;"KQUVL%(  M #L #Ls   B;C ignored_params	device_idc                 6   d}|1t        |t        j                        r|nt        j                  |      }|t        ||      D ]|  }|j                  j                  dv r||j                  }+|j                  j                  |j                  k7  sOt        d|j                   d|j                  j                          |xs t        j                  j                         }|j                  dk(  rt        d      t        j                  |      | _
        | S )a<  
    Determine device handle used for initializing FSDP.

    If a device is specified by ``device_id``,
    then returns device handle corresponds to that device type. Otherwise, If the
    module is already on a non-CPU device, then the device type is that non-CPU device type.
    If the module is on CPU or meta, then the device type is the current accelerator device.
    See the :ref:`Accelerators<accelerators>` for details.


    This method will be called once ignored paramters was determined, as the device handle maybe needed
    for other initialization.
    N>   cpumetazLFSDP does not support modules with different device types but got params on z and r   zOFSDP needs a non-CPU accelerator device, but no accelerator device is detected.)r_   torchdevice_get_orig_paramsrU   RuntimeError_C_get_acceleratorr   from_devicerR   )r1   r   r   r   determined_deviceparams         rI   _init_device_handler   i  s   (  )U\\2 i( 	
  %fn= 
	E||  O3 ($)LL!<<$$(9(>(>>&-->-C-C,DE%,,J[J[I\^ 
	 .L1J1J1L!!U*a  -889JKELrK   c                     t        |      | _        i }|j                         D ]  \  }}t        |      }|j                  ||<   ! || _        | S r^   )_get_buffer_names_buffer_namesnamed_buffersr   dtype_buffer_name_to_orig_dtype)r1   r   r   buffer_namebuffers        rI   _init_buffer_stater     s_    
 ,F3E
 :<%335 ?V'428,,";/? (BE$LrK   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitc                    | j                   dk(  rO|t        j                  k7  r+t        j                  d|xs t        j
                   d       t        j                  }n/|t        j                  k(  rt        j                  dt        d       |xs t        j
                  | _        |xs
 t               | _	        |5t        j                  j                  dt        | j                                t        j                  j!                  t"        d      d	k(  | _        |xs
 t'               | _        || _        || _        t.        j0                  | _        d | _        t7               | _        t;        j<                         | _        tA        jB                  | j>                  ||      | _"        d | _#        i }|| _$        d }	|	| _%        g }
|
| _&        | S )
NrM   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.zoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   )
stacklevelz'torch.distributed.fsdp.mixed_precision. 1)'rA   r&   NO_SHARDwarningswarn
FULL_SHARDFutureWarningr3   r%   r   r   r   _log_api_usage_oncestrosenvirongetr   _use_full_prec_in_evalr"   r   r   _use_orig_paramsr   IDLEtraining_state_is_rootr    _free_event_queuer`   get_debug_level_debug_levelexec_order_utils_ExecOrderData_exec_order_data_unshard_event_fully_sharded_module_to_handle_handleparams)r1   r3   r   r   r   r   r   r   r   r   r   s              rI   _init_core_stater     s    1 0 9 99MMA$C(8(C(CD E''
 -55	.77	7< 	
 0N3C3N3NE+?~/?E"$$5c%:O:O6P5QR	
 	

2B73> 
  $3z|E/E,E(--EEN-/E--/E-<<E
  E IK#,KE) *.GEM"$FELLrK   c                 f    g }|| _         g }|| _        g }|| _        d| _        d | _        d | _        | S )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handles_sync_gradients
_comm_hook_comm_hook_state)r1   r   r   r   s       rI   _init_runtime_stater     sK     8:&?E#24!5E35"7E EE!ELrK   backward_prefetchforward_prefetchc                 "    || _         || _        | S r^   )r   r   )r1   r   r   s      rI   _init_prefetching_stater     s     0E-E LrK   c                     t        j                  |      }|r+|| j                  k7  rt        | j                        | _        | S d | _        | S r^   )r   get_root_meshr=   r*   rR   _fsdp_extension)r1   r5   	root_meshs      rI   _init_extensionr     sO      --k:I yE$6$66 1%2F2F G
 L !%LrK   c                     t         j                  | _        t               }t	               | _        || _        i }|| _        | S r^   )r(   FULL_STATE_DICT_state_dict_typer$   r#   _optim_state_dict_config_state_dict_config_unshard_params_ctx)r1   state_dict_configunshard_params_ctxs      rI   _init_state_dict_stater     s?    *::E)<)>%=%?E"0E57 2ELrK   r   c                     |D ]O  }t        |j                        dk(  sd}| j                         D ]  \  }}||u s|} n |sJ t        d| d       y)z
    Verify if the parameters are accepted by FSDP. The only restriction now
    is that the parameter cannot be a scalar tensor (param.shape == []).
    r   r   z.FSDP doesn't support salar parameters. Change z& to a 1D tensor with numel equal to 1.N)rg   shapenamed_parametersr:   )r   r   r   
param_namenameparam_s         rI   _verify_managed_paramsr   (  s~    
  u{{q J & 7 7 9 fF?!%J :$%KM rK   fully_sharded_moduleparam_init_fnsync_module_statesc                 r    t        | j                  |       t        | j                   j                        }t        | j                   j                        \  }}|s|r|t        || j                         nA|r#t        || j                   j                         n|rt        j                  | fd        j                  D 	ch c]  }|j                         D ]  }	|	  }
}}	t        | j                  |
|       t        | j                  | j                   j                         _        t        t!        | j                              }t#        ||       |r@t%        || j&                          j(                  t*        v rt%        || j,                         t/         ||        S c c}	}w )zHInitialize a ``FlatParamHandle`` from a module ``fully_sharded_module``.c                 >    t        |       d u xr | j                  vS r^   )r   r   )	submoduler1   s    rI   <lambda>z0_init_param_handle_from_module.<locals>.<lambda>Z  s(    '=i'HD'P (8!7!77 rK   )check_fn)_check_single_device_moduler   _get_device_from_device_idr?   rR   _need_to_materialize_moduler   _materialize_with_param_init_fn_materialize_meta_moduler.   materialize_modulebuffers_move_module_to_device_get_compute_devicecompute_devicer   r   r   _sync_module_params_and_buffersr2   r3   r;   rB   _init_param_handle_from_params)r1   r   r   r  r  device_from_device_idis_meta_moduleis_torchdistX_deferred_initignored_moduler   ignored_buffersmanaged_paramss   `           rI   _init_param_handle_from_moduler  ;  s      4e6K6KYW65::u33 3Ne33U5K5K3/N/ 	5=;T' -1G1G	
 
  ""  		
 
%(( 8	
 $44$,,.  	O  	 /

E *+?AVAVWXN/@' .%2E2E	
 ""&@@+$ne6J6J #5.:NOL?s   F3c                    t        |      dk(  ry t        ||| j                  t        | j                     | j
                  j                  | j                  j                  | j                  j                  | j                  j                  | j                  | j                  | j                        }|j                          | j                  rJ | j                   j#                  |j$                         || _        || j&                  |j(                  <   t+        j,                  d      }| j
                  j                  r,|j$                  j,                  |k7  r|j/                  |       y y y )Nr   )fsdp_extensionr   )rg   r   r  SHARDING_STRATEGY_MAPr3   r   offload_paramsr   param_dtypereduce_dtypekeep_low_precision_gradsr2   r   r   shardr   r   append
flat_paramr   _fully_sharded_moduler   r   flat_param_to)r1   r   r   handle
cpu_devices        rI   r  r    s%    6{ae556(())**66,,F LLN}}	LL))*EMJPE))&*F*FGe$J''F,=,=,D,D
,RZ( -S'rK   root_moduler   c           	         d}	 |t        |      n	t               }|D ]V  }t        |t        j
                  j                        st        |dt        |       z         t        |      sMt        d       | j                         D ])  }t        j                  |      r|j                  |       + |D ch c]3  }|j                         D ]  }t        |t        j                        s|  5 }}}| |v rt        j                   d        | j                         D ]9  }t        |      }	|	t#        |	d      sJ |j%                  |	j&                         ; |S # t        $ r }t        |dt        |       z         |d}~ww xY wc c}}w )ah  
    Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

    Return the modules contained in their module
    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
    already-computed ignored modules are included.

    ``_ignored_modules`` represents the argument passed by the user to FSDP.
    z>`ignored_modules` should be an iterable of `torch.nn.Module`s Nzbut got zbut got an iterable with z1`ignored_modules` should not include FSDP moduleszTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: r   )set	TypeErrorrU   r_   r   r   r   r   r:   modulestraversal_utils_composableadd	fsdp_fileFullyShardedDataParallelr   r   hasattrupdater   )
r)  r   
msg_prefixignored_root_moduleser   childr   r  optional_fsdp_states
             rI   r   r     s    RJQ%5%AC !su 	
 ' R&%((//2J+DT&\N)SSTT!&) PQQR %%' -**62 $$V,- +^^% %!C!CD 	O  o%228;	
 !((* I	4Y?*.0BCCC""#6#G#GH	I
 I  Q
x5E0F/G%HHIqPQ$s   E 78F	F%F  Fr   c                    t               }|D ch c]%  }|j                         D ]  }t        |      r| ' }}}|j                  |       |,|D ch c]  }t        |      r| }}|j                  |       | j	                         D ]9  }t        |      }	|	t        |	d      sJ |j                  |	j                         ; |S c c}}w c c}w )z
    Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

    :class:`FlatParameter` s are excluded from the result.
    r   )r+  
parametersr   r4  r-  r   r3  r   )
r)  r   r   all_ignored_paramsmpparams_in_ignored_modulesparams_in_ignored_parametersr  r9  s
             rI   r   r     s     36% #!ALLN!'(BTUVBW!	! ! 78%)(
1CA1FA(
$ (
 	!!">? !((* K	4Y?*.0ABBB%%&9&I&IJ	K '!(
s   #C
C
C%Cc           	         t               }|D ch c]  }|j                         D ]  }|  }}}|j                  | j                         D ch c]  \  }}||v rt	        |       c}}       | j                         D ]9  }t        |      }|t        |d      sJ |j                  |j                         ; |S c c}}w c c}}w )z6Return the cleaned buffer FQNs in ``ignored_modules``.r   )	r+  r  r4  r   r   r-  r   r3  r   )	r)  r   all_ignored_buffer_namesr=  r   buffers_in_ignored_modulesr   r  r9  s	            rI   r   r     s    
 *- ("aiik",2"" " ## (3'@'@'B	
#V33 k*	
 !((* W	4Y?*.0GHHH$++,?,U,UV	W $#'"
	
s   B>C
c                 f    | j                         D ch c]  \  }}t        |       c}}S c c}}w )zrReturn the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`.)r   r   )r)  r   rn   s      rI   r   r     s5     >I=V=V=X+9;+&  s   -c                     t        | |      D ch c]  }|j                   }}t        |      dk(  r%t        j                  d      |v r|t	        d      yt        |      dkD  rt	        d|       yc c}w )z
    Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

    Thus, after this method, the
    module must be either fully on the CPU or fully on a non-CPU device.
    r\   r   NzTTo support a module with both CPU and GPU params, please pass in device_id argument.rM   z;FSDP only supports single device modules but got params on )r   r   rg   r   r   )r   r   r   r   devicess        rI   r  r    s     *:&.)QRu||RGR 7|qU\\%0G;5  
 
W	I'S
 	
 
 Ss   A4r?   device_handlec                 ^   | yt        | t        j                        r| nt        j                  |       }|j                  dk7  ri|j                  ]t        j                  d|  d| d|j                          d|j                   d	       t        j                  |j                               }|S )z
    Return a ``torch.device`` for the specified ``device_id``.

    Processes ``device_id`` and returns either the corresponding device or
    ``None`` if ``device_id`` is ``None``.
    Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z6. If this is incorrect, please explicitly call `torch.zk.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.)r_   r   r   rU   indexr   r   current_device)r   r?   rG  r   s       rI   r	  r	  <  s     	5<<8	ell9>U  {{e 409f 00=0L0L0N/O PCCI;;- P11	
 m::<=MrK   c                    t        t        | |            }t        d |D              }| j                         D ]-  }||v r|j	                  d      D ]  }||j
                  z  } / | xr t        xr t        d |D              }||fS )z
    Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

    At most of the returned bools can
    be ``True``. If either is ``True``, then ``module`` needs to be
    materialized.
    c              3   4   K   | ]  }|j                     y wr^   )is_metarb   r   s     rI   rd   z._need_to_materialize_module.<locals>.<genexpr>f  s     C5Cs   Frecursec              3   F   K   | ]  }t        j                  |        y wr^   )r/   is_fakerN  s     rI   rd   z._need_to_materialize_module.<locals>.<genexpr>r  s     @U#@s   !)r   r   anyr-  r  rM  _TORCHDISTX_AVAIL)r   r   r   r  r  r  bufr  s           rI   r
  r
  Y  s     *6>BCNCNCCN ^^% *	'$$U$3 	*Cckk)N	**  	A	A@@@  
 666rK   c                     t        |      st        d| dt        |             t        | |      }|D ]
  } ||        y )Nz	Expected z to be callable but got )callabler:   rU   _get_modules_to_materialize)r)  r  r   modules_to_materializer   s        rI   r  r  w  sV    
 M"&>tM?R>ST
 	
 9oV( frK   r  c           	      :   |xs# t        j                  |j                               }t        | |      }d }	 t        j                         5  |D ]u  }t        j                  |j                  d      |j                  d            }t        t        |            dkD  }|sS|j                  |d       |j                          w 	 d d d        y # 1 sw Y   y xY w# t        $ r5}	t        j                  dt!        |	       dt#        |       d       |	d }	~	ww xY w)NFrO  r   )r   rP  zIUnable to call `reset_parameters()` for module on meta device with error z(. Please ensure that your module oftype z* implements a `reset_parameters()` method.)r   r   rJ  rX  no_grad	itertoolschainr;  r  rg   r   to_emptyreset_parametersBaseExceptionr   r   r   rU   )
r)  r  r   rG  materialization_devicerY  r   module_state_iterhas_module_statesr7  s
             rI   r  r    s    3 ell$$&7 9oVF ]]_ 
	.0 	. %.OO%%e%4fnnUn6S%! %(->(?$@1$D!$OO+A5OQ++-	.
	. 
	. 
	.  !!$Q )L>!KM	

 s<   C AC!%CC CC C 	D%0DDc                 "   g }t        j                  | g      }| h}|rq|j                         }|j                  |       |j	                         D ]:  }||vst        |      ||vs|j                  |       |j                  |       < |rq|S r^   )collectionsdequepopleftr#  childrenr   r0  )r)  r   rY  queuevisited_modulesr   child_modules          rI   rX  rX    s    
 /1{m,E'2mO
%%f-"OO- 	+LO3*<8@ 7##L1\*	+  "!rK   r  c                    t        j                  d      |	t        j                         }|j	                  |        g }g }|r|j                         }|j                  fd|j                  d      D               |j                  fd|j                  d      D               |j                         D ].  }t        |t        j                        r|j	                  |       0 |r|D 	cg c]	  }	|	|vs|	 }
}	|D 	cg c]	  }	|	|vs|	 }}	t        |
||       yt        t        | |      d      }||j                  k(  rt!                yyyc c}	w c c}	w )a  
    Move ``module`` depending on ``device_from_device_id`` and its current device.

    This includes moving ignored modules' parameters.

    - If ``device_from_device_id`` is not ``None``, then this moves
    ``module`` to the device.
    - If ``device_from_device_id`` is ``None``, then this does not move
    ``module`` but warns the user if it is on CPU.

    Precondition: ``_check_single_device_module()``.
    r   Nc              3   @   K   | ]  }|j                   k(  r|  y wr^   r   )rb   r   r(  s     rI   rd   z)_move_module_to_device.<locals>.<genexpr>  s%      <<:-    FrO  c              3   @   K   | ]  }|j                   k(  r|  y wr^   rn  )rb   r   r(  s     rI   rd   z)_move_module_to_device.<locals>.<genexpr>  s%      ==J. ro  )r   r   re  rf  r#  rg  extendr;  r  rh  r_   r1  r2  _move_states_to_devicenextr   _warn_cpu_init)r   r   r  r  ri  r   r  curr_moduler  r>  params_to_movebufs_to_mover   r(  s                @rI   r  r    s\   $ e$J( #."3"3"5V%'&(--/K
 MM (33E3B 
 NN )11%1@ 
 )113 ,	!)Y-O-OPLL+,! & &,Gq/F!GG#*Gaa.FGG~|=RS!&.94@EU\\Z7 8 HGs   1	E;E	EEr  c                 6   t        |       dk(  rt        |      dk(  ryt        |       dkD  r| d   j                  }nt        |      dkD  r|d   j                  }t        j                  d      }|| D ]k  }t        j                         5  |j	                  |      |_        |j                  *|j                  j	                  |      |j                  _        ddd       m |D ]  }|j	                  |      |_         y|k(  rt                yy# 1 sw Y   xY w)z
    Move states to the specified device.

    Precondition: ``_check_single_device_module()`` and module's parameters and
    buffers have been materialized if needed.
    r   Nr   )rg   r   r   r[  todatagradrt  )r   r  r  rJ  r(  r   r   s          rI   rr  rr    s    6{aCLA-
6{Q))	W	 **e$J(  	KE K"XX&;<
::)&+jjmm4I&JEJJOK K	K
  	;F ))$9:FK	;	:	% 
&K Ks   	ADD	c                  .    t        j                  d       y )Nam  The passed-in `module` is on CPU and will thus have FSDP's sharding initialization run on CPU, which may be slower than on GPU. We recommend passing in the `device_id` argument for FSDP to move `module` to GPU for the sharding initialization. `module` must also be on GPU device to work with the `sync_module_states=True` flag since that requires GPU communication.)r   r    rK   rI   rt  rt    s    MM	1rK   c                     t        t        | |      d      }|&|j                  j                  dk7  r|j                  }n#t	        j                  |j                               }|||k7  rt        d| d| d|       |S )a)  
    Determine and return this FSDP instance's compute device.

    If the module is already on a non-CPU device, then the compute device is that non-CPU
    device. If the module is on CPU, then the compute device is the current
    device.

    Since this method should be called after materializing the module, any
    non-CPU device should not be meta device. For now, the compute device is
    always a CUDA or CUDA-like device with its explicit index.

    Precondition: ``_check_single_device_module()`` and
    ``_move_module_to_device()``.
    Nr   z4Inconsistent compute device and `device_id` on rank z: z vs )rs  r   r   rU   r   rJ  r:   )r   r   r  r?   rG  r   r  s          rI   r  r    s    * !&.94@EU\\..%7m&B&B&DE(^?T-TB4&d#8"9;
 	
 rK   c                 ~   g }| j                         D ]  }t        |t        d      rt        |t        d       |j	                         }t        |      r>|j                         \  }}|D cg c]  }t        ||       }	}|j                  |	       |j                  |        |D ]l  }
|
j	                         }t        |      r>|j                         \  }}|D cg c]  }t        ||       }}|j                  |       \|j                  |       n t        |       t        ||t        d       yc c}w c c}w )z
    Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
    been set.
    FTr   )srcN)r  getattrFSDP_SYNCEDsetattrdetachr,   __tensor_flatten__rq  r#  +_check_module_states_for_sync_module_statesr+   PARAM_BROADCAST_BUCKET_SIZE)r   r   r2   module_statesr   detached_bufferattrsrn   attrinner_buffersr   detached_paraminner_paramss                rI   r  r  B  s*    )+M.." 6v{E2FK.$mmoO,_= +==?qLQ RD$!? R R$$]3$$_56  1(8%88:HE1FKLdGND9LLL  .  01 0>#	 !S Ms   +D5D:r  c                 D    | rt        d | D              rt        d      y y )Nc              3   `   K   | ]&  }|j                   t        j                   d       k(   ( yw)r   N)r   r   )rb   tensors     rI   rd   z>_check_module_states_for_sync_module_states.<locals>.<genexpr>q  s'      17e,,s   ,.zThe module has CPU parameters or buffers when `sync_module_states=True`, which requires them to be on GPU. Please specify the `device_id` argument or move the module to GPU before passing it to FSDP.)rS  r:   )r  s    rI   r  r  n  s7      ;H  C
 	
}rK   c              #      K   | j                         }	 	 t        |      }||vrt        |      s| # t        $ r Y yw xY ww)aD  
    Return an iterator over the original parameters in ``module``.

    The iterator does not return
    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
    present due to nested FSDP wrapping), or any original parameters already
    flattened (only relevant when ``use_orig_params=True``).
    N)r;  rs  r   StopIteration)r   r   	param_genr   s       rI   r   r   {  sT      !!#IOEN*3Ee3L   s   A 4 	A AA  Ac           	          t        |       D ]A  \  }}||vst        |      rt        d| d|j                          d|j                          y)a5  
    Check that original parameters in ``fsdp_module`` have been flattened.

    The flattened parameters are made
    invisible to ``named_parameters()`` for the module hierarchy rooted at
    ``fsdp_module``. This should be called as a sanity check after flattening
    the wrapped module's parameters.
    z Found an unflattened parameter: z;  N)r   r   r   r@   	__class__)fsdp_moduler   r   r   s       rI   _check_orig_params_flattenedr    s^     ?{K 
E&/A%/H2:,b::<.%//!24 rK   c                 h    | t         j                  k(  rt        j                  S t        j                  S r^   )r&   r   r   allreduce_hookreduce_scatter_hook)r3   s    rI   _get_default_comm_hookr    s3      0 9 99 	$$ ..rK   c                 .    t        j                  |       S )NrN   )r   rC   rN   s    rI   rV   rV     s     %%MBBrK   r^   )re  r\  r   r   typingr   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   torch.distributeddistributedr`   (torch.distributed.fsdp._exec_order_utilsfsdp_exec_order_utilsr   'torch.distributed.fsdp._traversal_utils_traversal_utilsr.  2torch.distributed.fsdp.fully_sharded_data_parallelfully_sharded_data_parallelr1  torch.nnr   (torch.distributed.algorithms._comm_hooksr   torch.distributed.device_meshr   r   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   %torch.distributed.fsdp._limiter_utilsr    torch.distributed.fsdp.apir!   r"   r#   r$   r%   r&   r'   r(   torch.distributed.fsdp.wrapr)   &torch.distributed.tensor.parallel.fsdpr*   torch.distributed.utilsr+   torch.utils._python_dispatchr,   torch.utils.hooksr-   rT  
torchdistxr.   r/   ImportErrorintr  r  ra   HybridShardProcessGroupTypeProcessGroupTyper   r   SHARD_GRAD_OPHYBRID_SHARD_HYBRID_SHARD_ZERO2r  r;   #NO_RESHARD_AFTER_FORWARD_STRATEGIESrJ   r<   boolrT   rO   ro   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r	  r
  r  r  rX  Tensorr  rr  rt  r  r  r  r   r  r  rC   rV   r}  rK   rI   <module>r     s
     	    "    C C A A F F  B E A    B	 	 	 0 D < F 1 . ""34 #D$5$5t7H7H$HI E$"3"35P"PQR 
 5>>!7!B!B""$:$H$H!!#9#F#F((*@*T*T  !!(( 
 ""((' #  )-00#0 (0 W	0
 *%0 0 0f ((#( ( 	( (V # $   IJ I4 I I  ARAR   !++!! 
! !H++ 4d///0&  	++II+ huxx78+ %((,,-.%((//9R0SS	+ + +\I9=	: --II- %- c5<</01	-
 - -` II  " >> 01> n-> *%	>
 > > !>  > > >B    		'	 	 		 	 : J *   *   299 d2<<6H T & AA))A c5<</01A Hbii[$%678	A
 A A AH ))) ))) )<66x896 	^6x BF) !%((*<*<!=> 				D$$)$ 	X$:299 S 
II
%
 c5<</01
 
	
<c5<</01
 % ell	:7II7%7 ^7 4:	7<RYYK-. ^ 
	  #ELL1  ^  %	 F""-0^"	"))_",3II3%3 &3 $ELL1	3
 
3l%,, $ELL1 
	@II% $ELL1 	
 % \\F)II)) $$) 
	)X

%

	

II% bll,% 
(.> C$$CCO#  s   ^ ^$#^$