
    sg              	          d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dl mZ d dl	m
Z
 d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dl m!Z! d dl"m!c m#Z$ d d	l%m&Z& d d
l'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7m8Z8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZAmBZBmCZC d dlDmEZEmFZFmGZGmHZH d dl mIZImJZJ d dlKmLZM d dlNmOZOmPZPmQZQmRZR d dlSmTZTmUZU d dlVmWZW  G d de      ZX G d de      ZY G d de!j                  e      Z[d e!j                  d!ej                  d"efd#Z]	 	 djd e!j                  d$e^fd%Z_dkd&Z`d' Zad( Zbdld e!j                  d)e^fd*Zcd e!j                  d+e^fd,Zdd e!j                  d-e^fd.Ze G d/ d0      Zf G d1 d2e[      Zg G d3 d4e[      Zh G d5 d6eh      Zi G d7 d8eh      Zj G d9 d:e[      Zk G d; d<ek      Zl G d= d>e!j                        Zm G d? d@eh      Zn G dA dBe!j                        Zo G dC dDe!j                        Zq G dE dFe!j                        Zre j                  dGefdH       Zte j                  dIefdJ       Zue j                  dKefdL       Zvee j                  dMefdN              Zwee j                  dOefdP              Zxee j                  dQefdR              Zyee j                  dSefdT              ZzdUed"edVedWefdXZ{	 dmdYe!j                  dZe!j                  d[ee|d\f   fd]Z} G d^ d_eP      Z~ G d` daeO      Zdndbee   fdcZ G dd dee!j                        Z G df dge!j                        Z G dh die!j                        Zy)o    N)ABCabstractmethod)nullcontext)deepcopy)autoEnumwraps)	AnyCallableDictListno_type_checkOptionalTupleTypeUnion)mock)
checkpoint)fully_shard)FSDPParamGroupRegisterPostBackwardFunction)
DeviceMesh)
CPUOffloadFullyShardedDataParallel)TrainingState)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_ms)
has_tritonc                   (    e Zd Z e       Z e       Zy)FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE     V/var/www/html/venv/lib/python3.12/site-packages/torch/testing/_internal/common_fsdp.pyr7   r7   C   s    fGIr>   r7   c                   6    e Zd Z e       Z e       Z e       Zy)CUDAInitModeN)r8   r9   r:   r   CUDA_BEFORE
CUDA_AFTER
CUDA_NEVERr=   r>   r?   rA   rA   L   s    &KJJr>   rA   c                       e Zd ZdZedeej                  df   fd       Zedej                  fd       Z	edd       Z
eeded	edej                  fd
              Zy)FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                      y)z+Returns an input for the model as as tuple.Nr=   selfdevices     r?   	get_inputzFSDPTestModel.get_inputY        	r>   c                      y)z,Returns the loss given the input and output.Nr=   )rJ   inputoutputs      r?   get_losszFSDPTestModel.get_loss^   rM   r>   Nc                      y)z<Runs the backward pass (e.g. including ``loss.backward()``).Nr=   rJ   losss     r?   run_backwardzFSDPTestModel.run_backwardc   rM   r>   argskwargsc                       y)z&Initializes an instance of this model.Nr=   )rV   rW   s     r?   initzFSDPTestModel.inith   s     	r>   rG   N)r8   r9   r:   __doc__r   r   torchTensorrL   rQ   rU   staticmethodr   nnModulerY   r=   r>   r?   rF   rF   U   s     5s):#;        C 3 299   r>   rF   modelprocess_group	assert_fnc                 ,   | j                         D cg c]%  \  }}||j                         j                         f' }}}|| j                         D cg c]%  \  }}||j                         j                         f' c}}z  }t	        j
                  |      }t        |      D 	cg c]  }	d }
}	t	        j                  |
||       |
d   }|J |
dd D ])  }|J t        ||      D ]  \  \  }	}\  }	} |||        + yc c}}w c c}}w c c}	w )a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    Ngroupr      )	named_parametersdetachcpunamed_buffersdistget_world_sizerangeall_gather_objectzip)ra   rb   rc   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  r?   _assert_module_statesr}   o   s6    "'!7!7!9J 
U\\^'')*  #(#6#6#8K 
fmmo))+,  $$]3J ,-aT-E-5"5]K8L###qr     #L% 8 	GQWab"	
 .s   *D*D'	Dzero_buffersc                    |rt        j                  |       n	t               }|5  | j                         D ]/  }t	        j
                         5  |j                          ddd       1 |rB| j                         D ]/  }t	        j
                         5  |j                          ddd       1 ddd       y# 1 sw Y   xY w# 1 sw Y   PxY w# 1 sw Y   yxY w)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersr\   no_gradzero_buffers)ra   r~   summon_fullctxrr   ru   s         r?   _zero_modelr      s     -8$
!
!%
([]C	 #%%' 	E  	 --/ #]]_ #LLN# ### # # ## #s;   (CB43CC !
C4B=9C C	CCc                 j    |s| j                         } |r| j                          | j                         S N)cudahalf
state_dict)ra   cpu_offloadr   s      r?   _get_state_dictr      s+    



r>   c           	      j    dj                  |D cg c]  }|| t        |         nd c}      S c c}w )Nrw   none)joinstr)test_name_mappingrV   ss      r?   subtest_namer      s7    88IMNAam	3q6	"	?N Ns   0c                 @   |j                         D ];  \  }}|j                  t        j                  d      k7  s)|j                         ||<   = | dk(  r|nd g}t	        j
                  |       |d   }|j                         D ]  }||   j                         ||<    |S )Nrj   r   )itemsrK   r\   rj   rl   broadcast_object_listkeysr   )rankr   rq   rr   rx   s        r?   _broadcast_state_dictr      s     (--/ 1
E<<5<<..%*YY[Jz"1  19Z$/Eu%qJ oo' ?
!+J!7!<!<!>
:?r>   recursec                     t        j                  | |      5  t        t        | j	                                     cddd       S # 1 sw Y   yxY w)a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)r   r   r   listr   )ra   r   s     r?   get_full_paramsr      s?     
	 	 	8 2U--/012 2 2s   "AAmove_to_cudac                 *    |r| j                         S | S r   )r   )ra   r   s     r?   _maybe_cudar      s    '5::<2U2r>   	wrap_fsdpc                 (    |s| S t        | g|i |S r   r   )ra   r   rV   rW   s       r?   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCr>   c                   :    e Zd ZdedefdZdefdZdefdZd Zy)	DummyProcessGroupr   sizec                      || _         || _        y r   )_rank_size)rJ   r   r   s      r?   __init__zDummyProcessGroup.__init__   s    

r>   rG   c                     | j                   S r   )r   rJ   s    r?   r   zDummyProcessGroup.rank       zzr>   c                     | j                   S r   )r   r   s    r?   r   zDummyProcessGroup.size   r   r>   c                 B    t        j                         }d }||_        |S )Nc                  d    t         j                  j                         } | j                  d       | S )Nrg   )r\   futuresFuture
set_result)futures    r?   
get_futurez/DummyProcessGroup.allreduce.<locals>.get_future   s'    +0==+?+?+AFa Mr>   )r   Mockr   )rJ   rV   rW   	dist_waitr   s        r?   	allreducezDummyProcessGroup.allreduce   s"    IIK		
  *	r>   N)r8   r9   r:   intr   r   r   r   r=   r>   r?   r   r      s2    S  c c 	r>   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 	 ddej                  d
ededeeeef      dededeej(                  ef   fd       Zd Z xZS )TransformerWithSharedParamsrf   cuda_init_modeadd_bndeterministicc                    t         |           |j                         | _        |j                         | _        |rt        j                  d       d}d}t        j                  ||      | _	        t        j                  |dddd      | _        t        j                  ||      | _        | j                  j                  | j                  _        | j                  d| j                  j                  j!                  |f             | j                  d	t        j"                  | j$                  t
        j&                  
             d| _        |r)t
        j                  j+                  | j(                        nt
        j                  j-                         | _        |t0        j2                  k(  r| j5                         } |r| j7                          y y )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   rv   r\   manual_seedr_   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrA   rB   r   eval)rJ   rf   r   r   r   d_vocabr   	__class__s          r?   r   z$TransformerWithSharedParams.__init__   s^    	JJL	**,a LL':>>  
 99Wg6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R\55599;DIIK r>   c                 ,   t        j                  d| j                  z          t        j                  d|      j	                  d| j
                        }t        j                  | j
                  dz  |      j	                  d| j
                        }||fS )Nrg      rK         )r\   r   r   arangeviewr   )rJ   rK   srctgts       r?   rL   z%TransformerWithSharedParams.get_input  sl    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGSzr>   c                    | j                  |      }|| j                  z   | j                  j                  |      z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  |      S r   )r   r   r   type_asr   r   r   )rJ   src_idstgt_idsr   r   xs         r?   forwardz#TransformerWithSharedParams.forward  sv    (DOO#d&6&6&>&>s&CC(ggclS#&""r>   c                     |\  }}t         j                  j                  |j                  d|j	                  d            |j                  d      d      S )Nsum)	reduction)r_   
functionalcross_entropyr   r   )rJ   rO   rP   rw   r   s        r?   rQ   z$TransformerWithSharedParams.get_loss%  sI    3}}**KKFKKO,chhrle + 
 	
r>   c                 $    |j                          y r   backwardrS   s     r?   rU   z(TransformerWithSharedParams.run_backward+      r>   fsdp_init_modefsdp_kwargsrG   c                 D   |i }|t         j                  k(  r&t        | t              r| d   }n| }t	        ||||      S |t         j
                  k(  rd|vrt        t        t        h      }n|j                  d      }d|v r8|d   t        j                  t        j                  hv rt        | t              sd}n| }t        | t              r| d   }	n| }	t	        |	|||      }
t        |
|fd|i|}|t        j                  k(  r|j!                         }|S t#        d|       )ao  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )r7   r;   
isinstancetupler   r<   r#   r-   r,   popr    HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rA   rC   r   
ValueError)rf   r   r   r   r   r   pgr   fsdp_pg
tformer_pgm
fsdp_models               r?   rY   z TransformerWithSharedParams.init.  sT   6 K\111%'1X.NFM  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%0%'"1X
"
+NFMA  "2 	J !8!88'__.
77GHIIr>   c                     | j                   gS r   )r   r   s    r?   get_ignored_modulesz/TransformerWithSharedParams.get_ignored_modules|  s      !!r>   )NFT)r8   r9   r:   rl   ProcessGrouprA   boolr   rL   r   rQ   rU   r^   r7   r   r   r   r   r   r_   r`   r   rY   r  __classcell__r   s   @r?   r   r      s    (  ( %( 	(
 (T#
 
 15#KJ  KJ$KJ %KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ"r>   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 ddej                  d
ededeeeef      dedej&                  fd       Z xZS )NestedWrappedModulerf   r   r   r   c                    t         |           j                         | _        j                         | _        |t
        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        t        j                  dd      |            t        t        j                  dd      |            | _        y )Nc                 &    rt        | fi S | S r   r   layerr   rf   r   s    r?   _maybe_wrapz1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88Lr>   r   r   r   r   )r   r   r   r   rv   rA   rB   r\   r   r_   
Sequentialr   r   module	rJ   rf   r   r   r   r   r   r  r   s	    ``  `  r?   r   zNestedWrappedModule.__init__  s     	JJL	**,%)A)AA	
 a mm		!Q6BIIa,<l KL		"b 1<@ BIIb!$4lCD		!Q6

r>   c                 x    t        j                  d| j                  z          t        j                  dd|      fS )Nrg   r   r   r   )r\   r   r   randrI   s     r?   rL   zNestedWrappedModule.get_input  s.    !dii-(

1a/11r>   c                 $    | j                  |      S r   r  rJ   r   s     r?   r   zNestedWrappedModule.forward      {{1~r>   c                 &    |j                         }|S r   )r   rJ   rO   rP   rT   s       r?   rQ   zNestedWrappedModule.get_loss  s    zz|r>   c                 $    |j                          y r   r   rS   s     r?   rU   z NestedWrappedModule.run_backward  r   r>   r   r   rG   c                     |i }|t         j                  k(  rt        | d||      S |t         j                  k(  r5t        | fd||d|}|t        j
                  k(  r|j                         }|S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        Fr   r   r   Tr  )r7   r;   r  r<   rA   rC   r   r  )rf   r   r   r   r   r  s         r?   rY   zNestedWrappedModule.init  s    . K\111&-+	  |555,-+	
 J !8!88'__.
77GHIIr>   NF)r8   r9   r:   rl   r  r  rA   r   rL   r   rQ   rU   r^   r7   r   r   r   r   r_   r`   rY   r  r  s   @r?   r  r    s    
  
 
 %	

 
@2 
 15#+J  +J$+J %+J d38n-	+J
 +J 
+J +Jr>   r  c                   h     e Zd Ze	 	 ddej
                  dededee	e
ef      def
 fd       Z xZS )AlwaysWrapNestedWrappedModulerf   r   r   r   r   c                 &   t         t        t          	 | t        j                  |||      }|t        j                  k(  r|S |t        j
                  k(  r=|xs i }t        |fdt        i|}|t        j                  k(  r|j                         }|S y)z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )rf   r   r   r   r   r   N)r   r,  rY   r7   r;   r<   r   r"   rA   rC   r   )rf   r   r   r   r   ra   r  r   s          r?   rY   z"AlwaysWrapNestedWrappedModule.init  s     )+H
'//)#'  
 	 \111L|555%+KeX6HXKXJ!8!88'__.
 6r>   r*  )r8   r9   r:   r^   rl   r  r7   rA   r   r   r   r   r  rY   r  r  s   @r?   r,  r,    s^    
 15#  $ % d38n-	
  r>   r,  c                        e Zd Zdej                  dededef fdZed
d       Z	e	 	 ddej                  de
dedeeeef      def
d	       Z xZS )NonUniformReqGradNWMrf   r   r   r   c                    t         t        |           j                         | _        j	                         | _        |t        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        j                  t        t        j                  dd      |      t        t        j                  dd      |                        | _        y )Nc                 &    rt        | fi S | S r   r   r  s    r?   r  z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap  r  r>   r   r   r   r   )r   r  r   r   r   rv   rA   rB   r\   r   r_   r  r   r   r  r  s	    ``  `  r?   r   zNonUniformReqGradNWM.__init__  s     	!413 JJL	**,%)A)AA	
 a mm		!Q6BIIa,<l KL		"b 1<@ 		"a 0,?		!Q>
r>   c                     | j                         D ]-  \  }}t        j                  ||      r|j                  d       / y r*  )rh   rematchrequires_grad_)ra   req_grad_masknps       r?   _set_nonuniform_req_gradz-NonUniformReqGradNWM._set_nonuniform_req_grad+  s:    **, 	(DAq88M1-  '	(r>   r   r   c                    t        j                  d      }|t        j                  k(  r't	        | d||      }t        j                  ||       |S |t        j                  k(  rO|i }t	        | fd||d|}|t        j                  k(  r|j                         }t        j                  ||       |S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr)  Tr  )r3  compiler7   r;   r/  r9  r<   rA   rC   r   r  )rf   r   r   r   r   req_grad_pattern	ddp_modelr  s           r?   rY   zNonUniformReqGradNWM.init1  s    ( ::&9:\111,-+	I !99)EUV|555" --+	
 J !8!88'__.
 99*FVW77GHIIr>   rZ   r*  )r8   r9   r:   rl   r  r  rA   r   r^   r9  r7   r   r   r   r   rY   r  r  s   @r?   r/  r/     s    (
  (
 (
 %	(

 (
T ( (
 
 15#+J  +J$+J %+J d38n-	+J
 +J +Jr>   r/  c                        e Zd ZdZdej
                  dedef fdZd Zd Z	d Z
d	 Zed
ee   dedededef
d       Z xZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r  delay_after_loss_msdelay_before_reduction_msc                 L    t         |           || _        || _        || _        y r   )r   r   r@  rA  r  )rJ   r  r@  rA  r   s       r?   r   zModuleWithDelay.__init__d  s'     	#6 )B&r>   c                 8    | j                   j                  |      S r   )r  rL   rI   s     r?   rL   zModuleWithDelay.get_inputo  s    {{$$V,,r>   c                 $    | j                  |      S r   r"  r#  s     r?   r   zModuleWithDelay.forwardr  r$  r>   c                     | j                   j                  ||      }| j                  dkD  r=t        j                  j                  t        | j                  t               z               |S Nr   )r  rQ   r@  r\   r   _sleepr   r4   r&  s       r?   rQ   zModuleWithDelay.get_lossu  sQ    {{##E62##a'JJc$":":=N=P"PQRr>   c                      t         j                  j                   fd}t        j                  d|      5   j
                  j                  |       d d d        y # 1 sw Y   y xY w)Nc                      j                   dkD  r=t        j                  j                  t	        j                   t               z                | i |S rF  )rA  r\   r   rG  r   r4   )rV   rW   orig_reduce_scatterrJ   s     r?   _delayed_reduce_scatterz=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter~  sL    --1

!!669J9LLM '777r>   z'torch.distributed.reduce_scatter_tensor)r\   distributedreduce_scatter_tensorr   patchr  rU   )rJ   rT   rK  rJ  s   `  @r?   rU   zModuleWithDelay.run_backward{  sW    #//EE	8 ZZ57N
 	+ KK$$T*	+ 	+ 	+s   AA'module_class
model_argsmodel_kwargsc                <    t         | j                  |i |||      S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )r?  rY   )rO  r@  rA  rP  rQ  s        r?   rY   zModuleWithDelay.init  s,    * Lz:\:%
 	
r>   )r8   r9   r:   r[   r_   r`   r   r   rL   r   rQ   rU   r^   r   rF   r   rY   r  r  s   @r?   r?  r?  `  s    F				 !	 $'		-+ 
=)

 !
 $'	

 
 
r>   r?  c                   ~    e Zd Zeej
                  ddddfdej                  dedede	e
eef      ded	ed
efd       Zy)NestedWrappedModuleWithDelayNFr   rf   r   r   r   r   r@  rA  c           
      D    t         j                  t        | ||||||      S )Nrf   r   r   r   r   r@  rA  )r?  rY   r  rV  s          r?   rY   z!NestedWrappedModuleWithDelay.init  s4     ##))#' 3&? $ 	
 		
r>   )r8   r9   r:   r^   rA   rC   rl   r  r7   r   r   r   r   r  r   rY   r=   r>   r?   rT  rT    s     (4'>'>04##$)*
  
$
 %
 d38n-	

 
 !
 $'
 
r>   rT  c                   $     e Zd Z fdZd Z xZS )DummyDDPc                 0    t         |           || _        y r   )r   r   r  )rJ   r  r   s     r?   r   zDummyDDP.__init__  s    r>   c                 &     | j                   |i |S r   r"  rJ   rV   rW   s      r?   r   zDummyDDP.forward  s    t{{D+F++r>   r8   r9   r:   r   r   r  r  s   @r?   rX  rX    s    ,r>   rX  c                        e Zd Zdej                  dedededef
 fdZd Z	d Z
e	 	 	 ddej                  d	eded
eeeef      dedefd       Z xZS )MixtureOfExpertsrf   r   r   delay_before_free_msr   c                    t         |   ||||       || _        || _        || _        |t
        j                  k(  | _        |r"t        j                  d| j                  z          d}d}d}	t        t        j                  ||      | j                        }
t        d |
j                         D              | _        |
j                         D ]	  }d|_         |rt        j                  d       t        t        j                  ||      | j                        }|rHt        j$                  j'                  |j                         g      }t)        |
|fi |}
t)        ||fi |}t        j*                  t        t        j                  |	|      | j                        ||
t        t        j                  ||	      | j                              | _        y )	N)rf   r   r   r   *   r   r   r   c              3   <   K   | ]  }|j                           y wr   )numel).0r8  s     r?   	<genexpr>z,MixtureOfExperts.__init__.<locals>.<genexpr>  s     $L1QWWY$L   Tr   )r   r   rf   r_  r   rA   rB   r   r\   r   r   r   r_   r   r   r   num_expert_paramsexpertrL  	new_groupr   r  r  )rJ   rf   r   r   r_  r   r   d_expertd_sharedd_inputrh  r8  sharedexpert_groupr   s                 r?   r   zMixtureOfExperts.__init__  s    	)'	 	 	
 
$8!"*l.F.FFb499n-RYYx:D<M<MN!$$L8I8I8K$L!L""$ 	AAH	 a RYYx:D<M<MN ,,66L &,>+>F&%7;7Fmm		'84d6G6GH		(G4d6G6GH	
r>   c                 f     j                   dkD  r j                  d   }t        |t              ret        j
                  j                  j                  j                   fd}t        j                  d|      5   j                  |      cd d d        S  j                  |      S # 1 sw Y   xY w)Nr   r   c                      t         j                  j                  t        j                  t               z                | i |S r   )r\   r   rG  r   r_  r4   )rV   rW   orig_reshardrJ   s     r?   _delayed_reshardz2MixtureOfExperts.forward.<locals>._delayed_reshard  s>    JJ%%D558I8KKL (888r>   z.torch.distributed.fsdp._runtime_utils._reshard)r_  r  r  r   r\   rL  fsdp_runtime_utils_reshardr   rN  )rJ   r   rh  rr  rq  s   `   @r?   r   zMixtureOfExperts.forward  s    $$q([[^F&$'$0055DDMM9 ZZDFV *  ;;q>* *
 {{1~* *s   ;B''B0c                    |j                          | j                  st        j                         5  | j	                         D ]v  }t        |d      r|j                  |j                  j                  | j                         t        j                  j                  |j                  | j                         x 	 d d d        y y # 1 sw Y   y xY w)Nrh  re   )r   r   r\   r   r   hasattrgraddiv_rv   rL  
all_reducerf   )rJ   rT   r8  s      r?   rU   zMixtureOfExperts.run_backward  s    ~~ O* OAq(+ vv)DOO4))44QVV4::4NOO O O Os   -CACCr   r   c                     |i }|t         j                  k(  rt        | d|||      S |t         j                  k(  r6t        | fd|||d|}|t        j
                  k(  r|j                         }|S t        d|       )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            cuda_init_mode (CUDAInitMode): Determines model movement to CUDA.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        F)r   r   r_  r   Tr  )r7   r;   r^  r<   rA   rC   r   r  )rf   r   r   r   r   r_  r  s          r?   rY   zMixtureOfExperts.init  s    4 K\111#-%9+  |555)-%9+ J !8!88'__.
77GHIIr>   )NFr   )r8   r9   r:   rl   r  r  rA   r   r   r   rU   r^   r7   r   r   r   r   rY   r  r  s   @r?   r^  r^    s    2
  2
 2
 %	2

 "2
 2
h(
O 
 15#$%0J  0J$0J %0J d38n-	0J
 0J "0J 0Jr>   r^  c                        e Zd Z	 ddddddedeej                     deded	ef
 fd
Zdej                  dej                  fdZ
d Z xZS )MLPTFr   )biaswith_bufferdim_multiplierdimrK   r~  r  r  c                
   t         |           t        j                  |||z  ||      | _        t        j                  ||z  |||      | _        |r)| j                  dt        j                  |f|             y d | _	        y )N)rK   r~  ru   r   )
r   r   r_   r   in_projout_projr   r\   randnru   )rJ   r  rK   r~  r  r  r   s         r?   r   zMLP.__init__P  so     	yyns&:6PTU		.3"6FQUV  5;;vf+MNDKr>   r   rG   c                     | j                  |      }t        j                  |      }| j                  |      }t        j                  |      }| j                  || j                  z   }|S r   )r  Frelur  ru   )rJ   r   zs      r?   r   zMLP.forwarda  sS    LLOFF1IMM!FF1I;;"DKKAr>   c                     | j                   4t        j                  j                  j	                  | j                          y y r   )ru   r\   r_   rY   normal_r   s    r?   reset_parameterszMLP.reset_parametersj  s+    ;;"HHMM!!$++. #r>   r   )r8   r9   r:   r   r   r\   rK   r  r   r]   r   r  r  r  s   @r?   r}  r}  O  sv     *.
 ! &
   " %,, /r>   r}  c                   F     e Zd Zdddedef fdZdededed	d fd
Z xZS )MLPStackF)with_seq_parallelmlp_dimr  c                    t        |d      t        |      t        |d      g}|r&|j                  t        j                  |d             t	        |   |  || _        y )N   )r  Fr~  )r}  appendr_   	LayerNormr   r   r  )rJ   r  r  modulesr   s       r?   r   zMLPStack.__init__p  sX     *L*	$
 NN2<<e<='"!2r>   tp_meshdp_meshuse_activation_checkpointingrG   c           
         t        d      t        d      t        d      t        d      t        d      | j                  rt        t        d            n	t               d}| j                  rt	        d      |d<   t        | ||       | D ]8  }t        |t        j                        r|rt        |       t        |fd	|i| : t        | fd	|i| | S )
NF)use_local_outputrg   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r(   r*   r  r'   r+   r)   r  r_   r  r   r   )rJ   r  r  r  r   r  r  s          r?   parallelizezMLPStack.parallelize|  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4WGWX 	=F&",,/+6"<W<<	= 	D6w6+6r>   )	r8   r9   r:   r   r  r   r   r  r  r  s   @r?   r  r  o  sD    BG 
3 
34 
3  '+	 
r>   r  c                        e Zd ZdZddedef fdZdej                  de	e
ej                  ej                  f   ej                  f   fdZ xZS )	DoubleLinearz
    This can be used for returning multiple outputs from a module
    (``use_second_linear=True``) or for having an unused module (``False``).
    r  use_second_linearc                     t         |           t        j                  ||      | _        t        j                  ||      | _        t        j                         | _        || _        y r   )	r   r   r_   r   lin1lin2ReLUr  r  )rJ   r  r  r   s      r?   r   zDoubleLinear.__init__  sG    IIc3'	IIc3'	GGI	!2r>   r   rG   c                     | j                   r@| j                  | j                  |            | j                  | j                  |            fS | j                  | j                  |            S r   )r  r  r  r  r#  s     r?   r   zDoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&r>   T)r8   r9   r:   r[   r   r  r   r\   r]   r   r   r   r  r  s   @r?   r  r    sT    
3C 3D 3''	uU\\5<</0%,,>	?'r>   r  new_all_gather_into_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rl   all_gather_into_tensorbarrier)r  orig_all_gathers     r?   patch_all_gatherr    sO     11OLLN"<D6&5# 	&5#   0A;A  A;!A88A;new_reduce_scatter_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rl   rM  r  )r  rJ  s     r?   patch_reduce_scatterr    sP     44LLN!:D9%8" 	%8"r  new_all_reducec              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rl   rz  r  )r  orig_all_reduces     r?   patch_all_reducer    sJ     ooOLLN$DO*) 	)r  new_unshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   unshardrl   r  )r  orig_unshards     r?   patch_unshardr    Q      "))LLLN(N.!- 	!-r  new_reshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   reshardrl   r  )r  rq  s     r?   patch_reshardr    r  r  new_post_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   post_backwardrl   r  )r  orig_post_backwards     r?   patch_post_backwardr    sR      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   r   rl   r  )r  orig_backwards     r?   *patch_register_post_backward_hook_backwardr    sT      199MLLN,8 )>0=$- 	0=$-r  rJ  rV   rW   c                     t        |      dkD  r|d   }nd|v r|d   }nt        d| d|        ||        ||i |S )Nr   rP   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsrJ  rc   rV   rW   rP   s         r?   reduce_scatter_with_assertr    sa     4y1}a	V	!;D6F8T
 	
 f///r>   replicated_modulesharded_moduleprefixes_to_ignore.c                    t        |j                         |j                               D ]  \  \  }}\  }}|}|D ]  }	|j                  |	d      } | j                  ||       | j	                  |t
               t        |t
              sJ |j                  |j                  }}
t        |      t        d      t        d      fk(  rt        d      t        ||
|      }| j                  |j                         |j                                |j                  | j                  |j                         | j!                  |j                         t        |j                  |
|      }| j	                  |j                  t
               t        |j                  t
              sJ | j                  |j                  j                         |j                                 y )N r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)rp   rh   replaceassertEqualassertIsInstancer&   r  r  
placementsr  r'   r  r%   to_localrx  assertIsNoneassertIsNotNone)r  r  r  r  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r  sharded_ref_paramsharded_ref_grads                 r?   check_sharded_parityr  "  s    OR**,n.M.M.OO TJ+*-JlM *( 	HF!3!;!;FB!G	H);<]G4-111(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BD*U]//9-,,g666**3357G7P7P7RS1Tr>   c                   @     e Zd Zed        Z fdZd Zd Zd Z xZ	S )FSDPTestMultiThreadc                 ~    t         j                  j                         rt         j                  j                         S dS )Nr   )r\   r   is_availabledevice_countr   s    r?   rv   zFSDPTestMultiThread.world_sizeD  s)    ,1JJ,C,C,Euzz&&(L1Lr>   c                 B    t         |           | j                          y r   )r   setUp_spawn_threadsrJ   r   s    r?   r  zFSDPTestMultiThread.setUpH  s    r>   c                      t        | g|i |S r   r1   r[  s      r?   r1   z FSDPTestMultiThread.run_subtestsL      D242622r>   c                 @    t         j                  j                          y r   r\   _dynamoresetr   s    r?   perThreadSetUpz"FSDPTestMultiThread.perThreadSetUpO      r>   c                 @    t         j                  j                          y r   r  r   s    r?   perThreadTearDownz%FSDPTestMultiThread.perThreadTearDownR  r  r>   )
r8   r9   r:   propertyrv   r  r1   r  r  r  r  s   @r?   r  r  C  s+    M M3r>   r  c            $           e Zd Z fdZed        Zed        Zed        Zd Zd Z	d Z
d Zed	        Z	 	 	 	 	 	 	 d%dej                  dedededee   dedee   dededeeeef      fdZd
dd e       d
d
d
ddddd
d
fdee   dededee   dedededee   d ee    dee   d!ed"ededed#eeeef      deeeef      f d$Z! xZ"S )&FSDPTestc                 h    t         |           dt        j                  d<   | j	                          y )N0TORCH_NCCL_DESYNC_DEBUG)r   r  osenviron_spawn_processesr  s    r?   r  zFSDPTest.setUpW  s)     14

,-r>   c                     t         j                  j                         r(t        t         j                  j	                         d      S dS )Nr   r   )r\   r   r  minr  r   s    r?   rv   zFSDPTest.world_size_  s1    49JJ4K4K4Ms5::**,a0TSTTr>   c                 >    t         j                  j                         S r   )rl   distributed_c10d_get_default_groupr   s    r?   rb   zFSDPTest.process_groupc  s    $$7799r>   c                 *    t          | j                   S r   )r3   	file_namer   s    r?   init_methodzFSDPTest.init_methodg  s    t~~.//r>   c                 <    | j                  ||j                         y r   )r  r   )rJ   r  r   s      r?   _check_cpu_offloadzFSDPTest._check_cpu_offloadk  s    j&<&<=r>   c                 <    | j                  ||j                         y r   )r  backward_prefetch)rJ   r  r  s      r?   _check_backward_prefetchz!FSDPTest._check_backward_prefetchn  s    *J,H,HIr>   c                 <    | j                  ||j                         y r   )r  forward_prefetch)rJ   r  r  s      r?   _check_forward_prefetchz FSDPTest._check_forward_prefetchq  s    ):+F+FGr>   c                      t        | g|i |S r   r  r[  s      r?   r1   zFSDPTest.run_subtestst  r  r>   c                     | |      }||_         ||_        |j                  dd      }t        d|j                    d|j                          t
        j                  j                         rdnd}	 |r`t
        j                  j                  j                  j                  j                         }	t        j                  d|j                  ||	       n@t        j                  |j                  |t!        |j                        |j                   	       d }t
        j                  j                         rkt
        j                  j/                         rM|j                   t
        j                  j/                         z  }t
        j                  j1                  |       |g}t        j2                  |       t
        j4                  j7                          |j9                  ||       t
        j4                  j7                          t        j2                  |       t        j:                          y # t"        $ r=}
d
|
j$                  d   v r&t'        j(                  t*        d   j,                          d }
~
ww xY w)Nfake_pgFzdist init r=z, world=ncclgloofake)backendrv   r   store)r  r  rv   r   	recompiler   backend_unavailable)
device_ids)r   r
  getprintrv   r\   r   r  testing	_internalrL  r  	FakeStorerl   init_process_groupr  r   RuntimeErrorrV   sysexitr2   	exit_coder  
set_devicer  r  r  run_testdestroy_process_group)r  r   	test_namer
  piperW   rJ   r  r  r  er  	device_ids                r?   _runzFSDPTest._runw  s   9~	"**Y.TYYKx/@AB "JJ335&6	//;;CCMMO''"#	 '' $ 0 0#"4??3	 
::""$)@)@)B		EJJ$;$;$==IJJ!!),#J
 	
+i&
+""$/  	affQi'$9:DDE		s   1B"H- -	I368I..I3NFra   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         |xr |j                   }t        |j                               j                  }|
i }
t	        d	d|i|
}t
        j                  j                  |j                         |d      }t        |      D ]  }|j                          t
        j                  j                  d|      5  |j                  j                  t        j                  d            }|	s|rMt        |t              s=t        |t
        j                         r|j#                         }nt%        d |D              } || }|rft        |t              rV|j&                  t(        vrD|j                         D ]1  }| j+                  |j                  t        j                  d             3 |j                  j-                  ||      j/                  |      }d d d        |j1                        }|s&|	s$|j2                  t
        j4                  k(  sJ d       |	r+| j+                  |j2                  t
        j6                         net        |t              r+|J | j+                  |j2                  |j8                         n*| j+                  |j2                  t
        j4                         |j                  j;                  |       |rTt        |t              rD|j                         D ]1  }| j+                  |j                  t        j                  d             3 |j=                  |       |j?                          |s|jA                         jC                         D ci c]  \  }}||jE                          }}}tG        |       |jI                  |        t        |t              r|jK                  tL        jN                         jQ                         S # 1 sw Y   xY wc c}}w )
Nenabledg?)r3  momentumr   )r;  c              3   <   K   | ]  }|j                           y wr   )r   )rd  r   s     r?   re  z4FSDPTest._train_for_several_steps.<locals>.<genexpr>  s     %>1affh%>rf  rj   zeloss data type should be float32, as the original                     parameter data type is float32.r=   ))offload_paramsnextr   rK   r!   r\   optimSGDrn   	zero_gradampr2  r  rL   r  r   r]   r   r  r  r   r  rQ   toscaler   float32float16param_dtyperU   stepupdater   r   cloner   load_state_dict_assert_stater   IDLEri   )rJ   ra   r1  r2  r3  r4  r5  r6  r7  r8  r9  cpu_offload_paramsmodel_devicesharded_grad_scalerr@  rw   rO   rP   r8  rT   kvr   s                          r?   _train_for_several_stepsz!FSDPTest._train_for_several_steps  s4    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy! 9	2AOO##FH#= M..u||F/CD _Zt=T!%6 %

 %%>%> > '"5$/ //>? #--/ H((5<<3FGH ||,,UF;>>|L-M. ',,T2D"=JJ%--/555/ !$$TZZ?t,*666$$TZZ1L1LM$$TZZ?LL%%d+!j&=))+ DA$$QXXu||E/BCD  $$U+&&(7<7G7G7I7O7O7QRtq!alR
R E"%%j1s9	2v eT" 2 23{{}wM Mf Ss   5DOO O	r   Tmodel_classr   r   ref_init_fn	num_itersr   r  r  r  use_orig_paramsinit_kwargsc                 8   |t         j                  k7  sJ d       |i }d}| j                  j                         } |j                  | j                  t         j                  t
        j                  fddi|}|t        ||g|      }n ||      }|r|j                         }| j                  |||
du|||
|||	      }t        |j                               }|j                  |||	|
||d       	  |j                  | j                  |||fddi|}t!        |t"              st#        || j                  fi |}|r|j                         }|t
        j$                  k(  r|j'                         }|duxr |j(                  }|xr |t
        j$                  k(  }|xr |t
        j$                  k7  }|rFt+        j,                  d      }|j                         D ]  }| j/                  |j,                  |         |r| j1                  t2        d      n	t5               }|5  | j                  ||d||||
|||
      } ddd       |ry|rVt+        j,                  d      }|j                         D ]  }| j/                  |j,                  |          j'                         } t7        |      }!t*        j8                  j;                  | d       |
|s| j/                  ||!dd       yyy# t        $ r }t        d	| d
t        |             |d}~ww xY w# 1 sw Y   xY w)a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)r  output_device)r2  r3  r4  r6  r7  r8  r9  )r   r  r  r6  r  rX  zInitializing z raised error rj   zSAn FSDP-managed module with parameter CPU offloading enabled has parameters on cudaF)r2  r3  r4  r5  r6  r7  r8  r9  )check_dtypezFSDP did not match DDP)exact_devicemsg)r7   r;   rb   r   rY   rA   rB   DDPr   rT  r   r   rJ  	Exceptionr  r   r  r   rC   r   r>  r\   rK   r  assertRaisesRegexr%  r   r   r!  assert_close)"rJ   rU  r   r   rV  rW  r5  r   r  r  r6  r  rX  r7  r8  rY  r9  r   r3  r   ra   	ref_modelref_loss
ddp_paramsr  r.  r>  expects_device_errorexpects_cpu_device
cpu_devicerr   context	fsdp_lossfsdp_unsharded_paramss"                                     r?   _test_fsdp_parityzFSDPTest._test_fsdp_parity  s   F l222	<;	<2K!!&&(     $$
 	

 
 EtfDII#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y)))""	
 # J *d+ j$*<*<LLJ#*J\444#*J$D0O[5O5O
 H~1H1HH 	 H~1H1HH 	 e,J#..0 ;  z:; $ "")  	  	55!,% /+E++E 6 I	   e,J#..0 ;  z:;!(I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF	 	s$   2"K$ L$	L-LLL)r[  NFNFFN)#r8   r9   r:   r  r  rv   rb   r  r  r  r  r1   classmethodr0  r_   r`   r   r  floatr   r   r   r   r   r   rT  r   rF   r7   rA   r   r   r    rm  r  r  s   @r?   r  r  V  s*     U U : : 0 0>JH3 3% 3%t 15 48+0#?CUyyU U 	U
 U #:.U U ".1U %)U U %-T#s(^$<Ux +/",,8<8<48!& %+0#04?C#b-(b %b %	b
 h'b b b  b $$45b $$45b ".1b b b %)b b  d38n-!b" %-T#s(^$<#br>   r  compile_compute_on_modulec                 @      fd G d dt               fd}|S )Nc                      t        j                  j                  j                  j                  | i | t        | d         r| d   j                          y y rF  )r\   rL  _composablers  r   r  r;  )rV   rW   rp  s     r?   !fully_shard_with_compiled_computez=test_compiled_fsdp.<locals>.fully_shard_with_compiled_compute  sT    %%**66GG$,
G.1
 GOO1
r>   c                   (    e Zd Z e       Z e       Zy)*test_compiled_fsdp.<locals>.FullyShardModeN)r8   r9   r:   r   EAGERCOMPILED_COMPUTEr=   r>   r?   FullyShardModerv    s    6r>   ry  c                 4     t                fd       }|S )Nc                     t         j                  j                  j                  j                  }D ]  }|j
                  k7  r t               st        j                  d       3t         j                  j                  j                  }t         j                  j                  j                  }t         j                  j                          |j
                  k(  r|}n^|j                  k(  rAdt         j                  j                  _        dt         j                  j                  _        }nt!        d|       |	j"                  |j$                  <    	| i | t         j                  j                          |	j"                  |j$                  <   |t         j                  j                  _        |t         j                  j                  _         y )Nz0Inductor on GPU needs Triton and recent GPU archTrg   z!Need to implement FullyShardMode=)r\   rL  rs  rs  r   rw  r5   warningswarnr  configskip_fsdp_hooks	_inductorcompile_threadsr  rx  NotImplementedError__globals__r8   )
rV   rW   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchry  rt  funcs
          r?   wrapperz6test_compiled_fsdp.<locals>.decorator.<locals>.wrapper  sj   #(#4#4#@#@#E#E#Q#Q & R>///
MM"TU+0==+?+?+O+O(+0??+A+A+Q+Q(!!))+>///(<%^<<<;?EMM((8=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?7O$$49Q&&69Rr>   r	   )r  r  ry  rt  s   ` r?   	decoratorz%test_compiled_fsdp.<locals>.decorator  s#    	t	R 
	R@ r>   )r   )rp  r  ry  rt  s   ` @@r?   test_compiled_fsdpr    s"    " ""H r>   c                   &     e Zd Zd fdZd Z xZS )
SkipModulec                 \    t         |           t        j                  ddd      | _        y N
   Fr  )r   r   r_   r   linr  s    r?   r   zSkipModule.__init__  s"    99R%0r>   c                 $    | j                  |      S r   )r  r#  s     r?   r   zSkipModule.forward  s    xx{r>   rZ   r\  r  s   @r?   r  r    s    1r>   r  c                   $     e Zd Z fdZd Z xZS )NestedLinearc                     t         |           |r5t        t        j                  ddd      j                               | _        y t        j                  ddd      j                         | _        y r  )r   r   r$   r_   r   r   nested_linear)rJ   	fsdp_wrapr   s     r?   r   zNestedLinear.__init__  sR    !%biiBU&C&H&H&J!KD!#2r!>!C!C!EDr>   c                 $    | j                  |      S r   )r  r#  s     r?   r   zNestedLinear.forward  s    !!!$$r>   r\  r  s   @r?   r  r    s    F%r>   r  c                   $     e Zd Z fdZd Z xZS )	SkipModelc                     t         |           t        j                  ddd      j	                         | _        t               j	                         | _        t        t        |            | _
        y )Nr  Fr  )r  )r   r   r_   r   r   linearr  linear_skipr$   r  r  )rJ   double_nestr   s     r?   r   zSkipModel.__init__  sP    iiBU388:%<,,.!,"EFr>   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r#  s     r?   r   zSkipModel.forward  s4    KKNQq!r>   r\  r  s   @r?   r  r    s    Gr>   r  )FT)FFr  )r=   r   )
contextlibr  r3  r&  r|  abcr   r   r   copyr   enumr   r   	functoolsr
   typingr   r   r   r   r   r   r   r   r   unittestr   r\   torch.distributedrL  rl   torch.nnr_   torch.nn.functionalr   r  torch.distributed._composabler   "torch.distributed._composable.fsdpr   4torch.distributed._composable.fsdp._fsdp_param_groupr   r   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   $torch.distributed.fsdp._common_utilsr   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r    *torch.distributed.fsdp.sharded_grad_scalerr!   torch.distributed.fsdp.wrapr"   r#   r$   torch.distributed.tensorr%   r&   r'   !torch.distributed.tensor.parallelr(   r)   r*   r+   r,   r-   torch.nn.parallel.distributedr.   r`  *torch.testing._internal.common_distributedr/   r0   r1   r2   $torch.testing._internal.common_utilsr3   r4   torch.utils._tritonr5   r7   rA   r`   rF   r  r}   r  r   r   r   r   r   r   r   r   r   r  r,  r/  r?  rT  rX  r^  r}  r  r  r  contextmanagerr  r  r  r  r  r  r  r  r   r  r  r  typer  r  r  r  r=   r>   r?   <module>r     s    	 	 
  # "   
 
 
        4 : 5 O > R 
 I R R F F  F H  P *4 4 BIIs 499$$ B #99##""2299 2t 23ryy 3 3DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@C
m C
L
? 
.,ryy ,FJ* FJR/")) /@*r}} *Z'299 '6 6 6 6 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	TyyT IIT c3h	TB/ &P# Pf
0(4. 0f 	%299 	%		 r>   