
    sgT5                        d dl Z d dlmZmZmZmZmZ d dlZd dlm	Z
 d dlm	c mc mZ d dlm	c mZ d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z%m&Z&m'Z'mZ( d dl)m*Z*m+Z+ dgZ,de&deejZ                  ejZ                  f   fdZ.de&de/deejZ                  ejZ                  f   fdZ0de&deejZ                  ejZ                  f   fdZ1de&de/defdZ2de&dejf                  defdZ4de&dejf                  fdZ5dejl                  dejn                  de/dejl                  fdZ8dejn                  de/de/de/de
jf                  dejn                  fd Z9dejn                  de/d!e%de&fd"Z:dejn                  deejn                  ee   f   fd#Z;de&d$ee%   dejn                  fd%Z< G d& de      Z=y)'    N)AnycastListOptionalTuple)ShardShardedTensorShardedTensorMetadataTensorProperties)ShardMetadata)ChunkShardingSpec)_mesh_resources)_set_fsdp_flattened)FSDPExtensions)_create_chunk_sharded_tensor)_remote_device)
DeviceMeshDTensor	Replicater   )_flatten_tensor_unflatten_tensorDTensorExtensionstensorreturnc                    | j                   }|j                  dk(  sJ d       | j                  d   }dgt        | j	                               z  }|j	                  d      }| j                  d   j                         r3t        t        |      j                  }| j	                  |      |z  }|||<   t        j                  |      | j                  j	                         fS )N   &Only 1D DeviceMeshes currently handledr   )mesh_dim)device_meshndim
placementslensizeis_shardr   DSharddimtorchSize_local_tensor)r   r   	placementoffsets
num_chunks	shard_dim
chunk_sizes          Y/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/tensor/parallel/fsdp.py_get_boxr0       s    $$Kq J"JJ !!!$IcC&&G!!1!-J$$&+//	[[+z9
'	JJw!5!5!:!:!<==    idxc                 x    t        |       \  }}t        j                  |D cg c]  }||z  	 c}      |fS c c}w N)r0   r'   r(   )r   r2   r+   r#   vals        r/   _get_box_forr6   0   s6    V$MGTJJW5cc	56==5s   7c                 `    | j                   }|j                         }|J t        | |d         S )Nr   )r   get_coordinater6   )r   r   coords      r/   _get_local_boxr:   5   s8    $$K&&(Ea))r1   dtcurrent_rankc                     | j                   }|j                  dk(  sJ d       t        |       \  }}t        t	        |      t	        |      d| d| j
                  j                         S )Nr   r   rank:/shard_offsetsshard_sizesr*   )r   r    r:   r   listr)   device)r;   r<   meshr+   sizess        r/   _create_shard_md_from_dtrG   <   sg    >>D99>CCC>#B'NGU7mK,q)9)9)@)@(AB r1   dt_pgc                    g }t        j                  |      }|dkD  rdnd}| j                  d   j                         r|j	                         }nd}t        |      D ]a  }t        | |      \  }}|j                  t        t        |      t        |      d|dkD  r|n| d| j                  j                                c t        || j	                         t        | j                  | j                  | j                               S )Nr   r   r>   r?   r@   )dtypelayoutrequires_grad)shards_metadatar#   tensor_properties)distget_rankr!   r$   r#   ranger6   appendr   rC   r)   rD   r
   r   rJ   rK   rL   )	r;   rH   	shards_mdmy_rankscapegoat_rankshard_countir+   rF   s	            r/   !_create_sharded_tensor_md_from_dtrX   H   s     ImmE"G!A+Q1N	}}Q  "jjl; 

%b!,"7m Ka!eNA2CSCSCZCZB[\		


 !!WWY*((99**
	 	r1   c                 f    | j                   }|j                  dk(  sJ d       |j                         S )Nr   r   )r   r    	get_group)r;   rE   s     r/   
_get_dt_pgr[   o   s.    >>D99>CCC>>>r1   specrankc                    t        | t              s| S d}| j                  D ]G  }t        t        |      }|j                         |k(  s'|j                         |j                  k7  sEd} n |rt        j                  |       } t        | j                        D ]o  \  }}t        t        |      }|j                         |k(  s*|j                         |j                  k7  sHt	        d| d|j                         | j                  |<   q | S )z
    Rewrite ``spec`` to match the device of ``tensor``.

    FSDP.sharded_optim_state_dict sneakly ships optimizer state to CPU so if the original ShardingSpec
    produces CUDA metadata, ST construction bombs.
    FTr>   r?   )

isinstancer   r!   r   r   r]   rD   copydeepcopy	enumerate)r\   r   r]   rewriteprW   r*   s          r/   _rewrite_spec_if_neededre   u   s     d-. G__ #668t
fmm ;G	
 }}T"%doo6 	TLAy^Y7I~~4'I,<,<,>&--,O%3eD66==/4R%S"	T
 Kr1   
world_sizenum_devices_per_nodepgc           	         t        |       t        u rt        | j                               dk(  sJ | j	                         }t        |||||      }| j                         d   }t        |t        j                  |j                              g}t        j                  | j                               }	d|	j                  _        t        j                  ||	| j                  d      }
|
S t        |       t        u r| j                  }|j                   dk(  sJ d       | j"                  }t        |||t$        j&                  j)                         |      }t+        |       }t        |t-        | t/        j0                  |                  g}t3        | |      }	d|	j                  _        t        j                  ||	|d      }
|
S t        | ||||      S )Nr   r   F)sharded_tensor_metadataprocess_group
init_rrefsr   )typer	   r"   local_shardslocal_tensorr   r   r`   ra   metadatarN   rL   +_init_from_local_shards_and_global_metadata_process_groupr   r   r    r)   r'   cudadevice_countr[   rG   rO   rP   rX   )r   r]   rf   rg   rh   inner_paraminner_stouter_local_shardshardsst_metast_outerr   rH   s                r/   _chunk_tensorr{      s    F|}$6&&()Q...))+/ 
 #//1!4(DMM*;*D*DEF
 -- 1227!!/ LL$+ //	
 	f	 ((1$N&NN$**/JJ##%
 6" (4VT]]5=QRS
 4FEB27!!/ LL$+	
 + 
 	
r1   r   c                    t        j                  |      }|t        d      |j                  dk  rt        d|j                   dd      | j	                         j                         } t        | t        j                        rt        | t              st        |j                        D cg c]  }t                }}t        |j                        D cg c]  }t                }}t        d      |d<   t        j                  | ||d      j                  ||	      S | j                  }|d   }| j!                         } t        |j                        D cg c]  }t                }}||d
<   t        |j                        D 	cg c]  }	t                }}	t        d      |d<   ||d
<   t        j                  | ||d      j                  ||	      S c c}w c c}w c c}w c c}	w )z
    Shard a tensor to chunks along the first dimension.

    The local rank will gets its corresponding chunk as the local tensor to create a DTensor.
    z4No parent device_mesh is found for FSDP device_mesh.   z!Found parent device_mesh of ndim=,zbut meshes must be at least 2D.r   F)	run_checkr   r!   )r   get_root_meshRuntimeErrorr    clonedetachr_   r'   Tensorr   rQ   r   r%   
from_localredistributer!   to_local)
r   r]   r   	root_mesh_replicate_placementsshard_placementstp_placementstp_placementrW   s
             r/   _chunk_dtensorr      s     --k:IQRR~~/	/?qA-
 	
 \\^""$F
 &%,,'
670K 6;9>>5JK	KK16y~~1FGAIKGG$Qi!!I3u

,!'  
	
 ))$Q'" 6;9>>5JK	KK#/R 16y~~1FGAIKGG%ay+!!I3u

,!'  
	
9  LG*  LGs   +GGG!G&c                    t        t        |       j                         }t        |      dk(  r?t	        |d   j
                        t        u r!|d   j
                  }|j                         }|} | t        |      dkD  r|fS g fS )Nr   r   )r   r	   rn   r"   rm   r   )r   rx   inner_tensors      r/   _pre_load_state_dictr     sz     -(557F
6{aD!1!12mCay''**,c&kAoF66266r1   parent_meshc                 "   || j                   k(  sJ t        t        j                  | j                              }t        dt        |      dz
        D ]  }t               ||<    | j                  | j                   |      } | j                         S )zGAll gather a DTensor in its FSDP dimension and return the local tensor.r   r   r   )
r   rC   r`   ra   r!   rQ   r"   r   r   r   )r   r   r!   rW   s       r/   _all_gather_dtensorr   )  s    
 &,,,,,dmmF$5$567J 1c*o)* $!
1$  && ! F
 ??r1   c                       e Zd ZdZd fdZdej                  deej                  ee	   f   fdZ
dej                  de	dej                  fdZ	 ddej                  ded	ed
edej                  deej                     dej                  fdZdej                  dededej                  fdZdej                  deej                  ee   f   fdZdedee   dej                  fdZ xZS )r   z
    DTensorExtension is the TensorFlattener extension needed for 2D FSDP + TP.

    This is the implementation for FSDPExtensions defined in
    https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/_fsdp_extensions.py
    r   c                     t         |           d | _        || _        t        j
                  j                  | j                        | _        y r4   )super__init__compute_streamdevice_handler'   _dynamodisablepost_unflatten_transform)selfr   	__class__s     r/   r   zDTensorExtensions.__init__E  s=    "* ).(=(=d>[>[(\%r1   r   c                     t        |      S r4   )r   r   r   s     r/   pre_flatten_transformz'DTensorExtensions.pre_flatten_transformM  s     v&&r1   param_extensionc                    | j                   xs | j                  j                         }| j                  j                  |      5  t	        ||| j                  | j                         }t        |       |cd d d        S # 1 sw Y   y xY w)N)r   r   )r   r   current_streamstreamr   r   )r   r   r   r   results        r/   r   z*DTensorExtensions.post_unflatten_transformS  s}     $$K(:(:(I(I(K&&v. 	 '"00#22	F  '	 	 	s   0A>>Br]   rf   rg   rh   rD   c                      t        |||||      S r4   )r{   )r   r   r]   rf   rg   rh   rD   s          r/   chunk_tensorzDTensorExtensions.chunk_tensorf  s     VT:7KRPPr1   r   c                     t        |||      S r4   )r   )r   r   r]   r   s       r/   chunk_dtensorzDTensorExtensions.chunk_dtensorq  s     fdK88r1   c                     t        |      S r4   )r   r   s     r/   pre_load_state_dict_transformz/DTensorExtensions.pre_load_state_dict_transformy  s     $F++r1   r   c                     t        ||      S r4   )r   )r   r   r   s      r/   all_gather_dtensorz$DTensorExtensions.all_gather_dtensor  s    
 #6;77r1   )r   Nr4   )__name__
__module____qualname____doc__r   r'   r   r   r   r   r   r   intrO   ProcessGrouprD   r   r   r   r   r   r   r   r   __classcell__)r   s   @r/   r   r   =  sW   ]'' 
u||Xc]*	+'ll58	4 *.	Q	Q 	Q 		Q
 "	Q 	Q &	Q 
	Q99 9  	9
 
9,, 
u||T%[(	),88 j)8 
	8r1   )>r`   typingr   r   r   r   r   r'   torch.distributeddistributedrO   &torch.distributed._shard.sharding_spec_shardsharding_spec
shard_spec"torch.distributed.distributed_c10ddistributed_c10dc10d'torch.distributed._shard.sharded_tensorr   r	   r
   r   r   :torch.distributed._shard.sharding_spec.chunk_sharding_specr   torch.distributed.device_meshr   $torch.distributed.fsdp._common_utilsr   'torch.distributed.fsdp._fsdp_extensionsr   #torch.distributed.fsdp._shard_utilsr   torch.distributed.remote_devicer   torch.distributed.tensorr   r   r   r%   6torch.distributed.tensor.parallel._data_parallel_utilsr   r   __all__r(   r0   r   r6   r:   rG   r   rX   r[   ShardingSpecr   re   r{   r   r   r   r    r1   r/   <module>r      s;    3 3    ; ; 1 1  A X 9 D B L : T T 
>W >uzz5::'=!> > > >s >uUZZ5K/L >
*7 *uUZZ-C'D *	 	 	 	$$))$$N7 t00 

!
!+0<<?B:G
LLG

G
 G
 	G

 	G
 \\G
T>
LL>

>
 >
 	>
B	7LL	7
5<<e$%	7*% \\(G8 G8r1   