
    sg                     Z   d dl Z d dlZd dlZd dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZmZ d Z	 dd	ej2                  d
edededej6                  deej8                     defdZd	ej2                  d
ededefdZd	edee   dej2                  fdZy)    N)Optional)_get_device_module)distributed_c10d)ShardShardedTensorShardedTensorMetadataTensorProperties)ShardMetadata)
DeviceMeshDTensor	Replicater   c                     |j                         dk(  rd|  d| S |j                         dk(  r"d|  d| dt        |      j                          S d|  d| d| |z   S )Ncpuzrank:/hpu:)lowerr   current_device)rankdevice_typenum_devices_per_nodes      V/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/fsdp/_shard_utils.py_get_remote_device_strr      s    e#tfAk]++					%tfAk]!,>{,K,Z,Z,\+]^^tfAk]!D3G,G+HII    tensorr   
world_sizer   pgdevicereturnc                    | j                  |d      }t        |      |kD  rx||   j                         }| j                         D cg c]  }d }	}t	        j
                  | j                         d   |z        |z  |	d<   t        j                  ||	|      g}
ng }
|D cg c]  }t        |j                                }}dgt        t        j                  |D cg c]  }|d   	 c}            dd z   }dgt        |d         dz
  z  }	|D cg c]  }|g|	z   
 }}|t        j                  |      j                  n|j                  }t        t        |            D cg c]#  }t        t!        j"                  ||      ||      % }}t        |      t        |      cxk(  rt        |      k(  sJ  J t%        |||      D cg c]  \  }}}t'        |||       }}}}t)        || j                         t+        | j,                  | j.                  dt0        j2                  | j5                                     }t7        j8                  |
||	      S c c}w c c}w c c}w c c}w c c}w c c}}}w )
z
    Shard a tensor to chunks along the first dimension. The local rank will gets its
    corresponding chunk as the local shard to create a ShardedTensor.
    r   )dimN   F)dtypelayoutrequires_gradmemory_format
pin_memory)shards_metadatasizetensor_properties)sharded_tensor_metadataprocess_group)chunklencloner*   mathceilr   from_tensor_and_offsetslist	itertools
accumulater   _get_pg_default_devicetyperanger   distget_global_rankzipr
   r   r	   r$   r%   torchcontiguous_format	is_pinnedr   +_init_from_local_shards_and_global_metadata)r   r   r   r   r   r   chunkslocal_shard_offsetslocal_shardsr.   chunk_sizes
chunk_sizedim0_offsetsd0chunk_offsetsr   r
placementsoffsetr*   	placementshard_metadatar,   s                            r   _create_chunk_sharded_tensorrP      su    \\*!\,F
6{TTl((*$kkm,1,,YYv{{}Q/*<=D
55k7DQR 4::%4

%:K:3kJ
jmJK	r L cSQ(1,-G.:;bTG^;M; > 	//388[[  s;'(  	  Q' 	
J  {s=1DS_DDDDD (+=+z'R #FD) 	fdI.N  4&[[]*,,==11'')

 DD.EUW U - ;J <s$   	I  II!I&#(I+	I0device_meshc                 h   | j                         j                         } t        |j                        D cg c]  }t	                }}t        |j                        D cg c]  }t	                }}t        d      |d<   t        j                  | ||d      j                  |      S c c}w c c}w )z
    Shard a tensor to chunks along the first dimension. The local rank will gets its
    corresponding chunk as the local tensor to create a DTensor.
    r   r"   F)	run_check)rL   )	r0   detachr9   ndimr   DShardr   
from_localredistribute)r   r   rQ   rC   replicate_placementsshard_placementss         r   _create_chunk_dtensorr[   \   s     \\^""$F 27{7G7G1HIAIKII-2;3C3C-DE	EE!!9R1Ul#  	 JEs   B* B/	root_meshc                     || j                   k(  sJ d       t        t        j                  | j                              }t               |d<   | j                  | j                   |      } | j                         S )zT
    All gather a DTensor in its sharded dimension and return the local tensor.
    z2The device mesh of a tensor should be a root mesh.r"   )rQ   rL   )rQ   r4   copydeepcopyrL   r   rX   to_local)r   r\   rL   s      r   _all_gather_dtensorra   u   sz     	V'''<;<' dmmF$5$567J [JrN  && ! F
 ??r   )N) r^   r5   r1   typingr   r=   torch.distributeddistributedr:   torch._utilsr   r   'torch.distributed._shard.sharded_tensorr   r   r   r	   &torch.distributed._shard.sharding_specr
   torch.distributed.tensorr   r   r   rV   r   TensorintProcessGroupr   rP   r[   ra    r   r   <module>rm      s           + .  A T TJ &*;LL;
; ; 	;
 	; U\\"; ;|LL
  	2
# \\r   