
from typing import cast, List, Sequence, Tuple

import torch
import torch.distributed.tensor._api as dtensor
from torch._prims_common import ShapeType
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor.placement_types import (
    _StridedShard,
    Partial,
    Placement,
    Replicate,
    Shard,
)


def compute_local_shape(
    global_shape: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> Tuple[int, ...]:
    """
    Compute the shape of a local shard of the given DTensor on its current
    coordinate of the mesh.
    """
    my_coordinate = mesh.get_coordinate()

    if my_coordinate is None:
        # if rank is not in the mesh, return an empty shape
        return (0,)
    else:
        local_shape = list(global_shape)  # start with the global shape
        ndim = len(global_shape)
        for idx, placement in enumerate(placements):
            mesh_dim_size = mesh.size(idx)
            if isinstance(placement, Shard):
                shard_dim = placement.dim
                assert (
                    shard_dim < ndim
                ), f"Sharding dim {shard_dim} greater than tensor ndim {ndim}"
                local_shard_size, _ = placement._local_shard_size_on_dim(
                    local_shape[shard_dim], mesh_dim_size, my_coordinate[idx]
                )
                assert isinstance(local_shard_size, int)
                local_shape[shard_dim] = local_shard_size

        return tuple(local_shape)


def compute_local_shape_and_global_offset(
    global_shape: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
    """
    Compute the local tensor shape and the global offsets into the original tensor
    of a DTensor on its current global rank. This is useful for checkpointing purpose.

    Example (2 hosts with 4 GPUs each):
    # Below is a DeviceMesh with mesh_shape of (2, 4)
    mesh = DeviceMesh(device_type="cuda",
                        mesh=[
                        [0, 1, 2, 3],
                        [4, 5, 6, 7]
                        ],
    )

    Let's say we distribute a global_tensor of shape (8,4) over the above DeviceMesh
    with placements of [Shard(0), Shard(0)].
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1, 4], global_offset:[0, 0]
    rank1 -- local_shape:[1, 4], global_offset:[1, 0]
    rank2 -- local_shape:[1, 4], global_offset:[2, 0]
    rank3 -- local_shape:[1, 4], global_offset:[3, 0]
    rank4 -- local_shape:[1, 4], global_offset:[4, 0]
    rank5 -- local_shape:[1, 4], global_offset:[5, 0]
    rank6 -- local_shape:[1, 4], global_offset:[6, 0]
    rank7 -- local_shape:[1, 4], global_offset:[7, 0]
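
    On rank5, for example, the values above correspond to a call like the
    following (using the `mesh` defined above):

    local_shape, global_offset = compute_local_shape_and_global_offset(
        (8, 4), mesh, [Shard(0), Shard(0)]
    )
    # local_shape == (1, 4), global_offset == (5, 0)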

    Let's say we distribute a global_tensor of shape (2) over the above DeviceMesh with
    placements of [Shard(0)]. Not all ranks will have a non-empty local tensor.
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1,], global_offset:[0,]
    rank1 -- local_shape:[1,], global_offset:[1,]
    rank2 -- local_shape:[0,], global_offset:[2,]
    rank3 -- local_shape:[0,], global_offset:[2,]
    rank4 -- local_shape:[0,], global_offset:[2,]
    rank5 -- local_shape:[0,], global_offset:[2,]
    rank6 -- local_shape:[0,], global_offset:[2,]
    rank7 -- local_shape:[0,], global_offset:[2,]
    """
    my_coordinate = mesh.get_coordinate()

    if my_coordinate is None:
        # if rank is not in the mesh, return empty shape and offset
        return ((), ())
    else:
        local_shape = list(global_shape)
        global_offset = [0] * len(global_shape)
        # per-tensor-dim stride of the shard index on each mesh dim, and the
        # number of shards each tensor dim has been split into so far
        shard_idx_stride_by_mesh_dim = [
            [0] * mesh.ndim for _ in range(len(global_shape))
        ]
        num_shards_by_tensor_dim = [1] * len(global_shape)

        for idx, placement in enumerate(placements):
            mesh_dim_size = mesh.size(idx)
            if isinstance(placement, Shard):
                shard_dim = placement.dim
                local_offset = [0] * len(global_shape)
                assert shard_dim < len(
                    local_shape
                ), f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
                shard_size, shard_offset = placement._local_shard_size_on_dim(
                    local_shape[shard_dim],
                    mesh_dim_size,
                    my_coordinate[idx],
                    return_offset=True,
                )

                local_shape[shard_dim] = shard_size
                local_offset[shard_dim] = shard_offset

                # On a given dimension, if local_offset[shard_dim] is smaller than
                # global_offset[shard_dim], this dimension has already been sharded
                # by a previous placement, so accumulate the offset instead of
                # overwriting it.
                if global_offset[shard_dim] <= local_offset[shard_dim]:
                    global_offset[shard_dim] = local_offset[shard_dim]
                else:
                    global_offset[shard_dim] += local_offset[shard_dim]

                num_shards_by_tensor_dim[shard_dim] *= mesh_dim_size

        # With _StridedShard in the placements, the offset cannot be accumulated
        # dimension by dimension; instead, reconstruct the shard index on each
        # tensor dim from the per-mesh-dim index strides and the mesh coordinate.
        strided_sharding = any(isinstance(p, _StridedShard) for p in placements)
        if strided_sharding:
            strided_part_seen = [False] * len(global_shape)
            strided_part_end = [False] * len(global_shape)
            for idx, placement in enumerate(placements):
                mesh_dim_size = mesh.size(idx)
                if isinstance(placement, Shard):
                    shard_dim = placement.dim

                    if strided_part_end[shard_dim]:
                        raise NotImplementedError(
                            f"Strided sharding does not allow Shard() to appear after "
                            f"the strided part has ended. {placement} at idx {idx} in "
                            f"{placements} violates this assumption."
                        )

                    if strided_part_seen[shard_dim]:
                        strided_part_end[shard_dim] = True

                    if isinstance(placement, _StridedShard):
                        strided_part_seen[shard_dim] = True
                        shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                            num_shards_by_tensor_dim[shard_dim]
                            // (placement.split_factor * mesh_dim_size)
                        )
                    else:
                        num_shards_by_tensor_dim[shard_dim] //= mesh_dim_size
                        shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                            num_shards_by_tensor_dim[shard_dim]
                        )

            shard_idx = [
                sum(x * y for x, y in zip(shard_idx_stride, my_coordinate))
                for shard_idx_stride in shard_idx_stride_by_mesh_dim
            ]
            global_offset = [x * y for x, y in zip(local_shape, shard_idx)]

        return tuple(local_shape), tuple(global_offset)


def compute_global_tensor_info(
    tensor: torch.Tensor, mesh: DeviceMesh, placements: Sequence[Placement]
) -> Tuple[List[int], List[int]]:
    """
    Compute the global size and stride of a DTensor from the given local tensor.
    The local size is multiplied by `world_size` per Sharding dim.
    The local stride is multiplied by `world_size` per Sharding dim, as long as the
    dimension is outside the sharding dim.

    For example, if we have a local tensor with size (4, 8, 2) and stride (16, 1, 8),
    and the DTensor placements are [Shard(2)] with a world_size of 2,
    then the global size is (4, 8, 4) and the global stride is (16 * 2, 1, 8).
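
    In code, assuming such a local `tensor` and a 1-D `mesh` of size 2, this is:

    global_shape, global_stride = compute_global_tensor_info(
        tensor, mesh, [Shard(2)]
    )
    # global_shape == [4, 8, 4], global_stride == [32, 1, 8]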

    Args:
        tensor (:class:`torch.Tensor`):
            Local tensor which DTensor will be constructed from.
        mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for the DTensor.
        placements (Sequence[:class:`Placement`]):
            The attribute of the DTensor that describes its layout
            on the mesh topology.

    Return:
        tensor_shape: A List of int which specifies the size of the DTensor
            built on top of the local tensor.
        tensor_stride: A List of int which specifies the stride of the DTensor.
    """
    tensor_shape = list(tensor.size())
    tensor_stride = list(tensor.stride())
    for idx, placement in enumerate(placements):
        mesh_dim_size = mesh.size(idx)
        if placement.is_shard():
            shard_placement = cast(Shard, placement)
            if shard_placement.dim < 0:
                raise AssertionError(
                    "Shard placements should have negative dims normalized in "
                    f"the user-facing APIs: {shard_placement}"
                )
            shard_dim = shard_placement.dim

            assert (
                shard_dim < tensor.ndim
            ), f"Sharding dim {shard_dim} greater than tensor ndim {tensor.ndim} for placement number {idx}."

            local_dim_size = tensor_shape[shard_dim]
            tensor_shape[shard_dim] = local_dim_size * mesh_dim_size

            # recover the global stride by rescaling every stride that is larger
            # than (or equal to) the stride of the sharded dimension
            for i in range(len(tensor_stride)):
                if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
                    # rescale the stride by the shard size
                    tensor_stride[i] = tensor_stride[i] * mesh_dim_size
        elif not isinstance(placement, (Replicate, Partial)):
            raise RuntimeError(f"placement type {type(placement)} not supported!")
    return tensor_shape, tensor_stride


def try_find_mesh_from_args(
    op_call: torch._ops.OpOverload, args: Sequence[object]
) -> DeviceMesh:
    """
    Find the device mesh object from args.
    A ValueError is raised if no mesh is found.
    NOTE: we can optimize this search if needed
    """
    for arg in args:
        if isinstance(arg, (dtensor.DTensor, DTensorSpec)):
            return arg.device_mesh
        elif (
            isinstance(arg, (list, tuple))
            and len(arg) > 0
            and isinstance(arg[0], (dtensor.DTensor, DTensorSpec))
        ):
            return arg[0].device_mesh

    raise ValueError(f"Cannot find device mesh from args for op : {op_call}.")


def compute_local_stride(
    global_stride: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> Tuple[int, ...]:
    """
    Compute the stride of a local tensor shard, given the global stride of the DTensor.
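
    For example, for a row-major (4, 8) tensor (global stride (8, 1)) on a 1-D
    mesh over 4 ranks:

    compute_local_stride((8, 1), mesh, [Shard(0)])  # (8, 1): rows stay contiguous
    compute_local_stride((8, 1), mesh, [Shard(1)])  # (2, 1): each rank holds 2 columns
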
    NOTE: Currently this function assumes the DTensor is evenly shardable.
    """
    stride_divisors = [1] * len(global_stride)
    for mesh_idx, p in enumerate(placements):
        if p.is_shard():
            i = cast(Shard, p).dim
            # tensor dimension i is sharded on mesh dimension mesh_idx, so we need
            # to divide all the strides larger than stride[i] by the submesh size
            for j in range(len(global_stride)):
                if global_stride[j] > global_stride[i]:
                    stride_divisors[j] *= mesh.size(mesh_idx)
    return tuple(
        global_stride[i] // stride_divisors[i] for i in range(len(global_stride))
    )


def normalize_to_torch_size(size) -> torch.Size:
    """
    Unify variable types of size argument to torch.Size
    Acceptable types include:
        int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
        or torch.Size
    """
    if isinstance(size, torch.Size):
        return size

    if isinstance(size, int):
        torch_size = [size]
    elif len(size) == 1 and isinstance(size[0], Sequence):
        torch_size = list(size[0])
    else:
        torch_size = list(size)
    return torch.Size(torch_size)