
    sgJ;                        d dl Z d dlmZ d dlmZ d dlmZmZ ej                  fdZ	d ej                  fdZ
d ej                  fdZej                  ej                  fdZej                  ej                  fdZej                  fd	Zej                  fd
Zej                  fdZddej                  fdZej                  ej                  fdZ G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d  d!e      Zy)"    N)Function)groupReduceOpc                 0    t         j                  |||       S )a  
    Broadcasts the tensor to the whole group.

    ``tensor`` must have the same number of elements in all processes
    participating in the collective.

    Arguments:
        tensor (Tensor): Data to be sent if ``src`` is the rank of current
            process.
        src (int): Source rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Received tensor from the broadcast op.

    )
_Broadcastapply)tensorsrcr   s      R/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/nn/functional.py	broadcastr      s    " C//    c                 0    t         j                  |||       S )aT  
    Gathers a list of tensors in a single process.

    Arguments:
        tensor (Tensor): Input tensor.
        dst (int, optional): Destination rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple[Tensor]: List of appropriately-sized tensors with the gathered data.
    )_Gatherr   )r	   dstr   s      r   gatherr       s     ==eV,,r   c                 0    t        j                  ||g|  S )a  
    Scatters a list of tensors to all processes in a group.

    Each process will receive exactly one tensor and store its data in the
    ``tensor`` argument.

    Arguments:
        tensors (list[Tensor]): List of tensors to scatter on the source rank.
            Receivers must pass ``None`.
        src (int, optional): Source rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output tensor from the scatter operation.

    )_Scatterr   )tensorsr
   r   s      r   scatterr   /   s    " >>#u/w//r   c                 2    t         j                  ||||       S )a  
    Reduces the tensor data across all machines.

    Only the process with rank ``dst`` is going to receive the final result.

    Arguments:
        tensor (Tensor): Input of the collective.
        dst (int): Destination rank.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    )_Reducer   )r	   r   opr   s       r   reducer   C   s    $ ==b%00r   c                 2    t        j                  ||| g| S )a  
    Reduces, then scatters a list of tensors to all processes in a group.

    Arguments:
        output (Tensor): Output tensor.
        input_list (list[Tensor]): List of tensors to reduce and scatter.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    )_Reduce_Scatterr   )output
input_listr   r   s       r   reduce_scatterr   X   s        UF@Z@@r   c                 .    t         j                  ||       S )a  
    Gathers tensors from the whole group in a list.

    Arguments:
        tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    )
_AllGatherr   )r	   r   s     r   
all_gatherr!   k   s     E6**r   c                 0    t         j                  | ||      S )a  
    Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.

    Args:
        output_tensor (Tensor): Output tensor. It should contain
            correctly-sized tensors to be used for output of the collective.
        input_tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on. If None,
            the default process group will be used.

    Examples:
        >>> # All tensors below are of torch.int64 dtype.
        >>> # We have 2 process groups, 2 ranks.
        >>> # xdoctest: +SKIP("incorrect want text")
        >>> output_tensor = torch.zeros(2, dtype=torch.int64)
        >>> output_tensor
        [tensor([0, 0])] # Rank 0 and 1
        >>> tensor = torch.arange(1, dtype=torch.int64) + 1 + rank
        >>> tensor
        tensor([1]) # Rank 0
        tensor([2]) # Rank 1
        >>> dist.all_gather_base(output_tensor, tensor)
        >>> output_tensor
        tensor([1,2]) # Rank 0
        tensor([1,2]) # Rank 1

    .. warning::
        `_all_gather_base` is experimental and subject to change.
        It is the caller's responsibility to ensure the output_tensor
        is correctly sized.

    )_AllGatherBaser   )output_tensorinput_tensorr   s      r   _all_gather_baser&   z   s    B |UCCr   c                 0    t        j                  || g| S )a  
    Each process scatters list of input tensors to all processes in a group and return gathered list of tensors in output list.

    Arguments:
        output_tensor_list (list[Tensor]): list of tensors to gather one per rank.
        input_tensor_list (list[Tensor]): List of tensors to scatter one per rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    )	_AlltoAllr   )output_tensor_listinput_tensor_listr   s      r   
all_to_allr+      s     ??5"4I7HIIr   c                 4    t         j                  || |||      S )a  
    Each process splits input tensor and then scatters the split list to all processes in a group.

    Then concatenate the received tensors from all the processes in the group and return single output tensor.

    Arguments:
        output (Tensor): Gathered concatenated output tensor.
        input (Tensor): Input tensor to scatter.
        output_split_sizes: (list[Int], optional): Output split sizes for dim 0
            if specified None or empty, dim 0 of ``output`` tensor must divide
            equally by ``world_size``.
        input_split_sizes: (list[Int], optional): Input split sizes for dim 0
            if specified None or empty, dim 0 of ``input`` tensor must divide
            equally by ``world_size``.

    Returns:
        Tensor: Output of the collective.

    )_AlltoAllSingler   )r   inputoutput_split_sizesinput_split_sizesr   s        r   all_to_all_singler1      s$    4   v)+<e r   c                 0    t         j                  |||       S )a&  
    Reduces the tensor data across all machines in such a way that all get the final result.

    After the call the returned tensor is going to be bitwise
    identical in all processes.

    Arguments:
        tensor (Tensor): Input of the collective.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective

    )
_AllReducer   )r	   r   r   s      r   
all_reducer4      s    $ Bv..r   c                   ,    e Zd Zed        Zed        Zy)r   c                     || _         || _        t        j                  |      | _        |j                         }t        j                  |||       |S Nr   )r
   r   distget_rankrankcloner   )ctxr
   r   r	   s       r   forwardz_Broadcast.forward   sB    	==u- vs%0r   c                     t         j                  | j                  t        j                  | j
                  |      }| j                  | j                  k7  r|j                          d d |fS N)r   r   r
   r   SUMr   r;   zero_)r=   grad_outputgxs      r   backwardz_Broadcast.backward   sH    ]]377HLL#))[I77chhHHJdBr   N__name__
__module____qualname__staticmethodr>   rE    r   r   r   r      s(         r   r   c                   ,    e Zd Zed        Zed        Zy)r   c                    || _         || _        t        t        j                  |            D cg c]  }t        j                  |       }}|j                         }t        j                  |      |k(  r$t        j                  ||||       t        |      S t        j                  |d ||       t        |      S c c}w r7   )r   r   ranger9   get_world_sizetorch
zeros_like
contiguousr:   r   tuple)r=   r   r   r	   itensor_lists         r   r>   z_Gather.forward   s    	 /4D4G4Ge4T.U
)*EV$
 
 ""$==u%,KKS> [!! KKc7[!!
s   B?c                 `    dt        j                  | j                  | j                  g| fz   S NNN)r   r   r   r   )r=   grad_outputss     r   rE   z_Gather.backward
  s(    x~~cggsyyP<PRRRr   NrF   rK   r   r   r   r      s*    " "$ S Sr   r   c                   ,    e Zd Zed        Zed        Zy)r   c                 ,   || _         || _        t        fdD              sJ t        j                  d         }t        j                  |      |k(  r$t        j                  |t              ||       |S t        j                  |d ||       |S )Nc              3   f   K   | ](  }|j                         d    j                         k(   * yw)r   N)size).0tr   s     r   	<genexpr>z#_Scatter.forward.<locals>.<genexpr>  s'     BQ1668wqz00Bs   .1r   r8   )	r
   r   allrP   rQ   r9   r:   r   list)r=   r
   r   r   r   s      ` r   r>   z_Scatter.forward  s    	B'BBBB!!'!*-==u%,LLg5A  LLs%8r   c                 ^    dt         j                  | j                  | j                  |      z   S rW   )r   r   r
   r   r=   rC   s     r   rE   z_Scatter.backward  s"    gmmCGGSYYLLLr   NrF   rK   r   r   r   r     s*    	 	 M Mr   r   c                   ,    e Zd Zed        Zed        Zy)r   c                 t    || _         || _        |j                         }t        j                  ||||       |S Nr   r   )r
   r   r<   r9   r   )r=   r
   r   r   r	   s        r   r>   z_Reduce.forward"  s2    	FCBe4r   c                 `    dt         j                  | j                  | j                  |      fz   S N)NNN)r   r   r
   r   rd   s     r   rE   z_Reduce.backward*  s'    !Z%5%5cggsyy+%V$XXXr   NrF   rK   r   r   r   r   !  s*      Y Yr   r   c                   ,    e Zd Zed        Zed        Zy)r   c                     || _         |j                         }t        d |D              }t        j                  |t        |      ||       |S )Nc              3   <   K   | ]  }|j                           y wr@   rR   r^   r_   s     r   r`   z*_Reduce_Scatter.forward.<locals>.<genexpr>5  s     !LQ!,,.!L   rh   )r   rR   rS   r9   r   rb   )r=   r   r   r	   r*   s        r   r>   z_Reduce_Scatter.forward0  sH    	""$!!L:K!LLFD):$;%Pr   c                 H    dt         j                  | j                  |      z   S rj   )r    r   r   rd   s     r   rE   z_Reduce_Scatter.backward9  s    !J$4$4SYY$LLLr   NrF   rK   r   r   r   r   /  s*      M Mr   r   c                   ,    e Zd Zed        Zed        Zy)r    c                     |j                         }|| _        t        t        j                  |            D cg c]  }t        j                  |       }}t        j                  |||       t        |      S c c}w r7   )	rR   r   rN   r9   rO   rP   
empty_liker!   rS   )r=   r   r	   _out_tensor_lists        r   r>   z_AllGather.forward?  sp     ""$	.3D4G4Ge4T.U
)*EV$
 
 	u=_%%
s   A:c                 :   t        j                  | j                        t         j                  j                  u rlt        j
                  | j                        }t        j                  ||         }t        j                  t        j                  | j                  |g| }d |fS |D cg c]  }t        j                  |       }}t        j                  | j                  |g| }t        j                  t        j                  |      d      }d |fS c c}w )Nr8   r   )dim)r9   get_backendr   BackendNCCLr:   rP   rt   r   r   r   rA   r(   sumstack)r=   rY   r;   rD   r	   rU   gxss          r   rE   z_AllGather.backwardL  s    #)),0A0AA==syy1D!!,t"45B &&x||SYYR\RB bz COO5++F3OKO//#))[H<HC5;;s+3Bbz Ps   +DNrF   rK   r   r   r    r    >  s(    
& 
&  r   r    c                   ,    e Zd Zed        Zed        Zy)r#   c                 `    || _         t        j                  ||j                         |       |S r7   )r   r9   r&   rR   )r=   r$   r%   r   s       r   r>   z_AllGatherBase.forward\  s*    	m\-D-D-FeTr   c                 L   t        j                  | j                        t         j                  j                  u rt        j
                  | j                        }t        |j                               }|d   |z  dk7  rt        d| d|       |d   t        j
                  | j                        z  |d<   t        j                  ||j                  |j                        }t        j                  ||t        j                  | j                         nt        d      d |d fS )Nr8   r   zTensor with dimensions: z8 does not have first dimension divisible by world_size: devicedtypezBackend not supported!)r9   ry   r   rz   r{   rO   rb   r]   RuntimeErrorrP   emptyr   r   _reduce_scatter_baser   rA   )r=   rC   
world_sizeout_sizerD   s        r   rE   z_AllGatherBase.backwardb  s    #)),0A0AA,,399=JK,,./H{Z'1,".xj 9IISV  #1+)<)<399)MMHQK!3!3;;L;LB %%b+x||SYYO788b$r   NrF   rK   r   r   r#   r#   [  s(     
    r   r#   c                   ,    e Zd Zed        Zed        Zy)r(   c                 b   || _         t        t        j                  |            D cg c]  }||   j	                          c}| _        t        j                  |      }t        d |D              }t        j                  |      t        j                  j                  u r]t        t        j                  |            D ]0  }d }||k(  rt        |      }t        j                  ||   |||       2 t        |      S t        j                  |t        |      |       t        |      S c c}w )Nr8   c              3   <   K   | ]  }|j                           y wr@   rn   ro   s     r   r`   z$_AlltoAll.forward.<locals>.<genexpr>~  s     818rp   )r   rN   r9   rO   r]   input_tensor_size_listr:   rS   ry   rz   GLOOrb   r   r+   )r=   r   rv   r   rT   my_rankto_sends          r   r>   z_AlltoAll.forwardw  s    	',T-@-@u-M'N&
"#GAJOO&
" --e,888%(DLL,=,==4..U;< J<"7mG_Q/!5I	J _%% OOW
 _%%%&
s   D,c           	          | j                   D cg c]4  }t        j                  ||d   j                  |d   j                        6 }}dt        j                  | j                  |g| z   S c c}w )Nr   r   rX   )r   rP   r   r   r   r(   r   r   )r=   rY   r]   rU   s       r   rE   z_AlltoAll.backward  ss     22	
  KK\!_33<?;P;P
 
 ioociiT|TTT
s   9A.NrF   rK   r   r   r(   r(   v  s*    & &, U Ur   r(   c                   ,    e Zd Zed        Zed        Zy)r-   c                     || _         |j                         | _        || _        || _        t        j                  |||||       |S )N)r/   r0   r   )r   r]   
input_sizer/   r0   r9   r1   )r=   r   r   r/   r0   r.   s         r   r>   z_AlltoAllSingle.forward  sJ    	!2 21/	
 r   c           	          t        j                  | j                  |j                  |j                        }dt
        j                  | j                  || j                  | j                  |j                               fz   S )Nr   )NNNN)rP   r   r   r   r   r-   r   r   r/   r0   rR   )r=   rC   r	   s      r   rE   z_AlltoAllSingle.backward  sq    NN;#5#5[=N=N
 (!!		&&%%&&(+
 
 	
r   NrF   rK   r   r   r-   r-     s(      
 
r   r-   c                   ,    e Zd Zed        Zed        Zy)r3   c                 r    || _         || _        |j                         }t        j                  |||       |S rg   )r   r   r<   r9   r4   )r=   r   r   r	   s       r   r>   z_AllReduce.forward  s0    	2U3r   c                 `    dt         j                  | j                  | j                  |      fz   S rW   )r3   r   r   r   rd   s     r   rE   z_AllReduce.backward  s'    z//		;OQQQr   NrF   rK   r   r   r3   r3     s*      R Rr   r3   )rP   torch.distributeddistributedr9   torch.autogradr   r   r   WORLDr   r   r   rA   r   r   r!   r&   r+   r1   r4   r   r   r   r   r   r    r#   r(   r-   r3   rK   r   r   <module>r      sF      #
 . "' 0(  - %++ 0( $<<u{{ 1* +3,,ekk A& #[[ + 9> !DH =BKK J& 
++> #,,ekk /*   (Sh S2Mx M$Yh YMh M : X  6 U  UF
h 
@R Rr   