import warnings
from typing import List

import torch
from torch._utils import (
    _flatten_dense_tensors,
    _get_device_index,
    _handle_complex,
    _reorder_tensors_as,
    _take_tensors,
    _unflatten_dense_tensors,
)
from torch.cuda import nccl


def broadcast(tensor, devices=None, *, out=None):
    """Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
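
    Example (a minimal sketch; assumes at least two visible CUDA devices)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> x = torch.randn(5, device="cuda:0")
        >>> copies = comm.broadcast(x, devices=[0, 1])  # one copy of ``x`` per device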
    """
    tensor = _handle_complex(tensor)
    if not ((devices is None) ^ (out is None)):
        raise RuntimeError(
            "Exactly one of 'devices' and 'out' must be specified, "
            f"but got devices={devices} and out={out}"
        )
    if devices is not None:
        devices = [_get_device_index(d) for d in devices]
        return torch._C._broadcast(tensor, devices)
    else:
        return torch._C._broadcast_out(tensor, out)


def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcast a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number of synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of :attr:`tensors`, placed on :attr:`devices`.
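
    Example (illustrative sketch; assumes at least two visible CUDA devices)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> ts = [torch.randn(3, device="cuda:0"), torch.randn(7, device="cuda:0")]
        >>> per_device = comm.broadcast_coalesced(ts, devices=[0, 1])  # both tensors copied to each device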
    """
    devices = [_get_device_index(d) for d in devices]
    tensors = [_handle_complex(t) for t in tensors]
    return torch._C._broadcast_coalesced(tensors, devices, buffer_size)


def reduce_add(inputs, destination=None):
    """Sum tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output tensor
    will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
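
    Example (a minimal sketch; assumes at least two visible CUDA devices)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> a = torch.ones(3, device="cuda:0")
        >>> b = torch.ones(3, device="cuda:1")
        >>> total = comm.reduce_add([a, b], destination=0)  # tensor of 2s on cuda:0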
    T)optionalr   Ncpuz+reduce_add expects all inputs to be on GPUsxc              3   2   K   | ]  }t        |        y wNstr.0r    s     r   	<genexpr>zreduce_add.<locals>.<genexpr>]   s     6a3q66   c              3   2   K   | ]  }t        |        y wr"   r#   r%   s     r   r'   zreduce_add.<locals>.<genexpr>^   s     ;1A;r(   zinput z has invalid size: got z, but expected zLreduce_add expects destination to be on the same GPU with one of the tensors   )outputroot)devicenon_blocking)r   size	enumerater-   type
get_devicejoin
ValueErrorr   lenr
   is_availabler   
empty_likereducetoadd_)inputsdestination
input_size
root_indexiinpgotexpectedresultdestination_devicer   nonrootothers                r   
reduce_addrG   F   s    $K$?K!JJF# 	3zz%'V)VV'>>{*J88:#((6388:66Cxx;
;;H23%xjQ 	 Z
 	
 6{aay !!&"45F6
; M #\\&*<*C*C*H*H+V!*6!2FAa:o1FF
#gajmm%D '4 '
 
 QR[ 	PEKK(:NO	PM Gs   2G Gc                    | D cg c]  }g  }}g }g }t        |  D ]  }t        d |D              r2t        ||      }|j                  |       |j                  |d          Gt        ||      D ]2  \  }	}
|	j                  |
j                  r|
j                         n|
       4 |j                  |d   d           |D cg c]  }t        ||       }}t        | D ]U  }|D cg c]  }t        |       }}t        ||      }t        ||d         D ]  }
|j                  |
j                          W t        t        ||            S c c}w c c}w c c}w )a\  Sum tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
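
    Example (illustrative sketch; assumes at least two visible CUDA devices,
    with one list of tensors per device)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> dev0 = [torch.ones(3, device="cuda:0"), torch.ones(5, device="cuda:0")]
        >>> dev1 = [torch.ones(3, device="cuda:1"), torch.ones(5, device="cuda:1")]
        >>> sums = comm.reduce_add_coalesced([dev0, dev1], destination=0)  # two summed tensors on cuda:0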
    """
    dense_tensors: List[List[torch.Tensor]] = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse tensors first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)  # this will be sparse too
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]  # (num_gpus,)
        flat_result = reduce_add(flat_tensors, destination)
        for t in _unflatten_dense_tensors(flat_result, chunks[0]):
            # The unflattened tensors do not share storage, and the base flat
            # tensor is never exposed, so give them different version counters.
            output.append(t.data)
    return tuple(_reorder_tensors_as(output, ref_order))


def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None):
    """Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sums to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): A dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
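
    Example (a minimal sketch; assumes at least two visible CUDA devices)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> x = torch.arange(8.0, device="cuda:0").reshape(4, 2)
        >>> chunks = comm.scatter(x, devices=[0, 1])  # two (2, 2) chunks, one per device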
    """
    tensor = _handle_complex(tensor)
    if out is None:
        devices = [_get_device_index(d) for d in devices]
        return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
    else:
        if devices is not None:
            raise RuntimeError(
                "'devices' must not be specified when 'out' is specified, "
                f"but got devices={devices}"
            )
        if chunk_sizes is not None:
            raise RuntimeError(
                "'chunk_sizes' must not be specified when 'out' is specified, "
                f"but got chunk_sizes={chunk_sizes}"
            )
        return tuple(torch._C._scatter_out(tensor, out, dim, streams))


def gather(tensors, dim=0, destination=None, *, out=None):
    """Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
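
    Example (illustrative sketch; assumes at least two visible CUDA devices)::

        >>> import torch
        >>> from torch.nn.parallel import comm
        >>> parts = [torch.ones(2, 2, device="cuda:0"), torch.zeros(2, 2, device="cuda:1")]
        >>> merged = comm.gather(parts, dim=0, destination="cpu")  # (4, 2) tensor on CPU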
    """
    tensors = [_handle_complex(t) for t in tensors]
    if out is None:
        if destination == -1:
            warnings.warn(
                "Using -1 to represent CPU tensor is deprecated. Please use a "
                'device object or string instead, e.g., "cpu".',
                FutureWarning,
                stacklevel=2,
            )
        destination = _get_device_index(destination, allow_cpu=True, optional=True)
        return torch._C._gather(tensors, dim, destination)
    else:
        if destination is not None:
            raise RuntimeError(
                "'destination' must not be specified when 'out' is specified, "
                f"but got destination={destination}"
            )
        return torch._C._gather_out(tensors, out, dim)