from typing import Any, Optional

import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter, UninitializedBuffer, UninitializedParameter

from ._functions import SyncBatchNorm as sync_batch_norm
from .lazy import LazyModuleMixin
from .module import Module


__all__ = [
    "BatchNorm1d",
    "LazyBatchNorm1d",
    "BatchNorm2d",
    "LazyBatchNorm2d",
    "BatchNorm3d",
    "LazyBatchNorm3d",
    "SyncBatchNorm",
]


class _NormBase(Module):
    """Common base of _InstanceNorm and _BatchNorm."""

    _version = 2
    __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
    num_features: int
    eps: float
    momentum: Optional[float]
    affine: bool
    track_running_stats: bool

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        if self.track_running_stats:
            self.register_buffer(
                "running_mean", torch.zeros(num_features, **factory_kwargs)
            )
            self.register_buffer(
                "running_var", torch.ones(num_features, **factory_kwargs)
            )
            self.running_mean: Optional[Tensor]
            self.running_var: Optional[Tensor]
            self.register_buffer(
                "num_batches_tracked",
                torch.tensor(
                    0,
                    dtype=torch.long,
                    # num_batches_tracked is always integral, so only the
                    # device (not the dtype) from factory_kwargs applies here
                    **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
                ),
            )
            self.num_batches_tracked: Optional[Tensor]
        else:
            self.register_buffer("running_mean", None)
            self.register_buffer("running_var", None)
            self.register_buffer("num_batches_tracked", None)
        self.reset_parameters()

    def reset_running_stats(self) -> None:
        if self.track_running_stats:
            # running_mean/running_var/num_batches_tracked are only registered
            # when track_running_stats is enabled
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self) -> None:
        self.reset_running_stats()
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def extra_repr(self):
        return (
            "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
            "track_running_stats={track_running_stats}".format(**self.__dict__)
        )

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        version = local_metadata.get("version", None)

        if (version is None or version < 2) and self.track_running_stats:
            # at version 2: added num_batches_tracked buffer
            #               this should have a default value of 0
            num_batches_tracked_key = prefix + "num_batches_tracked"
            if num_batches_tracked_key not in state_dict:
                state_dict[num_batches_tracked_key] = (
                    self.num_batches_tracked
                    if self.num_batches_tracked is not None
                    and self.num_batches_tracked.device != torch.device("meta")
                    else torch.tensor(0, dtype=torch.long)
                )

        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            strict,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )


class _BatchNorm(_NormBase):
    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            # TODO: if statement only here to tell the jit to skip emitting
            # this when it is None
            if self.num_batches_tracked is not None:
                self.num_batches_tracked.add_(1)
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

        # Decide whether the mini-batch stats should be used for normalization
        # rather than the buffers. Mini-batch stats are used in training mode,
        # and in eval mode when buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated if they are to be tracked and we are in
        # training mode. Thus they only need to be passed when the update should
        # occur (i.e. in training mode when they are tracked), or when buffer
        # stats are used for normalization (i.e. in eval mode when buffers are
        # not None).
        return F.batch_norm(
            input,
            # If buffers are not to be tracked, ensure that they won't be updated
            self.running_mean
            if not self.training or self.track_running_stats
            else None,
            self.running_var if not self.training or self.track_running_stats else None,
            self.weight,
            self.bias,
            bn_training,
            exponential_average_factor,
            self.eps,
        )


class _LazyNormBase(LazyModuleMixin, _NormBase):
    weight: UninitializedParameter  # type: ignore[assignment]
    bias: UninitializedParameter  # type: ignore[assignment]

    def __init__(
        self,
        eps=1e-5,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            # affine and track_running_stats are hardcoded to False to
            # avoid creating tensors that will soon be overwritten.
            0,
            eps,
            momentum,
            False,
            False,
            **factory_kwargs,
        )
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = UninitializedParameter(**factory_kwargs)
            self.bias = UninitializedParameter(**factory_kwargs)
        if self.track_running_stats:
            self.running_mean = UninitializedBuffer(**factory_kwargs)
            self.running_var = UninitializedBuffer(**factory_kwargs)
            self.num_batches_tracked = torch.tensor(
                0,
                dtype=torch.long,
                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
            )

    def reset_parameters(self) -> None:
        if not self.has_uninitialized_params() and self.num_features != 0:
            super().reset_parameters()

    def initialize_parameters(self, input) -> None:  # type: ignore[override]
        if self.has_uninitialized_params():
            self.num_features = input.shape[1]
            if self.affine:
                assert isinstance(self.weight, UninitializedParameter)
                assert isinstance(self.bias, UninitializedParameter)
                self.weight.materialize((self.num_features,))
                self.bias.materialize((self.num_features,))
            if self.track_running_stats:
                self.running_mean.materialize((self.num_features,))
                self.running_var.materialize((self.num_features,))
            self.reset_parameters()


class BatchNorm1d(_BatchNorm):
    r"""Applies Batch Normalization over a 2D or 3D input.

    Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the standard-deviation is calculated via the biased estimator,
    equivalent to ``torch.var(input, unbiased=False)``. However, the value stored in the
    moving average of the standard-deviation is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.
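
    For instance, with ``momentum=None`` (cumulative averaging) the value
    stored in ``running_var`` after the first batch is exactly the unbiased
    batch variance; a small illustrative check::

        >>> m = nn.BatchNorm1d(3, momentum=None)
        >>> x = torch.randn(8, 3)
        >>> _ = m(x)
        >>> torch.allclose(m.running_var, x.var(0, unbiased=True))
        True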

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.
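
    As a concrete check of the rule above (an illustrative doctest): starting
    from ``running_mean = 0``, one training-mode forward pass with the default
    ``momentum=0.1`` stores :math:`0.9 \times 0 + 0.1 \times x_t`::

        >>> m = nn.BatchNorm1d(3)
        >>> x = torch.randn(8, 3)
        >>> _ = m(x)
        >>> torch.allclose(m.running_mean, 0.1 * x.mean(0))
        True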

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
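        >>> # after switching to eval mode, the running statistics accumulated
        >>> # during training are used instead of the current batch statistics
        >>> m = m.eval()
        >>> output = m(input)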
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.
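
    A minimal illustration (``num_features`` is inferred on the first call)::

        >>> m = nn.LazyBatchNorm1d()
        >>> _ = m(torch.randn(4, 7))
        >>> m.num_features
        7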

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm1d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class BatchNorm2d(_BatchNorm):
    r"""Applies Batch Normalization over a 4D input.

    4D is a mini-batch of 2D inputs
    with an additional channel dimension. Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.
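
    For example, each of the ``C`` channels below is normalized over its
    ``N * H * W = 20 * 35 * 45`` elements, so the per-channel mean of the
    output is numerically zero; a small illustrative check::

        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> y = m(torch.randn(20, 100, 35, 45))
        >>> bool(y.mean(dim=(0, 2, 3)).abs().max() < 1e-4)
        True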

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm2d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with an additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.
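
    For the example input below of size ``(20, 100, 35, 45, 10)``, each of the
    100 channels is therefore normalized over ``20 * 35 * 45 * 10 = 315,000``
    elements.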

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm3d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over an N-dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with an additional channel dimension, as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        process_group: Optional[Any] = None,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )
        self.process_group = process_group

    def _check_input_dim(self, input):
        if input.dim() < 2:
            raise ValueError(f"expected at least 2D input (got {input.dim()}D input)")

    def _check_non_zero_input_channels(self, input):
        if input.size(1) == 0:
            raise ValueError(
                "SyncBatchNorm number of input channels should be non-zero"
            )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum
        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

        # Decide whether the mini-batch stats should be used for normalization
        # rather than the buffers. Mini-batch stats are used in training mode,
        # and in eval mode when buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are only updated if they are to be tracked and we are in
        # training mode. Thus they only need to be passed when the update should
        # occur (i.e. in training mode when they are tracked), or when buffer
        # stats are used for normalization (i.e. in eval mode when buffers are
        # not None). If buffers are not to be tracked, ensure that they won't
        # be updated.
        running_mean = (
            self.running_mean
            if not self.training or self.track_running_stats
            else None
        )
        running_var = (
            self.running_var
            if not self.training or self.track_running_stats
            else None
        )

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = (
            bn_training
            and self.training
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )
        if need_sync:
            # currently only GPU/PrivateUse1 input is supported
            if input.device.type not in [
                "cuda",
                torch._C._get_privateuse1_backend_name(),
            ]:
                raise ValueError(
                    "SyncBatchNorm expected input tensor to be on GPU or "
                    f"{torch._C._get_privateuse1_backend_name()}"
                )

            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            need_sync = world_size > 1

        # fallback to framework BN when synchronization is not necessary
        if not need_sync:
            return F.batch_norm(
                input,
                running_mean,
                running_var,
                self.weight,
                self.bias,
                bn_training,
                exponential_average_factor,
                self.eps,
            )
        else:
            assert bn_training
            return sync_batch_norm.apply(
                input,
                self.weight,
                self.bias,
                running_mean,
                running_var,
                self.eps,
                exponential_average_factor,
                process_group,
                world_size,
            )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        Nqconfig)r   r*   nnmodules	batchnormrm   r   r   r   r   r   r   no_gradr"   r#   r$   r%   r&   rs   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechilds         r9   r   z$SyncBatchNorm.convert_sync_batchnorm6  s<   H fehh..88CCD!HH22##

**M }}]]_ 5+1==M()/M&5 *0)<)<M&(.(:(:M%060J0JM-%+__M"vy)(.%!002 	KD%$$c00F	 5 5s   2#E;;F)r]   r^   TTNNNrB   )r`   ra   rb   rc   rf   rh   r   ri   r   r)   rI   r   r   ry   classmethodr   rj   rk   s   @r9   r   r   X  s    dR $'$('+++ + 5/	+
 + "+  }+ 
+"WZV Z Zx < <r@   r   )typingr   r   r*   r   torch.nnr   ru   r   torch.nn.parameterr   r	   r
   
_functionsr   r   lazyr   r   r   __all__r   rm   r{   r   r   r   r   r   r   rK   r@   r9   <module>r      s        * U U 8 ! s
 s
l>
 >
B9$OY 9$xIT* ITXTmZ TDJN* JNZNmZ NDJN* JNZNmZ ND[J [r@   