
    sgc                     z   d dl Z d dlmZmZ d dlZd dlmZ d dlmZm	Z	 d dl
mZmZ  edd      Zej                  ej                  ej                   ej"                  gZej&                  ej(                  gZeD  ci c];  } |  ej,                  |       j.                   ej,                  |       j0                  f= c} Zej5                  eD  ci c]G  } |  e ej8                  |       j.                         e ej8                  |       j0                        fI c}        d Zej=                  d	        eed
d      dej>                  de dedededejB                  dej>                  fd       Z" eed
d      dej>                  de dedededejB                  dej>                  fd       Z#ej=                  d        eedd      dej>                  dej>                  dej>                  dededejB                  dej>                  fd       Z$ eedd      dej>                  dej>                  dej>                  dededejB                  dej>                  fd       Z%ej=                  d        eedd      dej>                  dej>                  dej>                  dej>                  dej>                  dejB                  dej>                  fd       Z& eedd      dej>                  dej>                  dej>                  dej>                  dej>                  dejB                  dej>                  fd       Z'ej=                  d        eedd      dd dej>                  de dedededejB                  d!eejB                     dej>                  fd"       Z( eedd      dd dej>                  dej>                  dej>                  dededejB                  d!eejB                     dej>                  fd#       Z)ej=                  d$        eed%d      dd dej>                  dej>                  dej>                  dededejB                  d!eejB                     dej>                  fd&       Z* eed%d      dd dej>                  dej>                  dej>                  dededejB                  d!eejB                     dej>                  fd'       Z+ej=                  d(        eed)d      dd dej>                  dej>                  dej>                  dej>                  dej>                  dejB                  d!eejB                     dej>                  fd*       Z, eed)d      dd d!eejB                     dej>                  fd+       Z-ej=                  d,        eed-d      dej>                  d.ed/ed0e dejB                  deej>                  ej>                  f   fd1       Z.ej=                  d2        eed3d      dej>                  d.ed/ed0e dejB                  deej>                  ej>                  f   fd4       Z/ eed-d      dej>                  deded0e dejB                  deej>                  ej>                  f   fd5       Z0 eed3d      dej>                  deded0e dejB                  deej>                  ej>                  f   fd6       Z1d7 Z2ej=                  d8        eed9d      dej>                  d:ej>                  d;ej>                  d<edededejB                  dej>                  fd=       Z3 eed9d      dej>                  d:ej>                  d;ej>                  d<edededejB                  dej>                  fd>       Z4ej=                  d?        eed@d      dd dej>                  d:ej>                  d;eej>                     d<edededejB                  d!eejB                     dej>                  fdA       Z5 eed@d      dd dej>                  d:ej>                  d;eej>                     d<edededejB                  d!eejB                     dej>                  fdB       Z6ej=                  dC        eedDd      dej>                  dejB                  deej>                  ej>                  f   fdE       Z7 eedDd      dej>                  dejB                  deej>                  ej>                  f   fdF       Z8ej=                  dG        eedHdI      dej>                  dejB                  deej>                  ej>                  f   fdJ       Z9ej=                  dK        eedLd      dej>                  dejB                  deej>                  ej>                  f   fdM       Z: eedLd      dej>                  dejB                  deej>                  ej>                  f   fdN       Z;dO Z<ej=                  dP        eedQd      dej>                  d:ej>                  d;ej>                  dededejB                  fdR       Z= eedQd      dej>                  d:ej>                  d;ej>                  dededejB                  fdS       Z>ej=                  dT        eedUd      ej~                  fdej>                  d:ej>                  d;ej>                  dededejB                  dVejB                  fdW       Z@ eedUd      ej~                  fdej>                  d:ej>                  d;ej>                  dededejB                  dVejB                  fdX       ZAej=                  dY        eedZd      	 djdej>                  d:ej>                  d;ej>                  dededejB                  fd\       ZB eedZd      	 djdej>                  d:ej>                  d;ej>                  dededejB                  fd]       ZCej=                  d^        eed_d      d[ej~                  fd`ej>                  d:ej>                  d;eej>                     dededejB                  daedVejB                  fdb       ZDej=                  dc        G dd deej                  j                        ZG eedfdg      dej>                  d:ej>                  d;ej>                  d<edededej>                  fdh       ZH eedfd      dej>                  d:ej>                  d;ej>                  d<edededej>                  fdi       ZIyc c} w c c} w )k    N)OptionalTuple)_unsqueeze_multiple)determine_qparamsvalidate_qmin_qmax)implLibraryquantized_decomposedDEFc                     |t         vrt        d|       t         |   \  }}| |k\  sJ d| d|         ||k  sJ d| d|        y )NzUnsupported dtype: z9quant_min out of bound for dtype, quant_min_lower_bound: z quant_min: z9quant_max out of bound for dtype, quant_max_upper_bound: z quant_max: )_DTYPE_TO_QVALUE_BOUNDS
ValueError)	quant_min	quant_maxdtypequant_min_lower_boundquant_max_upper_bounds        W/var/www/html/venv/lib/python3.12/site-packages/torch/ao/quantization/fx/_decomposed.py_quant_min_max_bounds_checkr      s    ++.ug6773J53Q00-- 	""7!8YK	Q-
 -- 	""7!8YK	Q-    zxquantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tensorCompositeExplicitAutogradinputscale
zero_pointr   r   r   returnc                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k(  sJ d| j                           t        |||       d|z  }t        j                  t        j                  | |z        |z   ||      j	                  |      S )a  Affine quantization for the Tensor using the same quantization parameters to map
    from floating point to quantized values

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scale (float): quantization parameter for affine quantization
       zero_point (int): quantization parameter for affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    <Expecting input to have dtype torch.float32, but got dtype:       ?)	r   torchfloat16bfloat16tofloat32r   clampround)r   r   r   r   r   r   	inv_scales          r   r   r   2   s    0 {{u}}enn55'u}}$T	Eekk]ST$	9e<eI;;EI%&3Y	bir   Metac                 (   | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k(  sJ d| j                           t        j                  | |      S )Nr   r   )r   r    r!   r"   r#   r$   
empty_liker   r   r   r   r   r   s         r   quantize_per_tensor_metar-   W   so     {{u}}enn55'u}}$T	Eekk]ST$E//r   zquantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensorc                    |j                         dk(  sJ d|j                                 |j                         dk(  sJ d|j                                 t        | |j                         |j                         |||      S zAffine quantization for the Tensor using the same quantization parameters to map
    from floating point to quantized values
    Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
    scalar values
       >Expecting zero_point tensor to be one element, but received : 9Expecting scale tensor to be one element, but received : numelr   itemr,   s         r   quantize_per_tensor_tensorr6   n   s    " 	a]	G
HXHXHZG[\] 	S	B5;;=/RSuzz|Z__.	9e r   c                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } |j                         dk(  sJ d|j                                 |j                         dk(  sJ d|j                                 | j                   t        j
                  k(  sJ d| j                           t        j                  | |      S )Nr0   r1   r2   r   r*   )r   r    r!   r"   r#   r$   r4   r+   r,   s         r   quantize_per_tensor_tensor_metar8      s     {{u}}enn55'a]	G
HXHXHZG[\] 	S	B5;;=/RS 	u}}$T	Eekk]ST$E//r   zquantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensor2c                 >   |j                         dk(  sJ d|j                                 |j                         dk(  sJ d|j                                 t        | |j                         |j                         |j                         |j                         |      S r/   r3   r,   s         r   quantize_per_tensor_tensor2r:      s    " 	a]	G
HXHXHZG[\] 	S	B5;;=/RS

 r   c                 "    t        | |||||      S N)r8   r,   s         r    quantize_per_tensor_tensor2_metar=      s     +uj)Y r   zdequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_tensor	out_dtyper@   c                    | j                   |k(  sJ d| d| j                           |t        j                  }|t        v r| j	                  |      |z
  |z  S t        d|       )a  Affine dequantization for the Tensor using the same quantization parameters to map
    from quantized values to floating point values

    Args:
       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
       e.g. (`torch.uint8`), it is a per tensor quantized Tensor if combined with
       quantization parameters in the argument of this function (scale/zero_point)

       scale (float): quantization parameter for affine quantization

       zero_point (int): quantization parameter for affine quantization

       quant_min (int): minimum quantized value for input Tensor (not used in computation,
       reserved for pattern matching)

       quant_max (int): maximum quantized value for input Tensor (not used in computation,
       reserved for pattern matching)

       dtype (torch.dtype): dtype for input Tensor (not used in computation,
       reserved for pattern matching)

       out_dtype (torch.dtype?): optional dtype for output Tensor

    Returns:
       dequantized float32 Tensor
    Expecting input to have dtype: z
, but got ,Unsupported dtype in dequantize_per_tensor: )r   r    r$   r   r#   r   r   r   r   r   r   r   r@   s          r   r>   r>      sz    L 	uH	(z%++GHMM	'' #j0E99GwOPPr   c                T    |t         j                  }t        j                  | |      S Nr*   )r    r$   r+   rD   s          r   dequantize_per_tensor_metarG     s&     MM	E33r   zdequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensorc          	      
   |j                         dk(  sJ d|j                                 |j                         dk(  sJ d|j                                 t        | |j                         |j                         ||||      S zAffine dequantization for the Tensor using the same quantization parameters to map
    from quantized values to floating point values
    Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
    scalar values
    r0   r1   r2   r?   r4   r>   r5   rD   s          r   dequantize_per_tensor_tensorrK   (  s    * 	a]	G
HXHXHZG[\] 	S	B5;;=/RS 

 r   c                ^   |t         j                  }|j                         dk(  sJ d|j                                 |j                         dk(  sJ d|j                                 | j                  |k(  s
J d|        |t        v rt        j
                  | |      S t        d|       )Nr0   r1   r2   rB   r*   rC   )r    r$   r4   r   r   r+   r   rD   s          r   !dequantize_per_tensor_tensor_metarM   M  s     MM	a]	G
HXHXHZG[\] 	S	B5;;=/RS;;%J#B5'!JJ''Y77GwOPPr   zdequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensor2c          	      B   |j                         dk(  sJ d|j                                 |j                         dk(  sJ d|j                                 t        | |j                         |j                         |j                         |j                         ||      S rI   rJ   rD   s          r   dequantize_per_tensor_tensor2rO   n  s    * 	a]	G
HXHXHZG[\] 	S	B5;;=/RS 

 r   c          	      &    t        | ||||||      S )Nr?   )rM   rD   s          r   "dequantize_per_tensor_tensor2_metarQ     s      -uj)Y r   zrchoose_qparams.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams.tensorqminqmaxepsc           
         | j                   t        j                  t        j                  t        j                  fv sJ d| j                           |t
        v sJ dt
        j                          d|        t        ||       t        j                  |       \  }}t        |||||t        j                  |g      d      S )[  Given an input Tensor, derive the per tensor affine quantization parameter
    (scale and zero_point) for target quantized Tensor from the Tensor

    Args:
       input (torch.Tensor): floating point input Tensor
       quant_min (int): minimum quantized value for target quantized Tensor
       quant_max (int): maximum quantized value for target quantized Tensor
       dtype (torch.dtype): dtype for target quantized Tensor

    Returns:
       scale (float): quantization parameter for the target quantized Tensor
       zero_point (int): quantization parameter for the target quantized Tensor
    CExpecting input to have dtype torch.float32/16/b16, but got dtype: $Expecting target dtype to be one of , but got: F)has_customized_qrange)r   r    r$   r!   r"   r   keysr   aminmaxr   Tensorr   rR   rS   rT   r   min_valmax_vals          r   choose_qparams_tensorra     s    " ;;  [ 
MU[[MZ	[  	((a	-.E.J.J.L-M[Y^X_`a(tT"}}U+GWcU# r   z|choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams_symmetric.tensorc           
         | j                   t        j                  t        j                  t        j                  fv sJ d| j                           |t
        v sJ dt
        j                          d|        t        ||       t        j                  |       \  }}t        |||||t        j                  |g      dt        j                        S )rV   rW   rX   rY   F)rZ   qscheme)r   r    r$   r!   r"   r   r[   r   r\   r   r]   per_tensor_symmetricr^   s          r   choose_qparams_symmetric_tensorre     s    * ;;  [ 
MU[[MZ	[  	((a	-.E.J.J.L-M[Y^X_`a(tT"}}U+GWcU#**	 	r   c                    | j                   t        j                  t        j                  t        j                  fv sJ d| j                           ||k  sJ d| d|        t        j
                  dt        j                  | j                        t        j
                  dt        j                  | j                        fS )NrW   zKExpecting quant_min to be smaller than quant_max but received min:         z max: r0   r   device)	r   r    r$   r!   r"   emptydoublerh   int64r   r   r   rT   r   s        r   choose_qparams_tensor_metarm     s     ;;  [ 
MU[[MZ	[  	I&
		6)&& ;;qU\\BEKK	U\\E  r   c                     t        j                  dt         j                  | j                        t        j                  dt         j                  | j                        fS )Nr0   rg   )r    ri   rj   rh   rk   rl   s        r   $choose_qparams_symmetric_tensor_metaro     sA     ;;qU\\BEKK	U\\E  r   c                     t        t        | j                                     }d||<   ||d<   | j                  t	        |            }||fS )Nr   )listrangedimpermutetuple)xaxisnew_axis_listys       r   _permute_to_axis_zerorz     sH    quuw(MM$M!			%&'Amr   zquantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_channelscaleszero_pointsrw   c                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k(  sJ d| j                           || j                         k  sJ d| j                                 t        |||       t        | |      \  } }dg| j                         z  }|j                  d   |d<   |j                  |      }|j                  |      }t        j                  t        j                  | d|z  z        |z   ||      }	|	j                  t        |            }
|
j	                  |      S )at  Affine per channel quantization for the Tensor using the same quantization
    parameters for each channel/axis to map from floating point to quantized values

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (torch.Tensor): a list of scale quantization parameter for
       affine quantization, one per channel
       zero_point (torch.Tensor): a list of zero_point quantization parameter for
       affine quantization, one per channel
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r   Expecting axis to be < r0   r   r   )r   r    r!   r"   r#   r$   rs   r   rz   shapeviewr%   r&   rt   ru   )r   r|   r}   rw   r   r   r   permute_axis_list	new_shaperesouts              r   r{   r{   -  s2   6 {{u}}enn55'u}}$T	Eekk]ST$%))+F!8FF	9e<4UDAEeiik!I<<?IaL[[#F""9-K
++ES6\*+k99iC ++e-.
/C66%=r   c                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k(  sJ d| j                           || j                         k  sJ d| j                                 t        |||       t        j                  | |      S )Nr   r   r*   )	r   r    r!   r"   r#   r$   rs   r   r+   )r   r|   r}   rw   r   r   r   s          r   quantize_per_channel_metar   ]  s     {{u}}enn55'u}}$T	Eekk]ST$%))+F!8FF	9e<E//r   zdequantize_per_channel(Tensor input, Tensor scales, Tensor? zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_channelc                   | j                   |k(  sJ d| d| j                           |t        j                  }|| j                         k  sJ d| j                                 t	        |||       t        | |      \  } }dg| j                         z  }	|j                  d   |	d<   |j                  |	      }|| |j                  |	      z
  |z  }
n| |z  }
|
j                  |      }
|
j                  t        |            }|S )a  Affine per channel dequantization for the Tensor using the same quantization
    parameters for each channel/axis to map from quantized values to floating point values

    Args:
       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
       e.g. (`torch.uint8`), it is a per channel quantized Tensor if combined with
       quantization parameter in the argument of this function (scales/zero_points/axis)

       scales (torch.Tensor): a list of scale quantization parameter for
       affine quantization, one per channel

       zero_points (torch.Tensor): a list of zero_point quantization parameter for
       affine quantization, one per channel

       quant_min (int): minimum quantized value for output Tensor (not used in computation,
       reserved for pattern matching)

       quant_max (int): maximum quantized value for output Tensor (not used in computation,
       reserved for pattern matching)

       dtype (torch.dtype): requested dtype for output Tensor (not used in computation,
       reserved for pattern matching)

       out_dtype (torch.dtype?): optional dtype for output Tensor

    Returns:
       dequantized float32 Tensor
    Expecting input to have dtype , but got dtype: r   r0   r   )r   r    r$   rs   r   rz   r   r   r#   rt   ru   )r   r|   r}   rw   r   r   r   r@   r   r   r   r   s               r   r   r   {  s
   R 	uN	'w.?}MNMM	%))+F!8FF	9e<4UDAEeiik!I<<?IaL[[#F{''	22f<fn
&&
C
++e-.
/CJr   c                   | j                   |k(  sJ d| d| j                           |t        j                  }|| j                         k  sJ d| j                                 t	        |||       t        j
                  | |      S )Nr   r   r   r*   )r   r    r$   rs   r   r+   )r   r|   r}   rw   r   r   r   r@   s           r   dequantize_per_channel_metar     s     	uN	'w.?}MNMM	%))+F!8FF	9e<E33r   zLchoose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)choose_qparams_per_tokenc                 p   | j                         j                  dd      }|j                  t        j                  k(  r|j                         }|t        j                  k(  rd}d|dz
  z  dz
  }nt        d|       |j                  d	      j                  |      }t        j                  |      }||fS )
  Choose quantization parameters for per token quantization. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): original float32/float16 Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor

    Returns:
        scales and zero_points, both float32 Tensors
    Trs   keepdim      r0   z/unsupported dtype in choose_qparams_per_token: gh㈵>min)absamaxr   r    r!   floatint8	Exceptionr%   div
zeros_like)r   r   r|   n_bitsr   r}   s         r   r   r     s    , YY["d3F||u}}$LLN 	 

&1*%)	=eWE
 	
 \\d\#''	2F""6*K;r   c                     d| j                  d      f}t        j                  |t        j                  | j                        t        j                  |t        j
                  | j                        fS Nr0   r   rg   sizer    ri   rj   rh   rk   r   r   r   s      r   choose_qparams_per_token_metar     S     uzz"~D;;t5<<Eu{{EKKH  r   z]_choose_qparams_per_token_asymmetric_impl(Tensor input, ScalarType dtype) -> (Tensor, Tensor))_choose_qparams_per_token_asymmetric_implCompositeImplicitAutogradc                    d\  }}t        j                  | dd      }t        j                  | dd      }t        j                  |t        j                  |            }t        j
                  |t        j                  |            }t        j                  t         j                        j                  }||z
  t        ||z
        z  }	|	j                  |      }	||	z  }
||	z  }||
z   }||z   }t        j                  ||z   dkD  ||
z
  ||z
        }t        j                  |||      j                         }|	j                  t         j                        |j                  t         j                        fS )r   )i   r   Tr   r   r   )r    aminr   r   r   maxfinfor$   rT   r   r%   wherer&   r#   )r   r   rR   rS   r_   r`   min_val_negmax_val_posrT   r   descaled_mindescaled_maxzero_point_from_min_errorzero_point_from_max_errorr   s                  r   r   r     sC   , JD$jjB5GjjB5G))GU%5%5g%>?K))GU%5%5g%>?K
++emm
$
(
(C ;&%t*<<EKKCK E &L&L $| 3 $| 3!$==A||J
 Zt4::<J88EMM"JMM%--$@@@r   zWchoose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)#choose_qparams_per_token_asymmetricc                     t        | |      S r<   )r   )r   r   s     r   r   r   F  s     5UEBBr   c                     d| j                  d      f}t        j                  |t        j                  | j                        t        j                  |t        j
                  | j                        fS r   r   r   s      r   (choose_qparams_per_token_asymmetric_metar   R  r   r   c                    t        j                  t        | j                               d d       }||j	                         k(  sJ d| d|j                                 ||j	                         k(  sJ d| d|j                                 y )Nr   znum_tokens: z	 scales: z zero_points: )mathprodrq   r   r4   )r   r|   r}   
num_tokenss       r   !_per_token_quant_qparam_dim_checkr   a  s    4

-cr23Jflln$;	j\6;;=/:;$ 	k''))E	j\0@0@0B/CDE)r   z}quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tokenc                     t        |||       t        | ||       | j                  d|z        j                  |      j	                         j                  ||      j                  |      } | S )a  Per token quantization for the Tensor using the quantization parameters to map
    from floating point to quantized values. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (float32 torch.Tensor): quantization parameter for per token affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r   )r   r   muladdr&   r%   r#   r   r|   r}   r   r   r   s         r   r   r   q  s^    6  	9e<%eV[A		#,	[			y)	$	E 
 Lr   c                 J    t        |||       t        j                  | |      S rF   r   r    r+   r   s         r   quantize_per_token_metar     s#      	9e<E//r   zdequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensordequantize_per_tokenoutput_dtypec                 8    | |z
  } | j                  |      |z  } | S )a  Per token dequantization for the Tensor using the quantization parameters to map
    from floating point to quantized values. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): quantized Tensor (uint8, int8 etc.)
       scales (float32 torch.Tensor): quantization parameter for per token affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
       quant_min (int): minimum quantized value for input Tensor
       quant_max (int): maximum quantized value for input Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

    Returns:
       dequantized Tensor with dtype `output_dtype`
    )r#   r   r|   r}   r   r   r   r   s          r   r   r     s&    8 KEHH\"V+ELr   c                 J    t        |||       t        j                  | |      S rF   r   r   s          r   dequantize_per_token_metar     s#      	9e<E66r   zquantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size) -> Tensorquantize_per_channel_group   c                 L   |dkD  sJ || j                   d   kD  r!|j                   d   dk(  r| j                   d   }| j                   d   |z  dk(  sJ | j                         dk(  sJ | j                  d|      }t        j                  |      j                         dk(  sJ |j                  dd      }|j                  dd      }|j                  d|z        j                  |      j                         j                  ||      j                  |      j                  |       }|S )Nr0   r   r   r   r   )r   rs   reshaper    isnansumr   r   r&   clamp_r#   
reshape_as)	r   r|   r}   r   r   r   
group_sizeto_quant
input_int8s	            r   r   r     s    >>EKKO#R(8A(=[[_
;;r?Z'1,,,99;! }}R,H;;x $$&!+++^^B"F%%b!,K 	S6\"	[				9	%	E	E	  r   c                     |dkD  sJ || j                   d   kD  r!|j                   d   dk(  r| j                   d   }| j                   d   |z  dk(  sJ | j                         dk(  sJ t        j                  | |      S )aX  Groupwise quantization within each channel for an 2-d Tensor using the quantization parameters
    to map from floating point to quantized values. This means for each row of a 2-d Tensor
    (M, N), we calculate scales/zero_points for each `group_size` elements
    and quantize every `group_size` elements with the same quantization parameter.
    The dimension for scales/zero_points will be (M * ceil(N, group_size),)

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r0   r   r   r   r*   )r   rs   r    r+   )r   r|   r}   r   r   r   r   s          r   quantize_per_channel_group_metar   	  s    8 >>EKKO#R(8A(=[[_
;;r?Z'1,,,99;!E//r   zdequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensordequantize_per_channel_groupw_int8r   c                     |dkD  sJ || j                   d   kD  r!|j                   d   dk(  r| j                   d   }| j                   d   |z  dk(  sJ | j                         dk(  sJ | j                  d|      }|j                  dd      }||j                  dd      }	n0t        j                  g t        j
                  |j                        }	|j                  |	      j                  |      j                  |       j                  |      }
|
S )a!  Groupwise dequantization within each channel for an 2-d Tensor using the quantization parameters
    to map from floating point to quantized values. This means for each row of a 2-d Tensor
    (M, N), we calculate scales/zero_points for each `group_size` elements
    and quantize every `group_size` elements with the same quantization parameter.
    The dimension for scales/zero_points will be (M * ceil(N, group_size),)

    Args:
       input (torch.Tensor): quantized Tensor (uint8/int8 etc.)
       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
       quant_min (int): minimum quantized value for input Tensor
       quant_max (int): maximum quantized value for input Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

    Returns:
       dequantized Tensor with dtype `output_dtype`
    r0   r   r   r   rg   )r   rs   r   r    zerosint32rh   subr   r   r#   )r   r|   r}   r   r   r   r   r   w_int8_groupedzpw_dqs              r   r   r   5  s    D >>FLL$$b)9Q)>\\"%
<<j(A---::<1^^B
3N^^B"F  Q'[[5;;v}}Eb!%%f-88@CCLQDKr   zyfake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max) -> Tensorc                   ,    e Zd Zed        Zed        Zy)FakeQuantPerChannelc                    |j                   t        j                  k7  r|j                  t        j                        }|j                   t        j                  k7  r|j                  t        j                        }|j                   t        j                  k(  sJ d|j                           ||j                         k  sJ d|j                                 t        t        d|            t        t        |dz   |j                              z   }t        ||      }t        ||      }	t        j                  |d|z  z        |	z   }
t        j                  |
||      |	z
  |z  }t        j                  |
|k\  |
|k        }| j                  |       |S )Nr   r   r   r0   r   )r   r    r$   r#   r   rs   rq   rr   ndimr   r&   r%   logical_andsave_for_backward)ctxr   r|   r}   rw   r   r   broadcast_dimsunsqueeze_scalesunsqueeze_zero_pointstempr   masks                r   forwardzFakeQuantPerChannel.forwardo  sO   <<5==(YYu}}-F+%..5KKK5==(	XI%++W	X(eiik!J%<UYY[M#JJ!eAtn-U4!8UZZ5P0QQ.v~F 3K P{{5C*:$:;<?TTKKi36KK   $)"3ty7HJd#
r   c                 4    | j                   \  }||z  d d d d d fS r<   )saved_tensors)r   gyr   s      r   backwardzFakeQuantPerChannel.backward  s&    ##Dy$dD$66r   N)__name__
__module____qualname__staticmethodr   r    r   r   r   r   n  s(     * 7 7r   r   fake_quant_per_channelAutogradc                 6    t         j                  | |||||      S r<   )r   applyr   r|   r}   rw   r   r   s         r   r   r     s$     $$v{D)Y r   c                 ,    t        j                  |       S r<   )r    r+   r   s         r   fake_quant_per_channel_metar     s     E""r   )r   )Jr   typingr   r   r    torch._refsr   torch.ao.quantization.utilsr   r   torch.libraryr   r	   quantized_decomposed_libuint8r   int16r   _INTEGER_DTYPESfloat8_e5m2float8_e4m3fn_FLOAT_DTYPESiinfor   r   r   updateintr   r   definer]   r   r   r   r-   r6   r8   r:   r=   r>   rG   rK   rM   rO   rQ   ra   re   rm   ro   rz   r{   r   r   r   r   r   r   r   r   r   r   r   r$   r   r   r   r   r   autogradFunctionr   r   r   )ks   0r   <module>r     sa    "  + M '
 ##95A ;;

EKKE""E$7$78 :I45AAKEKKN..//    DQRqQ[U[[^	 #kekk!n&8&8"9::R    @  57RS!<<!! ! 	!
 ! ;;! \\! T!H  5v>0<<00 0 	0
 0 ;;0 \\0 ?0    @ :<W<<<<  	
  ;; \\0  <fE0<<0<<0 0 	0
 0 ;;0 \\0 F0.   F ;=X<<<<  ||	
 || ;; \\:  =vF
<<
<<
 
 ||	

 ||
 ;;
 \\
 G
"   _  79TU (,/Q<</Q/Q /Q 	/Q
 /Q ;;/Q $/Q \\/Q V/Qd  7@ (,4<<4<<4 4 	4
 4 ;;4 $4 \\4 A4   _ " (,<<<<  	
  ;; $ \\
@  >G (,Q<<Q<<Q Q 	Q
 Q ;;Q $Q \\Q HQ4   e # (,<<<<  ||	
 || ;; $ \\
@  ?H (, $ \\ I   7  79TU$<<$"$*-$49$BG++$
5<<%&$ V$N   7 %
$<<$"$*-$49$BG++$
5<<%&$
$N  7@<<$'47>CLQKK
5<<%& A"  A6J<<$'47>CLQKK
5<<%& K   @  68ST,<<,LL, , 	,
 , , ;;, \\, U,^  6?0<<0LL0 0 	0
 0 0 ;;0 \\0 @0.   _  8:UV (,;<<;LL; %,,'; 	;
 ; ; ;;; $; \\; W;|  8&A (,4<<4LL4 %,,'4 	4
 4 4 ;;4 $4 \\4 B4*   R
 
 << ;;  5<<%& 
 F 

<<;; 5<<%&
   c
 /
(A<<(A;;(A 5<<%&(A
(AV   ]
 )
C<<C;;C 5<<%&C
C )

<<;; 5<<%&
E   @  46QR#<<#LL# # 	#
 # ;;# S#L  4f=	0<<	0LL	0 	0 		0
 	0 ;;	0 >	0   Y  68ST !&<<LL  	
  ;; ++ U@  6? !&7<<7LL7 7 	7
 7 ;;7 ++7 @7   A :<W !<<!LL! ! 	!
 ! ;;!!H  <fE "0<<"0LL"0 "0 	"0
 "0 ;;"0 F"0J   Z "  %+LL+LL+ %,,'+ 	+
 + ;;+ + +++
+\   .7%..11 7:  8*E
<<
LL
 
 	

 
 
 \\
 F
  8&A#<<#LL# # 	#
 # # \\# B#O$ Ss   :A t3At8