
    qi                     v   d dl Z d dlZd dlmZ d dlmZmZ d dlmZm	Z	  e	dd      Z
ej                  ej                  ej                  ej                  ej                  gZej"                  ej$                  gZeD  ci c];  } |  ej(                  |       j*                   ej(                  |       j,                  f= c} Zej1                  eD  ci c]G  } |  e ej4                  |       j*                         e ej4                  |       j,                        fI c}        d Ze
j9                  d        ee
d	d
      dej:                  dededededej>                  dej:                  fd       Z  ee
d	d      dej:                  dededededej>                  dej:                  fd       Z!e
j9                  d        ee
dd
      dej:                  dej:                  dej:                  dededej>                  dej:                  fd       Z" ee
dd      dej:                  dej:                  dej:                  dededej>                  dej:                  fd       Z#e
j9                  d        ee
dd
      dej:                  dej:                  dej:                  dej:                  dej:                  dej>                  dej:                  fd       Z$ ee
dd      dej:                  dej:                  dej:                  dej:                  dej:                  dej>                  dej:                  fd       Z%e
j9                  d        ee
dd
      dddej:                  dededededej>                  d ej>                  dz  dej:                  fd!       Z& ee
dd      dddej:                  dej:                  dej:                  dededej>                  d ej>                  dz  dej:                  fd"       Z'e
j9                  d#        ee
d$d
      dddej:                  dej:                  dej:                  dededej>                  d ej>                  dz  dej:                  fd%       Z( ee
d$d      dddej:                  dej:                  dej:                  dededej>                  d ej>                  dz  dej:                  fd&       Z)e
j9                  d'        ee
d(d
      dddej:                  dej:                  dej:                  dej:                  dej:                  dej>                  d ej>                  dz  dej:                  fd)       Z* ee
d(d      ddd ej>                  dz  dej:                  fd*       Z+e
j9                  d+        ee
d,d
      dej:                  d-ed.ed/edej>                  de,ej:                  ej:                  f   fd0       Z-e
j9                  d1        ee
d2d
      dej:                  d-ed.ed/edej>                  de,ej:                  ej:                  f   fd3       Z. ee
d,d      dej:                  deded/edej>                  de,ej:                  ej:                  f   fd4       Z/ ee
d2d      dej:                  deded/edej>                  de,ej:                  ej:                  f   fd5       Z0d6 Z1e
j9                  d7        ee
d8d
      dej:                  d9ej:                  d:ej:                  d;edededej>                  dej:                  fd<       Z2 ee
d8d      dej:                  d9ej:                  d:ej:                  d;edededej>                  dej:                  fd=       Z3e
j9                  d>        ee
d?d
      dddej:                  d9ej:                  d:ej:                  dz  d;edededej>                  d ej>                  dz  dej:                  fd@       Z4 ee
d?d      dddej:                  d9ej:                  d:ej:                  dz  d;edededej>                  d ej>                  dz  dej:                  fdA       Z5e
j9                  dB        ee
dCd
      dej:                  dej>                  de,ej:                  ej:                  f   fdD       Z6 ee
dCd      dej:                  dej>                  de,ej:                  ej:                  f   fdE       Z7e
j9                  dF        ee
dGdH      dej:                  dej>                  de,ej:                  ej:                  f   fdI       Z8e
j9                  dJ        ee
dKd
      dej:                  dej>                  de,ej:                  ej:                  f   fdL       Z9 ee
dKd      dej:                  dej>                  de,ej:                  ej:                  f   fdM       Z:dN Z;e
j9                  dO        ee
dPd
      dej:                  d9ej:                  d:ej:                  dededej>                  fdQ       Z< ee
dPd      dej:                  d9ej:                  d:ej:                  dededej>                  fdR       Z=e
j9                  dS        ee
dTd
      ej|                  fdej:                  d9ej:                  d:ej:                  dededej>                  dUej>                  fdV       Z? ee
dTd      ej|                  fdej:                  d9ej:                  d:ej:                  dededej>                  dUej>                  fdW       Z@e
j9                  dX        ee
dYd
      	 dmdej:                  d9ej:                  d:ej:                  dededej>                  fd[       ZA ee
dYd      	 dmdej:                  d9ej:                  d:ej:                  dededej>                  fd\       ZBe
j9                  d]        ee
d^d
      dZej|                  fd_ej:                  d9ej:                  d:ej:                  dz  dededej>                  d`edUej>                  fda       ZCe
j9                  db        G dc ddej                  j                        ZF ee
dedf      dej:                  d9ej:                  d:ej:                  d;edededej:                  fdg       ZG ee
ded      dej:                  d9ej:                  d:ej:                  d;edededej:                  fdh       ZHe
j9                  di        ee
djd
      dej:                  dej>                  dej:                  fdk       ZI ee
djd      dej:                  dej>                  dej:                  fdl       ZJyc c} w c c} w )n    N)_unsqueeze_multiple)determine_qparamsvalidate_qmin_qmax)implLibraryquantized_decomposedDEFc                     |t         vrt        d|       t         |   \  }}| |k  rt        d| d|        ||kD  rt        d| d|       y )NzUnsupported dtype: z9quant_min out of bound for dtype, quant_min_lower_bound: z quant_min: z9quant_max out of bound for dtype, quant_max_upper_bound: z quant_max: )_DTYPE_TO_QVALUE_BOUNDS
ValueErrorAssertionError)	quant_min	quant_maxdtypequant_min_lower_boundquant_max_upper_bounds        j/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/ao/quantization/fx/_decomposed.py_quant_min_max_bounds_checkr      s    ++.ug6773J53Q00((&&;%<LU
 	

 ((&&;%<LU
 	
 )    zxquantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tensorCompositeExplicitAutogradinputscale
zero_pointr   r   r   returnc                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k7  rt        d| j                          t        |||       d|z  }t        j                  t        j                  | |z        |z   ||      j	                  |      S )a  Affine quantization for the Tensor using the same quantization parameters to map
    from floating point to quantized values

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scale (float): quantization parameter for affine quantization
       zero_point (int): quantization parameter for affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    <Expecting input to have dtype torch.float32, but got dtype:       ?)
r   torchfloat16bfloat16tofloat32r   r   clampround)r   r   r   r   r   r   	inv_scales          r   r   r   2   s    0 {{u}}enn55'{{emm#J5;;-X
 	
  	9e<eI;;EI%&3Y	bir   Metac                 0   | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k7  rt        d| j                          t        j                  | |      S )Nr   r   )r   r   r    r!   r"   r#   r   
empty_liker   r   r   r   r   r   s         r   quantize_per_tensor_metar,   X   so     {{u}}enn55'{{emm#J5;;-X
 	
 E//r   zquantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensorc                    |j                         dk7  rt        d|j                                |j                         dk7  rt        d|j                                t        | |j                         |j                         |||      S zAffine quantization for the Tensor using the same quantization parameters to map
    from floating point to quantized values
    Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
    scalar values
       >Expecting zero_point tensor to be one element, but received : 9Expecting scale tensor to be one element, but received : numelr   r   itemr+   s         r   quantize_per_tensor_tensorr5   p   s      QLZM]M]M_L`a
 	
 {{}GW
 	
 

 r   c                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } |j                         dk7  rt        d|j                                |j                         dk7  rt        d|j                                | j                   t        j
                  k7  rt        d| j                          t        j                  | |      S )Nr/   r0   r1   r   r)   )	r   r   r    r!   r"   r#   r3   r   r*   r+   s         r   quantize_per_tensor_tensor_metar7      s     {{u}}enn55'QLZM]M]M_L`a
 	
 {{}GW
 	
 {{emm#J5;;-X
 	
 E//r   zquantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensorzquantize_per_tensor.tensor2c                 N   |j                         dk7  rt        d|j                                |j                         dk7  rt        d|j                                t        | |j                         |j                         |j                         |j                         |      S r.   r2   r+   s         r   quantize_per_tensor_tensor2r9      s      QLZM]M]M_L`a
 	
 {{}GW
 	
 

 r   c                 "    t        | |||||      S N)r7   r+   s         r    quantize_per_tensor_tensor2_metar<      s#     + r   zdequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_tensor	out_dtyper?   c                    | j                   |k7  rt        d| d| j                          |t        j                  }|t        v r| j                  |      |z
  |z  S t        d|       )a  Affine dequantization for the Tensor using the same quantization parameters to map
    from quantized values to floating point values

    Args:
       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
       e.g. (`torch.uint8`), it is a per tensor quantized Tensor if combined with
       quantization parameters in the argument of this function (scale/zero_point)

       scale (float): quantization parameter for affine quantization

       zero_point (int): quantization parameter for affine quantization

       quant_min (int): minimum quantized value for input Tensor (not used in computation,
       reserved for pattern matching)

       quant_max (int): maximum quantized value for input Tensor (not used in computation,
       reserved for pattern matching)

       dtype (torch.dtype): dtype for input Tensor (not used in computation,
       reserved for pattern matching)

       out_dtype (torch.dtype?): optional dtype for output Tensor

    Returns:
       dequantized float32 Tensor
    Expecting input to have dtype: 
, but got ,Unsupported dtype in dequantize_per_tensor: )r   r   r   r#   r   r"   r   r   r   r   r   r   r   r?   s          r   r=   r=      sz    J {{e-eWJu{{mL
 	
 MM	'' #j0E99GwOPPr   c                T    |t         j                  }t        j                  | |      S Nr)   )r   r#   r*   rD   s          r   dequantize_per_tensor_metarG   &  s&     MM	E33r   zdequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensorc          	         |j                         dk7  rt        d|j                                |j                         dk7  rt        d|j                                t        | |j                         |j                         ||||      S zAffine dequantization for the Tensor using the same quantization parameters to map
    from quantized values to floating point values
    Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
    scalar values
    r/   r0   r1   r>   r3   r   r=   r4   rD   s          r   dequantize_per_tensor_tensorrK   <  s    ( QLZM]M]M_L`a
 	
 {{}GW
 	
 !

 r   c                   |t         j                  }|j                         dk7  rt        d|j                                |j                         dk7  rt        d|j                                | j                  |k7  rt        d| d| j                         |t
        v rt        j                  | |      S t        d|       )Nr/   r0   r1   rA   rB   r)   rC   )r   r#   r3   r   r   r   r*   r   rD   s          r   !dequantize_per_tensor_tensor_metarM   c  s     MM	QLZM]M]M_L`a
 	
 {{}GW
 	
 {{e-eWJu{{mL
 	
 ''Y77GwOPPr   zdequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensorzdequantize_per_tensor.tensor2c          	      R   |j                         dk7  rt        d|j                                |j                         dk7  rt        d|j                                t        | |j                         |j                         |j                         |j                         ||      S rI   rJ   rD   s          r   dequantize_per_tensor_tensor2rO     s    ( QLZM]M]M_L`a
 	
 {{}GW
 	
 !

 r   c          	      &    t        | ||||||      S )Nr>   )rM   rD   s          r   "dequantize_per_tensor_tensor2_metarQ     s      -uj)Y r   zrchoose_qparams.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams.tensorqminqmaxepsc           
         | j                   t        j                  t        j                  t        j                  fvrt        d| j                          |t        vr#t        dt        j                          d|       t        ||       t        j                  |       \  }}t        |||||t        j                  |g      d      S )[  Given an input Tensor, derive the per tensor affine quantization parameter
    (scale and zero_point) for target quantized Tensor from the Tensor

    Args:
       input (torch.Tensor): floating point input Tensor
       quant_min (int): minimum quantized value for target quantized Tensor
       quant_max (int): maximum quantized value for target quantized Tensor
       dtype (torch.dtype): dtype for target quantized Tensor

    Returns:
       scale (float): quantization parameter for the target quantized Tensor
       zero_point (int): quantization parameter for the target quantized Tensor
    CExpecting input to have dtype torch.float32/16/b16, but got dtype: $Expecting target dtype to be one of , but got: F)has_customized_qrange)r   r   r#   r    r!   r   r   keysr   aminmaxr   Tensorr   rR   rS   rT   r   min_valmax_vals          r   choose_qparams_tensorra     s    " {{ 
 QRWR]R]Q^_
 	
 ++23J3O3O3Q2RR]^c]de
 	
 tT"}}U+GWcU# r   z|choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)zchoose_qparams_symmetric.tensorc           
         | j                   t        j                  t        j                  t        j                  fvrt        d| j                          |t        vr#t        dt        j                          d|       t        ||       t        j                  |       \  }}t        |||||t        j                  |g      dt        j                        S )rV   rW   rX   rY   F)rZ   qscheme)r   r   r#   r    r!   r   r   r[   r   r\   r   r]   per_tensor_symmetricr^   s          r   choose_qparams_symmetric_tensorre     s    * {{ 
 QRWR]R]Q^_
 	
 ++23J3O3O3Q2RR]^c]de
 	
 tT"}}U+GWcU#**	 	r   c                    | j                   t        j                  t        j                  t        j                  fvrt        d| j                          ||k\  rt        d| d|       t        j                  dt        j                  | j                        t        j                  dt        j                  | j                        fS )NrW   zCExpecting quant_min to be smaller than quant_max but received min: z max: r/   r   device)
r   r   r#   r    r!   r   emptydoublerh   int64r   r   r   rT   r   s        r   choose_qparams_tensor_metarm   (  s     {{ 
 QRWR]R]Q^_
 	
 IQR[Q\\bclbmn
 	
 ;;qU\\BEKK	U\\E  r   c                     t        j                  dt         j                  | j                        t        j                  dt         j                  | j                        fS )Nr/   rg   )r   ri   rj   rh   rk   rl   s        r   $choose_qparams_symmetric_tensor_metaro   =  sA     ;;qU\\BEKK	U\\E  r   c                     t        t        | j                                     }d||<   ||d<   | j                  t	        |            }||fS )Nr   )listrangedimpermutetuple)xaxisnew_axis_listys       r   _permute_to_axis_zerorz   G  sH    quuw(MM$M!			%&'Amr   zquantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_channelscaleszero_pointsrw   c                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k7  rt        d| j                          || j                         k\  rt        d| j                                t        |||       t        | |      \  } }dg| j                         z  }|j                  d   |d<   |j                  |      }|j                  |      }t        j                  t        j                  | d|z  z        |z   ||      }	|	j                  t        |            }
|
j	                  |      S )at  Affine per channel quantization for the Tensor using the same quantization
    parameters for each channel/axis to map from floating point to quantized values

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (torch.Tensor): a list of scale quantization parameter for
       affine quantization, one per channel
       zero_point (torch.Tensor): a list of zero_point quantization parameter for
       affine quantization, one per channel
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r   Expecting axis to be < r/   r   r   )r   r   r    r!   r"   r#   r   rs   r   rz   shapeviewr$   r%   rt   ru   )r   r|   r}   rw   r   r   r   permute_axis_list	new_shaperesouts              r   r{   r{   U  s2   6 {{u}}enn55'{{emm#J5;;-X
 	
 uyy{6uyy{mDEE	9e<4UDAEeiik!I<<?IaL[[#F""9-K
++ES6\*+k99iC ++e-.
/C66%=r   c                    | j                   t        j                  t        j                  fv r| j	                  t        j
                        } | j                   t        j
                  k7  rt        d| j                          || j                         k\  rt        d| j                                t        |||       t        j                  | |      S )Nr   r   r)   )
r   r   r    r!   r"   r#   r   rs   r   r*   )r   r|   r}   rw   r   r   r   s          r   quantize_per_channel_metar     s     {{u}}enn55'{{emm#J5;;-X
 	
 uyy{6uyy{mDEE	9e<E//r   zdequantize_per_channel(Tensor input, Tensor scales, Tensor? zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensordequantize_per_channelc                   | j                   |k7  rt        d| d| j                          |t        j                  }|| j	                         k\  rt        d| j	                                t        |||       t        | |      \  } }dg| j	                         z  }	|j                  d   |	d<   |j                  |	      }|| |j                  |	      z
  |z  }
n| |z  }
|
j                  |      }
|
j                  t        |            }|S )a  Affine per channel dequantization for the Tensor using the same quantization
    parameters for each channel/axis to map from quantized values to floating point values

    Args:
       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
       e.g. (`torch.uint8`), it is a per channel quantized Tensor if combined with
       quantization parameter in the argument of this function (scales/zero_points/axis)

       scales (torch.Tensor): a list of scale quantization parameter for
       affine quantization, one per channel

       zero_points (torch.Tensor): a list of zero_point quantization parameter for
       affine quantization, one per channel

       quant_min (int): minimum quantized value for output Tensor (not used in computation,
       reserved for pattern matching)

       quant_max (int): maximum quantized value for output Tensor (not used in computation,
       reserved for pattern matching)

       dtype (torch.dtype): requested dtype for output Tensor (not used in computation,
       reserved for pattern matching)

       out_dtype (torch.dtype?): optional dtype for output Tensor

    Returns:
       dequantized float32 Tensor
    rA   , but got dtype: r   r/   r   )r   r   r   r#   rs   r   rz   r   r   r"   rt   ru   )r   r|   r}   rw   r   r   r   r?   r   r   r   r   s               r   r   r     s
   P {{e-eW4Eekk]S
 	
 MM	uyy{6uyy{mDEE	9e<4UDAEeiik!I<<?IaL[[#F{''	22f<fn
&&
C
++e-.
/CJr   c                    | j                   |k7  rt        d| d| j                          |t        j                  }|| j	                         k\  rt        d| j	                                t        |||       t        j                  | |      S )NzExpecting input to have dtype r   r   r)   )r   r   r   r#   rs   r   r*   )r   r|   r}   rw   r   r   r   r?   s           r   dequantize_per_channel_metar     s     {{e,UG3DU[[MR
 	
 MM	uyy{6uyy{mDEE	9e<E33r   zLchoose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)choose_qparams_per_tokenc                 p   | j                         j                  dd      }|j                  t        j                  k(  r|j                         }|t        j                  k(  rd}d|dz
  z  dz
  }nt        d|       |j                  d	      j                  |      }t        j                  |      }||fS )
  Choose quantization parameters for per token quantization. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): original float32/float16 Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor

    Returns:
        scales and zero_points, both float32 Tensors
    Trs   keepdim      r/   z/unsupported dtype in choose_qparams_per_token: gh㈵>min)absamaxr   r   r    floatint8	Exceptionr$   div
zeros_like)r   r   r|   n_bitsr   r}   s         r   r   r     s    , YY["d3F||u}}$LLN 	 

&1*%)	=eWE
 	
 \\d\#''	2F""6*K;r   c                     t        | j                  d d       dgz   }t        j                  |t        j                  | j
                        t        j                  |t        j                  | j
                        fS Nr   r/   rg   rq   r   r   ri   rj   rh   rk   r   r   sizes      r   choose_qparams_per_token_metar   -  ]     CR !QC'D;;t5<<Eu{{EKKH  r   z]_choose_qparams_per_token_asymmetric_impl(Tensor input, ScalarType dtype) -> (Tensor, Tensor))_choose_qparams_per_token_asymmetric_implCompositeImplicitAutogradc                    d\  }}t        j                  | dd      }t        j                  | dd      }t        j                  |t        j                  |            }t        j
                  |t        j                  |            }t        j                  t         j                        j                  }||z
  t        ||z
        z  }	|	j                  |      }	||	z  }
||	z  }||
z   }||z   }t        j                  ||z   dkD  ||
z
  ||z
        }t        j                  |||      j                         }|	j                  t         j                        |j                  t         j                        fS )r   )i   r   Tr   r   r   )r   aminr   r   r   maxfinfor#   rT   r   r$   wherer%   r"   float64rk   )r   r   rR   rS   r_   r`   min_val_negmax_val_posrT   r   descaled_mindescaled_maxzero_point_from_min_errorzero_point_from_max_errorr   s                  r   r   r   A  sC   , JD$jjB5GjjB5G))GU%5%5g%>?K))GU%5%5g%>?K
++emm
$
(
(C ;&%t*<<EKKCK E &L&L $| 3 $| 3!$==A||J
 Zt4::<J88EMM"JMM%++$>>>r   zWchoose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)#choose_qparams_per_token_asymmetricc                     t        | |      S r;   )r   r   r   s     r   r   r   v  s     5UEBBr   c                     t        | j                  d d       dgz   }t        j                  |t        j                  | j
                        t        j                  |t        j                  | j
                        fS r   r   r   s      r   (choose_qparams_per_token_asymmetric_metar     r   r   c                 *   t        j                  t        | j                               d d       }||j	                         k7  rt        d| d|j                                ||j	                         k7  rt        d| d|j                                y )Nr   znum_tokens: z	 scales: z zero_points: )mathprodrq   r   r3   r   )r   r|   r}   
num_tokenss       r   !_per_token_quant_qparam_dim_checkr     s    4

-cr23JV\\^#|J<yPQQ[&&((:,n[5E5E5G4HI
 	
 )r   z}quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensorquantize_per_tokenc                     t        |||       t        | ||       | j                  d|z        j                  |      j	                         j                  ||      j                  |      } | S )a  Per token quantization for the Tensor using the quantization parameters to map
    from floating point to quantized values. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (float32 torch.Tensor): quantization parameter for per token affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r   )r   r   muladdr%   r$   r"   r   r|   r}   r   r   r   s         r   r   r     s^    6  	9e<%eV[A		#,	[			y)	$	E 
 Lr   c                 J    t        |||       t        j                  | |      S rF   r   r   r*   r   s         r   quantize_per_token_metar     s#      	9e<E//r   zdequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensordequantize_per_tokenoutput_dtypec                 8    | |z
  } | |z  } | j                  |      S )a  Per token dequantization for the Tensor using the quantization parameters to map
    from floating point to quantized values. This means for a N dimension Tensor
    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
    every N elements with the same quantization parameter. The dimension for scales/zero_points
    will be (M1 * M2 ... * Mn)

    Args:
       input (torch.Tensor): quantized Tensor (uint8, int8 etc.)
       scales (float64 torch.Tensor): quantization parameter for per token affine quantization
       zero_points (int64 torch.Tensor): quantization parameter for per token affine quantization
       quant_min (int): minimum quantized value for input Tensor
       quant_max (int): maximum quantized value for input Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

    Returns:
       dequantized Tensor with dtype `output_dtype`
    )r"   r   r|   r}   r   r   r   r   s          r   r   r     s&    8 KEFNE88L!!r   c                 J    t        |||       t        j                  | |      S rF   r   r   s          r   dequantize_per_token_metar     s#      	9e<E66r   zquantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size) -> Tensorquantize_per_channel_group   c                    |dk  rt        d      || j                  d   kD  r!|j                  d   dk(  r| j                  d   }| j                  d   |z  dk7  rt        d      | j                         dk7  rt        d      | j                  d|      }t	        j
                  |      j                         dk7  rt        d      |j                  dd      }|j                  dd      }|j                  d	|z        j                  |      j                         j                  ||      j                  |      j                  |       }|S )
Nr/   group_size must be > 1r   r   /input.shape[-1] must be divisible by group_sizer   input must be 2-dimensionalzto_quant must not contain NaNsr   )r   r   rs   reshaper   isnansumr   r   r%   clamp_r"   
reshape_as)	r   r|   r}   r   r   r   
group_sizeto_quant
input_int8s	            r   r   r     s(    Q566EKKO#R(8A(=[[_
{{2#q(NOOyy{a:;; }}R,H{{8  "a'=>>^^B"F%%b!,K 	S6\"	[				9	%	E	E	  r   c                 2   |dk  rt        d      || j                  d   kD  r!|j                  d   dk(  r| j                  d   }| j                  d   |z  dk7  rt        d      | j                         dk7  rt        d      t        j                  | |      S )	aX  Groupwise quantization within each channel for an 2-d Tensor using the quantization parameters
    to map from floating point to quantized values. This means for each row of a 2-d Tensor
    (M, N), we calculate scales/zero_points for each `group_size` elements
    and quantize every `group_size` elements with the same quantization parameter.
    The dimension for scales/zero_points will be (M * ceil(N, group_size),)

    Args:
       input (torch.Tensor): original float32 or bfloat16 Tensor
       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
       quant_min (int): minimum quantized value for output Tensor
       quant_max (int): maximum quantized value for output Tensor
       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor

    Returns:
       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
       are not stored in the Tensor, we are storing them in function arguments instead
    r/   r   r   r   r   r   r   r)   )r   r   rs   r   r*   )r   r|   r}   r   r   r   r   s          r   quantize_per_channel_group_metar   >  s    8 Q566EKKO#R(8A(=[[_
{{2#q(NOOyy{a:;;E//r   zdequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensordequantize_per_channel_groupw_int8r   c                 V   |dk  rt        d      || j                  d   kD  r!|j                  d   dk(  r| j                  d   }| j                  d   |z  dk7  rt        d      | j                         dk7  rt        d      | j                  d|      }|j                  dd      }||j                  dd      }	n0t	        j
                  g t        j                  |j                        }	|j                  |	      j                  |      j                  |       j                  |      }
|
S )	a!  Groupwise dequantization within each channel for an 2-d Tensor using the quantization parameters
    to map from floating point to quantized values. This means for each row of a 2-d Tensor
    (M, N), we calculate scales/zero_points for each `group_size` elements
    and quantize every `group_size` elements with the same quantization parameter.
    The dimension for scales/zero_points will be (M * ceil(N, group_size),)

    Args:
       input (torch.Tensor): quantized Tensor (uint8/int8 etc.)
       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
       quant_min (int): minimum quantized value for input Tensor
       quant_max (int): maximum quantized value for input Tensor
       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor

    Returns:
       dequantized Tensor with dtype `output_dtype`
    r/   r   r   r   z0w_int8.shape[-1] must be divisible by group_sizer   zw_int8 must be 2-dimensionalrg   )r   r   rs   r   r   zerosint32rh   subr   r   r"   )r   r|   r}   r   r   r   r   r   w_int8_groupedzpw_dqs              r   r   r   m  s   D Q566FLL$$b)9Q)>\\"%
||B*$)OPPzz|q;<<^^B
3N^^B"F  Q'[[5;;v}}Eb!%%f-88@CCLQDKr   zyfake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max) -> Tensorc                   ,    e Zd Zed        Zed        Zy)FakeQuantPerChannelc                 ,   |j                   t        j                  k7  r|j                  t        j                        }|j                   t        j                  k7  r|j                  t        j                        }|j                   t        j                  k7  rt        d|j                          ||j                         k\  rt        d|j                                t        t        |            t        t        |dz   |j                              z   }t        ||      }t        ||      }	t        j                  |d|z  z        |	z   }
t        j                  |
||      |	z
  |z  }t        j                  |
|k\  |
|k        }| j                  |       |S )Nr   r   r/   r   )r   r   r#   r"   r   r   rs   rq   rr   ndimr   r%   r$   logical_andsave_for_backward)ctxr   r|   r}   rw   r   r   broadcast_dimsunsqueeze_scalesunsqueeze_zero_pointstempr   masks                r   forwardzFakeQuantPerChannel.forward  sO    <<5==(YYu}}-F+%..5K;;%--' Nu{{m\  599; #:599;-!HIIeDk*T%q%**2M-NN.v~F 3K P{{5C*:$:;<?TTKKi36KK   $)"3ty7HJd#
r   c                 4    | j                   \  }||z  d d d d d fS r;   )saved_tensors)r   gyr   s      r   backwardzFakeQuantPerChannel.backward  s(     ##Dy$dD$66r   N)__name__
__module____qualname__staticmethodr   r    r   r   r   r     s(     . 7 7r   r   fake_quant_per_channelAutogradc                 6    t         j                  | |||||      S r;   )r   applyr   r|   r}   rw   r   r   s         r   r   r     s$     $$v{D)Y r   c                 ,    t        j                  |       S r;   r   r*   r   s         r   fake_quant_per_channel_metar    s     E""r   zFconvert_element_type.no_fuse(Tensor input, ScalarType dtype) -> Tensorzconvert_element_type.no_fusec                 j    t         j                  j                  j                  j	                  | |      S r;   )r   opsprimsconvert_element_typedefaultr   s     r   r  r    s%     99??//77uEEr   c                 0    t        j                  | |      S rF   r  r   s     r   convert_element_type_metar
    s    E//r   )r   )Kr   r   torch._refsr   torch.ao.quantization.utilsr   r   torch.libraryr   r   quantized_decomposed_libuint8r   uint16int16r   _INTEGER_DTYPESfloat8_e5m2float8_e4m3fn_FLOAT_DTYPESiinfor   r   r   updateintr   r   definer]   r   r   r   r,   r5   r7   r9   r<   r=   rG   rK   rM   rO   rQ   ru   ra   re   rm   ro   rz   r{   r   r   r   r   r   r   r   r   r   r   r   r#   r   r   r   r   r   autogradFunctionr   r   r  r  r
  )ks   0r   <module>r     s     + M '
 ##95A ;;

ELL%++u{{S""E$7$78 :I45AAKEKKN..//    DQRqQ[U[[^	 #kekk!n&8&8"9::R
$   @  57RS"<<"" " 	"
 " ;;" \\" T"J  5v>0<<00 0 	0
 0 ;;0 \\0 ?0"   @ :<W<<<<  	
  ;; \\>  <fE0<<0<<0 0 	0
 0 ;;0 \\0 F04   F ;=X<<<<  ||	
 || ;; \\>  =vF<<<<  ||	
 || ;; \\ G,   _  79TU %)0Q<<0Q0Q 0Q 	0Q
 0Q ;;0Q {{T!0Q \\0Q V0Qf  7@ %)4<<4<<4 4 	4
 4 ;;4 {{T!4 \\4 A4   _ " %)<<<<  	
  ;; {{T! \\
D  >G %)Q<<Q<<Q Q 	Q
 Q ;;Q {{T!Q \\Q HQ>   e # %)<<<<  ||	
 || ;; {{T! \\
D  ?H %) {{T! \\ I   7  79TU(<<("(*-(49(BG++(
5<<%&( V(V   7 %
(<<("(*-(49(BG++(
5<<%&(
(V  7@<<$'47>CLQKK
5<<%& A(  A6J<<$'47>CLQKK
5<<%& K   @  68ST.<<.LL. . 	.
 . . ;;. \\. U.b  6?0<<0LL0 0 	0
 0 0 ;;0 \\0 @02   _  8:UV %)=<<=LL= $= 	=
 = = ;;= {{T!= \\= W=@  8&A %)4<<4LL4 $4 	4
 4 4 ;;4 {{T!4 \\4 B4.   R
 
 << ;;  5<<%& 
 F 

<<;; 5<<%&
   c
 /
(?<<(?;;(? 5<<%&(?
(?V   ]
 )
C<<C;;C 5<<%&C
C )

<<;; 5<<%&

   @  46QR#<<#LL# # 	#
 # ;;# S#L  4f=	0<<	0LL	0 	0 		0
 	0 ;;	0 >	0   Y  68ST !&"<<"LL" " 	"
 " ;;" ++" U"B  6? !&7<<7LL7 7 	7
 7 ;;7 ++7 @7   A :<W %<<%LL% % 	%
 % ;;%%P  <fE %0<<%0LL%0 %0 	%0
 %0 ;;%0 F%0P   Z "  %.LL.LL. $. 	.
 . ;;. . ++.
.b   .7%..11 7B  8*E
<<
LL
 
 	

 
 
 \\
 F
  8&A#<<#LL# # 	#
 # # \\# B#   L
 "
F FU[[ FU\\ F
F  >G0U\\ 0%++ 0%,, 0 H0E' Ss   =A v1Av6