
import importlib
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from packaging import version

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["fbgemm-gpu", "accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available() or version.parse(importlib.metadata.version("torch")) < version.parse("2.1.0"):
            raise ImportError(
                "Using fbgemm fp8 quantization requires torch > 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)."
            )

        if not is_fbgemm_gpu_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires the fbgemm-gpu library. "
                "Please install the latest version by following the instructions at "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )

        if not is_accelerate_available("0.32.2"):
            raise ImportError(
                "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)"
            )

        if not torch.cuda.is_available():
            raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU")

        compute_capability = torch.cuda.get_device_capability()
        major, minor = compute_capability
        if major < 9:
            raise ValueError(
                "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)"
            )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to "
                "requirements of `fbgemm-gpu` to enable model loading in fp8. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers, or pass "
                "torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
            torch_dtype = torch.bfloat16
        elif torch_dtype == torch.float16:
            raise ValueError(
                "You cannot use FP8 with torch_dtype=torch.float16. "
                "We recommend passing torch_dtype=torch.bfloat16 instead."
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        from ..integrations import FbgemmFp8Linear

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Linear):
            # Pre-quantized checkpoints and biases are loaded as-is; weights must
            # already be fp8 in that case. Otherwise the weight still needs to be
            # quantized, so return True to route it through `create_quantized_param`.
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        """
        Quantizes weights into weight and weight_scale
        """
        new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value)

        module, tensor_name = get_module_from_name(model, param_name)
        module._buffers[tensor_name] = new_value.to(target_device)
        # Reshape the per-row scale to the expected output shape -> (out_features, 1)
        module._buffers["weight_scale"] = weight_scale.view(weight_scale.shape[0], 1).to(target_device)

        if unexpected_keys is not None and param_name in unexpected_keys:
            unexpected_keys.remove(param_name)
        del param_name

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        from ..integrations import get_keys_to_not_convert, replace_with_fbgemm_fp8_linear

        self.modules_to_not_convert = get_keys_to_not_convert(model)

        if self.quantization_config.modules_to_not_convert is not None:
            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)

        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
        )

        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        from ..integrations import FbgemmFp8Linear

        # Keys belonging to FbgemmFp8Linear modules (other than .weight/.bias) are
        # created at quantization time, so they should not be reported as missing.
        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, FbgemmFp8Linear):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False
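
# Usage sketch: this quantizer is not instantiated directly; `from_pretrained`
# selects it when a `FbgemmFp8Config` is passed as `quantization_config`.
# A minimal example, assuming a CUDA GPU with compute capability >= 9.0 (e.g.
# H100) plus `fbgemm-gpu` and `accelerate` installed; the checkpoint name is
# illustrative only:
#
#     import torch
#     from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#     quantization_config = FbgemmFp8Config()
#     model = AutoModelForCausalLM.from_pretrained(
#         "meta-llama/Meta-Llama-3-8B",       # example checkpoint (gated on the Hub)
#         device_map="cuda",                  # CPU/disk offload is rejected above
#         torch_dtype=torch.bfloat16,         # non-linear layers stay in bf16
#         quantization_config=quantization_config,
#     )
#
# Linear weights are then quantized on the fly to float8_e4m3fn with a per-row
# scale via `torch.ops.fbgemm.quantize_fp8_per_row`, as in
# `create_quantized_param` above.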