
"""AWQ (Activation aware Weight Quantization) integration file"""

import importlib.metadata

from packaging import version

from ..activations import ACT2FN
from ..modeling_utils import PreTrainedModel
from ..utils import is_auto_awq_available, is_ipex_available, is_torch_available, logging
from ..utils.quantization_config import (
    AwqBackendPackingMethod,
    AwqConfig,
    AWQLinearVersion,
    ExllamaVersion,
)


if is_torch_available():
    import torch
    import torch.nn as nn

logger = logging.get_logger(__name__)

AWQ_FUSED_MAPPINGS = {
    "mistral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "mixtral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["w1", "w3", "w2"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
        "rope_theta": 1000000.0,
    },
    "llama": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "llava": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
}

AWQ_SCALES_MAPPINGS = {
    "starcoder2": {"act": "act", "layer_before_act": "c_fc"},
    "RefinedWebModel": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "falcon": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "mpt": {"act": "act", "layer_before_act": "up_proj"},
    "gptj": {"act": "act", "layer_before_act": "fc_in"},
    "gpt_neox": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "gpt_bigcode": {"act": "act", "layer_before_act": "c_fc"},
    "bloom": {"act": "gelu_impl", "layer_before_act": "dense_h_to_4h"},
}


def replace_quantization_scales(model, model_type):
    from awq.modules.act import ScaledActivation

    if model_type not in AWQ_SCALES_MAPPINGS:
        return model
    for name, module in model.named_children():
        act_name = AWQ_SCALES_MAPPINGS[model_type]["act"]
        layer_before_act_name = AWQ_SCALES_MAPPINGS[model_type]["layer_before_act"]
        if name == act_name and hasattr(model, layer_before_act_name):
            layer_before_act = getattr(model, layer_before_act_name)
            size = layer_before_act.out_features
            scale_like = torch.ones(size)
            model._modules[name] = ScaledActivation(module, scale_like)
        _ = replace_quantization_scales(module, model_type)
    return model


def replace_with_awq_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
    current_key_name=None,
    has_been_replaced=False,
) -> bool:
    """
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    backend = quantization_config.backend
    if not is_auto_awq_available():
        raise ValueError(
            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or check out the installation guide in https://github.com/mit-han-lab/llm-awq"
        )

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        if quantization_config.version == AWQLinearVersion.GEMM:
            from awq.modules.linear.gemm import WQLinear_GEMM

            target_cls = WQLinear_GEMM
        elif quantization_config.version == AWQLinearVersion.GEMV:
            from awq.modules.linear.gemv import WQLinear_GEMV

            target_cls = WQLinear_GEMV
        elif quantization_config.version == AWQLinearVersion.EXLLAMA:
            if quantization_config.exllama_config["version"] == ExllamaVersion.ONE:
                from awq.modules.linear.exllama import WQLinear_Exllama

                target_cls = WQLinear_Exllama
            elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO:
                from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2

                target_cls = WQLinear_ExllamaV2
            else:
                raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}")
        elif quantization_config.version == AWQLinearVersion.IPEX:
            from awq.modules.linear.gemm_ipex import WQLinear_IPEX

            target_cls = WQLinear_IPEX
        else:
            raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}")
    else:
        from awq.quantize.qmodule import WQLinear

        target_cls = WQLinear

    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features

                model._modules[name] = target_cls(
                    w_bit=quantization_config.bits,
                    group_size=quantization_config.group_size,
                    in_features=in_features,
                    out_features=out_features,
                    bias=module.bias is not None,
                    dev=module.weight.device,
                )
                has_been_replaced = True

                # Force requires_grad to False to avoid unexpected errors
                model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_awq_linear(
                module,
                modules_to_not_convert=modules_to_not_convert,
                current_key_name=current_key_name,
                quantization_config=quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def get_modules_to_fuse(model, quantization_config):
    """
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    """
    if not isinstance(model, PreTrainedModel):
        raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")

    # Always default to `quantization_config.modules_to_fuse` when it is provided
    if quantization_config.modules_to_fuse is not None:
        current_fused_mapping = quantization_config.modules_to_fuse
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]

        # Deal with the case where the model has a nested text config (e.g. multi-modal models)
        config = model.config.get_text_config(decoder=True)

        # Handle hidden_size, num_attention_heads and num_key_value_heads on our own
        hidden_size = config.hidden_size
        num_attention_heads = config.num_attention_heads
        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)

        # Fill `current_fused_mapping` with the expected values
        current_fused_mapping["hidden_size"] = hidden_size
        current_fused_mapping["num_attention_heads"] = num_attention_heads
        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    else:
        raise ValueError(
            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please pass a `fused_mapping` argument"
            " in the `quantization_config` or raise an issue on transformers https://github.com/huggingface/transformers to add its support."
        )
    return current_fused_mapping


def fuse_awq_modules(model, quantization_config):
    """
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`Union[AwqConfig, dict]`):
            The quantization configuration to use.
    """
    # Convert from dict in order to get an AwqConfig object, otherwise fields such as `backend` are not available
    if isinstance(quantization_config, dict):
        quantization_config = AwqConfig.from_dict(quantization_config)
    backend = quantization_config.backend

    modules_to_fuse = get_modules_to_fuse(model, quantization_config)
    modules_to_not_convert = getattr(quantization_config, "modules_to_not_convert", None)

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.fused.attn import QuantAttentionFused
        from awq.modules.fused.mlp import QuantFusedMLP
        from awq.modules.fused.norm import FasterTransformerRMSNorm
    else:
        raise ValueError("Fusing is only supported for the AutoAWQ backend")

    fused_attention_modules = []

    for name, module in model.named_modules():
        if modules_to_not_convert is not None:
            if any(module_name_to_not_convert in name for module_name_to_not_convert in modules_to_not_convert):
                continue

        # Replace layer norms
        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

        # Replace MLP layers if the AWQ version is not IPEX
        if quantization_config.version != AWQLinearVersion.IPEX:
            _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
        else:
            logger.info("The IPEX version AWQ does not support fuse mlp for now.")

        # Replace attention layers
        attention_has_been_fused = _fuse_awq_attention_layers(
            model, module, modules_to_fuse, name, QuantAttentionFused
        )

        if attention_has_been_fused:
            fused_attention_modules.append(name.split(".")[0])

    # The fused attention modules handle the attention mask themselves, so the configs that own one are flagged
    # with a "custom" attention implementation.
    if len(fused_attention_modules) > 0:
        for module_name, module in model.named_modules():
            if any(
                fused_attention_parent_module in module_name
                for fused_attention_parent_module in fused_attention_modules
            ):
                if hasattr(module, "config") and hasattr(module.config, "_attn_implementation"):
                    module.config._attn_implementation = "custom"
    return model


def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
    """
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`List[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    """
    for module_name in fuse_module_names:
        if hasattr(module, module_name):
            old_module = getattr(module, module_name)
            module._modules[module_name] = target_cls(old_module.weight, old_module.variance_epsilon).to(
                old_module.weight.device
            )
            del old_module


def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
    """
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`List[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array
            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    """
    if len(fuse_module_names) == 0:
        return

    if hasattr(module, fuse_module_names[0]):
        gate_proj = getattr(module, fuse_module_names[0])
        up_proj = getattr(module, fuse_module_names[1])
        down_proj = getattr(module, fuse_module_names[2])

        previous_device = gate_proj.qweight.device

        # Deal with the case where the model has a nested text config
        config = model.config.get_text_config(decoder=True)
        hidden_act = config.hidden_act
        activation_fn = ACT2FN[hidden_act]
        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, new_module.to(previous_device))

        del gate_proj, up_proj, down_proj


def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
    """
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`List[str]`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    """
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

    module_has_been_fused = False

    if len(modules_to_fuse["attention"]) == 0:
        return module_has_been_fused

    if hasattr(module, modules_to_fuse["attention"][0]):
        # First, we pack the QKV layers together
        q_proj = getattr(module, modules_to_fuse["attention"][0])

        if isinstance(q_proj, WQLinear_GEMV):
            linear_target_cls = WQLinear_GEMV
            cat_dim = 0
        elif isinstance(q_proj, WQLinear_GEMM):
            linear_target_cls = WQLinear_GEMM
            cat_dim = 1
        elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"):
            from awq.modules.linear import WQLinear_IPEX

            if isinstance(q_proj, WQLinear_IPEX):
                linear_target_cls = WQLinear_IPEX
                cat_dim = 1
        else:
            raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

        previous_device = q_proj.qweight.device

        k_proj = getattr(module, modules_to_fuse["attention"][1])
        v_proj = getattr(module, modules_to_fuse["attention"][2])
        o_proj = getattr(module, modules_to_fuse["attention"][3])

        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None

        qkv_layer = linear_target_cls(
            q_proj.w_bit,
            q_proj.group_size,
            q_proj.in_features,
            q_proj.out_features + k_proj.out_features + v_proj.out_features,
            q_proj.bias is not None,
            next(iter(module.state_dict().values())).device,
        )

        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)

        if isinstance(qkv_layer, WQLinear_GEMV):
            qkv_layer.split_k_iters = q_proj.split_k_iters

        qkv_layer.bias = bias

        fused_attention_layer = target_cls(
            modules_to_fuse["hidden_size"],
            modules_to_fuse["num_attention_heads"],
            modules_to_fuse["num_key_value_heads"],
            qkv_layer,
            o_proj,
            previous_device,
            modules_to_fuse["max_seq_len"],
            use_alibi=modules_to_fuse["use_alibi"],
            # The default value in autoawq is set to 10000.0
            rope_theta=modules_to_fuse.get("rope_theta", 10000.0),
        )

        fused_attention_layer.is_hf_transformers = True

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj
        module_has_been_fused = True

    return module_has_been_fused


def post_init_awq_exllama_modules(model, exllama_config):
    """
    Runs post init for Exllama layers which performs:
        - Weights unpacking, reordering and repacking
        - Devices scratch space allocation
    """
    if exllama_config["version"] == ExllamaVersion.ONE:
        from awq.modules.linear.exllama import exllama_post_init

        model = exllama_post_init(model)
    elif exllama_config["version"] == ExllamaVersion.TWO:
        from awq.modules.linear.exllamav2 import exllamav2_post_init

        model = exllamav2_post_init(
            model,
            max_input_len=exllama_config["max_input_len"],
            max_batch_size=exllama_config["max_batch_size"],
        )
    else:
        raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}")

    return model


def post_init_awq_ipex_modules(model):
    """
    Runs post init for IPEX layers which performs:
        - Weights packing, reordering and repacking
    """
    from awq.modules.linear.gemm_ipex import ipex_post_init

    model = ipex_post_init(model)

    return model
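

# Illustrative usage sketch (kept as a comment, not part of the module's public surface):
# roughly how the helpers above are wired together when loading an AWQ checkpoint. The
# `model` object and the exact `AwqConfig` arguments below are assumptions for the example,
# not values taken from this file.
#
#   quantization_config = AwqConfig(bits=4, fuse_max_seq_len=2048, do_fuse=True)
#   model, has_been_replaced = replace_with_awq_linear(model, quantization_config=quantization_config)
#   model = fuse_awq_modules(model, quantization_config)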