'''
Tested on: transformers==4.38.1, autoawq==0.2.3
Run on 1 card (mem>=18G)
'''
import torch
from awq.quantize.quantizer import AwqQuantizer
from awq.quantize.quantizer import *  # brings in tqdm, get_named_linears, apply_scale, clear_memory, etc.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from unittest.mock import patch


class FalconAwqQuantizer(AwqQuantizer):
    """AwqQuantizer with a patched quantize() that also moves the Falcon-specific
    `alibi` kwarg to each layer's device before calibration."""

    def quantize(self):
        print('Patched!')
        for i in tqdm(range(len(self.modules)), desc="AWQ"):
            # Move module and inputs to correct device
            common_device = next(self.modules[i].parameters()).device
            if common_device is None or str(common_device) == "cpu":
                if torch.cuda.is_available():
                    best_device = "cuda:" + str(i % torch.cuda.device_count())
                else:
                    best_device = get_best_device()

                self.modules[i] = self.modules[i].to(best_device)
                common_device = next(self.modules[i].parameters()).device

            if self.module_kwargs.get("position_ids") is not None:
                self.module_kwargs["position_ids"] = self.module_kwargs[
                    "position_ids"
                ].to(common_device)

            if self.module_kwargs.get("attention_mask") is not None:
                self.module_kwargs["attention_mask"] = self.module_kwargs[
                    "attention_mask"
                ].to(common_device)

            # Include alibi: Falcon layers take an `alibi` tensor that must live on
            # the same device as the layer being calibrated.
            if self.module_kwargs.get("alibi") is not None:
                self.module_kwargs["alibi"] = self.module_kwargs[
                    "alibi"
                ].to(common_device)
            else:
                self.module_kwargs['alibi'] = None
                print(f'alibi=None in layer {i}, this is expected if use_alibi=False.')

            self.inps = self.inps.to(common_device)

            # [STEP 1]: Get layer, extract linear modules, extract input features
            named_linears = get_named_linears(self.modules[i])

            # Filter out linear layers that should not be quantized
            named_linears = exclude_layers_to_not_quantize(
                named_linears, self.modules_to_not_convert
            )

            input_feat = self._get_input_feat(self.modules[i], named_linears)
            clear_memory()

            # [STEP 2]: Compute and apply scale list
            module_config: List[Dict] = self.awq_model.get_layers_for_scaling(
                self.modules[i], input_feat, self.module_kwargs
            )
            scales_list = [
                self._search_best_scale(self.modules[i], **layer)
                for layer in module_config
            ]
            apply_scale(self.modules[i], scales_list, input_feat_dict=input_feat)
            scales_list = append_str_prefix(
                scales_list, get_op_name(self.model, self.modules[i]) + "."
            )

            # [STEP 3]: Compute and apply clipping list
            clip_list = self._search_best_clip(
                self.modules[i], named_linears, input_feat
            )
            apply_clip(self.modules[i], clip_list)
            clip_list = append_str_prefix(
                clip_list, get_op_name(self.model, self.modules[i]) + "."
            )

            # [STEP 4]: Quantize weights
            if not self.export_compatible:
                self._apply_quant(self.modules[i], named_linears)

            clear_memory()


model_path = 'tiiuae/falcon-40b'
# model_path = 'yujiepan/falcon-new-tiny-random'
quant_path = 'falcon-40b-autoawq-w4g128'
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model
model = AutoAWQForCausalLM.from_pretrained(
    model_path, device_map='cpu', trust_remote_code=False,
    **{"low_cpu_mem_usage": True, "use_cache": False}
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Quantize, swapping in the patched quantizer via unittest.mock.patch
with patch('awq.models.base.AwqQuantizer', FalconAwqQuantizer):
    model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')
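
# Optional sanity check (a minimal sketch, not part of the original script):
# reload the saved quantized checkpoint and generate a few tokens. This assumes
# a single GPU (as stated in the docstring above); `fuse_layers=False` is a
# conservative assumption here and was not exercised by the original script.
quant_model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=False)
quant_tokenizer = AutoTokenizer.from_pretrained(quant_path)
inputs = quant_tokenizer("The capital of France is", return_tensors="pt").to("cuda")
output_ids = quant_model.generate(**inputs, max_new_tokens=16)
print(quant_tokenizer.decode(output_ids[0], skip_special_tokens=True))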