yujiepan/Meta-Llama-3-8B-Instruct-awq-w4g64-v2
This model was produced by applying AutoAWQ to meta-llama/Meta-Llama-3-8B-Instruct.
- 4-bit asymmetric weight-only quantization
- group_size=64
- the FFN (mlp) of the last decoder layer, layers.31.mlp, is kept in FP16 (not quantized)
- calibration set: pileval
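These settings map directly onto the AutoAWQ quant_config used in the quantization script further below: zero_point=True gives the asymmetric scheme, and modules_to_not_convert keeps the last decoder layer's FFN in FP16. The pileval calibration set is passed at quantize time (it is AutoAWQ's default), so it does not appear in the dictionary:

    quant_config = {
        "zero_point": True,   # asymmetric quantization
        "q_group_size": 64,   # group size
        "w_bit": 4,           # 4-bit weights
        "version": "GEMM",
        "modules_to_not_convert": [
            'layers.31.mlp',  # last decoder layer's FFN stays in FP16
        ],
    }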
Accuracy
Model | Precision | Wikitext PPL (↓) |
---|---|---|
meta-llama/Meta-Llama-3-8B-Instruct | FP16 | 10.842 |
yujiepan/Meta-Llama-3-8B-Instruct-awq-w4g64 | w4g64 | 10.943 |
yujiepan/Meta-Llama-3-8B-Instruct-awq-w4g64-v2 | w4g64, last layer's FFN in FP16 | 10.928 |
Note:
- Evaluated with the lm-evaluation-harness "wikitext" task.
- Wikitext perplexity does not guarantee downstream accuracy, but it is a useful check on the distortion introduced by quantization.
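To reproduce the numbers above, a run along these lines should work. This is a sketch assuming a recent lm-evaluation-harness release with the simple_evaluate Python API; exact arguments and reported metric keys may differ between versions:

    import lm_eval

    # Sketch only: argument names and result keys depend on the lm-evaluation-harness version.
    results = lm_eval.simple_evaluate(
        model="hf",
        model_args="pretrained=yujiepan/Meta-Llama-3-8B-Instruct-awq-w4g64-v2,dtype=float16",
        tasks=["wikitext"],
        batch_size=8,
    )
    print(results["results"]["wikitext"])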
Usage
model = AutoModelForCausalLM.from_pretrained('<MODEL_ID>', torch_dtype=torch.float16)
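The one-liner above omits the imports and the tokenizer. For a quick end-to-end check, a minimal generation example could look like the following (the model ID is this repository; the prompt and generation settings are illustrative and not part of the original card):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = 'yujiepan/Meta-Llama-3-8B-Instruct-awq-w4g64-v2'
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map='cuda')
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Llama-3-Instruct expects the chat template.
    messages = [{"role": "user", "content": "Briefly explain what AWQ quantization does."}]
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    output = model.generate(input_ids, max_new_tokens=128, do_sample=False)
    print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))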
Quantization code
from unittest.mock import patch
import torch
from awq import AutoAWQForCausalLM
from awq.models.llama import LlamaAWQForCausalLM
from transformers import AutoTokenizer
# Maps each module object to its fully-qualified name; filled in after the model is instantiated.
module2fullname = {}

def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert):
    """Drop linear layers whose fully-qualified name matches any pattern in modules_to_not_convert."""
    if modules_to_not_convert is None:
        return linear_layers
    filtered_layers = {}
    for name, linear_layer in linear_layers.items():
        full_name = module2fullname[linear_layer]
        if not any(key in full_name for key in modules_to_not_convert):
            filtered_layers[name] = linear_layer
        else:
            print('Skipping', full_name)
    return filtered_layers
class PatchedLlamaAWQForCausalLM(LlamaAWQForCausalLM):
    @staticmethod
    def get_layers_for_scaling(module, input_feat, module_kwargs):
        # Each scaling group is added only when its input features are present, so modules
        # excluded from quantization (here layers.31.mlp) do not cause a KeyError.
        print(input_feat.keys())  # debug: show which projections have recorded input features
        layers = []

        # attention input
        if 'self_attn.q_proj' in input_feat:
            layers.append(
                dict(
                    prev_op=module.input_layernorm,
                    layers=[
                        module.self_attn.q_proj,
                        module.self_attn.k_proj,
                        module.self_attn.v_proj,
                    ],
                    inp=input_feat["self_attn.q_proj"],
                    module2inspect=module.self_attn,
                    kwargs=module_kwargs,
                )
            )

        # attention out
        # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
        if 'self_attn.o_proj' in input_feat:
            if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
                layers.append(
                    dict(
                        prev_op=module.self_attn.v_proj,
                        layers=[module.self_attn.o_proj],
                        inp=input_feat["self_attn.o_proj"],
                    )
                )

        # linear 1 (gate / up projections)
        if 'mlp.gate_proj' in input_feat:
            layers.append(
                dict(
                    prev_op=module.post_attention_layernorm,
                    layers=[module.mlp.gate_proj, module.mlp.up_proj],
                    inp=input_feat["mlp.gate_proj"],
                    module2inspect=module.mlp,
                )
            )

        # linear 2 (down projection)
        if 'mlp.down_proj' in input_feat:
            layers.append(
                dict(
                    prev_op=module.mlp.up_proj,
                    layers=[module.mlp.down_proj],
                    inp=input_feat["mlp.down_proj"],
                )
            )

        return layers
quant_config = {
    "zero_point": True,  # asymmetric quantization
    "q_group_size": 64,
    "w_bit": 4,
    "version": "GEMM",
    # Layers whose full name contains any of these strings are kept in FP16.
    "modules_to_not_convert": [
        'layers.31.mlp',
    ],
}
# Patch the AutoAWQ helper so that modules_to_not_convert is matched against fully-qualified module names.
with patch('awq.quantize.quantizer.exclude_layers_to_not_quantize', exclude_layers_to_not_quantize):
    model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
    # model_path = 'yujiepan/meta-llama-3-tiny-random'
    model = PatchedLlamaAWQForCausalLM.from_pretrained(model_path, model_type='llama', device_map='cuda')
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # Fill the lookup table used by the patched filter, then quantize on the pileval calibration set.
    module2fullname = {module: name for name, module in model.named_modules()}
    model.quantize(tokenizer, quant_config=quant_config)
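The script above quantizes the model in memory. To export the result and load it back as shown in the Usage section, the usual AutoAWQ workflow continues roughly as follows (this is a sketch; quant_path is an arbitrary local directory chosen here for illustration):

    import torch
    from transformers import AutoModelForCausalLM

    quant_path = 'Meta-Llama-3-8B-Instruct-awq-w4g64-v2'  # illustrative output directory

    # Persist the quantized weights and the tokenizer.
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)

    # The exported folder can then be loaded with plain transformers.
    reloaded = AutoModelForCausalLM.from_pretrained(quant_path, torch_dtype=torch.float16, device_map='cuda')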