---
library_name: transformers
tags: []
---
|
|
|
# yujiepan/Meta-Llama-3-8B-awq-w4g64-v2 |
|
This model was obtained by applying AutoAWQ to [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
|
|
|
- 4-bit asymmetric weight-only quantization
|
- group_size=64 |
|
- the last layer's FFN is skipped (left unquantized)
|
- calibration set: pileval |
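
These settings correspond to the following AutoAWQ `quant_config` (a condensed view of the full quantization script further below):

```python
# Condensed from the quantization script below; see that section for the full context.
quant_config = {
    "zero_point": True,   # asymmetric (zero-point) quantization
    "q_group_size": 64,   # group_size=64
    "w_bit": 4,           # 4-bit weights
    "version": "GEMM",
    "modules_to_not_convert": ["layers.31.mlp"],  # skip the last decoder layer's FFN
}
# The calibration set ("pileval") is AutoAWQ's default calib_data for model.quantize().
```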
|
|
|
## Accuracy |
|
| model | precision | wikitext ppl (↓) |
|---|---|---|
| meta-llama/Meta-Llama-3-8B | FP16 | 9.179 |
| yujiepan/Meta-Llama-3-8B-awq-w4g64 | w4g64 | 9.219 |
| yujiepan/Meta-Llama-3-8B-awq-w4g64-v2 | w4g64, skip last layer's FFN | 9.278 |
|
|
|
Note: |
|
- Evaluated with the lm-evaluation-harness "wikitext" task.

- Wikitext perplexity does not guarantee downstream accuracy, but it is a quick way to gauge the distortion introduced by quantization.
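
A run like the ones in the table can be reproduced roughly as follows (a minimal sketch assuming a recent lm-evaluation-harness with the Python API; the exact result keys may differ between versions):

```python
import lm_eval

# Evaluate the quantized checkpoint on the "wikitext" perplexity task.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=yujiepan/Meta-Llama-3-8B-awq-w4g64-v2,dtype=float16",
    tasks=["wikitext"],
)
print(results["results"]["wikitext"])
```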
|
|
|
## Usage |
|
```python
import torch
from transformers import AutoModelForCausalLM

# Loading the AWQ checkpoint requires the `autoawq` package to be installed.
model = AutoModelForCausalLM.from_pretrained(
    'yujiepan/Meta-Llama-3-8B-awq-w4g64-v2',
    torch_dtype=torch.float16,
    device_map='cuda',
)
```
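
As a quick sanity check, generation works as with any other `transformers` causal LM. The sketch below assumes the tokenizer files are available in this repository (otherwise load the base model's tokenizer); the prompt is arbitrary:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('yujiepan/Meta-Llama-3-8B-awq-w4g64-v2')
inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```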
|
|
|
## Quantization code
|
```python
from unittest.mock import patch

import torch
from awq import AutoAWQForCausalLM
from awq.models.llama import LlamaAWQForCausalLM
from transformers import AutoTokenizer

# Filled after the model is loaded: maps each module object to its fully qualified name.
module2fullname = {}


def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert):
    # Replacement for awq.quantize.quantizer.exclude_layers_to_not_quantize that matches
    # `modules_to_not_convert` against fully qualified module names, so an entry like
    # 'layers.31.mlp' can exclude a specific decoder layer's FFN.
    if modules_to_not_convert is None:
        return linear_layers
    filtered_layers = {}
    for name, linear_layer in linear_layers.items():
        full_name = module2fullname[linear_layer]
        if not any(key in full_name for key in modules_to_not_convert):
            filtered_layers[name] = linear_layer
        else:
            print('Skipping', full_name)
    return filtered_layers


class PatchedLlamaAWQForCausalLM(LlamaAWQForCausalLM):
    @staticmethod
    def get_layers_for_scaling(module, input_feat, module_kwargs):
        # Adapted from LlamaAWQForCausalLM: each scaling group is only appended when its
        # input features were collected, so excluded (unquantized) modules are skipped.
        print(input_feat.keys())
        layers = []
        # attention input
        if 'self_attn.q_proj' in input_feat:
            layers.append(
                dict(
                    prev_op=module.input_layernorm,
                    layers=[
                        module.self_attn.q_proj,
                        module.self_attn.k_proj,
                        module.self_attn.v_proj,
                    ],
                    inp=input_feat["self_attn.q_proj"],
                    module2inspect=module.self_attn,
                    kwargs=module_kwargs,
                )
            )
        # attention out
        # Please refer to https://github.com/mit-han-lab/llm-awq/pull/67#issue-1850622696
        if 'self_attn.o_proj' in input_feat:
            if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
                layers.append(
                    dict(
                        prev_op=module.self_attn.v_proj,
                        layers=[module.self_attn.o_proj],
                        inp=input_feat["self_attn.o_proj"],
                    )
                )
        if 'mlp.gate_proj' in input_feat:
            # linear 1
            layers.append(
                dict(
                    prev_op=module.post_attention_layernorm,
                    layers=[module.mlp.gate_proj, module.mlp.up_proj],
                    inp=input_feat["mlp.gate_proj"],
                    module2inspect=module.mlp,
                )
            )
        if 'mlp.down_proj' in input_feat:
            # linear 2
            layers.append(
                dict(
                    prev_op=module.mlp.up_proj,
                    layers=[module.mlp.down_proj],
                    inp=input_feat["mlp.down_proj"],
                )
            )
        return layers


quant_config = {
    "zero_point": True, "q_group_size": 64, "w_bit": 4, "version": "GEMM",
    "modules_to_not_convert": [
        'layers.31.mlp',
    ],
}
with patch('awq.quantize.quantizer.exclude_layers_to_not_quantize', exclude_layers_to_not_quantize):
    model_path = "meta-llama/Meta-Llama-3-8B"
    # model_path = 'yujiepan/meta-llama-3-tiny-random'
    model = PatchedLlamaAWQForCausalLM.from_pretrained(model_path, model_type='llama', device_map='cuda')
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    module2fullname = {module: name for name, module in model.named_modules()}
    # Calibration data defaults to AutoAWQ's pileval set.
    model.quantize(tokenizer, quant_config=quant_config)
```
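
The script above only quantizes the model in memory. To export it in the layout used by this repository, the usual AutoAWQ follow-up is roughly the following (the output directory name is just an example):

```python
# Hypothetical output directory; pick any path you like.
quant_path = './Meta-Llama-3-8B-awq-w4g64-v2'

model.save_quantized(quant_path)       # writes the AWQ weights and quantization config
tokenizer.save_pretrained(quant_path)  # keep the tokenizer next to the weights
```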
|
|