Vasily Alexeev
add asymm quantized model, add two eos in code sample
6758e8a
metadata
base_model: NousResearch/Meta-Llama-3-8B-Instruct
model_type: llama
pipeline_tag: text-generation
quantized_by: Compressa
license: other
license_name: llama3
license_link: https://llama.meta.com/llama3/license
tags:
  - llama3
  - omniquant
  - gptq
  - triton

Llama 3 8B Instruct – OmniQuant

Based on Llama 3 8B Instruct.

Quantized with OmniQuant.

Evaluation

PPL (↓)

wiki
FP 8,29
Quantized 8,97

Accuracy on English Benchmarks, % (↑)

piqa arc_easy arc_challenge boolq hellaswag winogrande mmlu_humanities mmlu_social_sciences mmlu_stem mmlu_other
FP 78,7 81,6 53,0 83,1 57,7 72,1 67,0 70,9 54,5 68,2
Quantized 77,2 80,7 51,8 82,8 56,8 72,5 63,4 67,6 50,1 65,0

Accuracy on Russian Benchmarks, % (↑)

danetqa terra rwsd muserc rucos lidirus parus rcb russe rucola
FP 78,6 60,9 65,7 56,1 64,9 63,2 71,0 34,1 60,8 64,1
Quantized 71,6 60,6 52,5 63,7 57,3 57,2 74,0 33,6 36,9 67,5

Summary

Avg acc diff on Eng, % (↑) Avg acc diff on Rus, % (↑) Occupied disk space, % (↓)
FP 0 0 100
Quantized -1,9 -4,5 35,7

Examples

Imports and Model Loading

Expand
import gc

import auto_gptq.nn_modules.qlinear.qlinear_cuda as qlinear_cuda
import auto_gptq.nn_modules.qlinear.qlinear_triton as qlinear_triton
import torch

from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_in_model,
)
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)


def get_named_linears(model):
    """Collect every ``torch.nn.Linear`` found inside *model*.

    Returns a dict that maps the qualified name reported by
    ``model.named_modules()`` to the corresponding module. The root module
    itself is included (under the empty name) when it is a ``Linear``.
    """
    linears = {}

    for qualified_name, submodule in model.named_modules():
        if isinstance(submodule, torch.nn.Linear):
            linears[qualified_name] = submodule

    return linears


def set_module(model, name, module):
    """Assign *module* at the dotted path *name* below *model*.

    Every path component except the last is resolved step by step:
    all-digit components are used as integer indices (``parent[int(part)]``),
    any other component is looked up with ``getattr``. The last component is
    then set on the resolved parent with ``setattr``.
    """
    *parent_path, leaf = name.split('.')

    parent = model
    for part in parent_path:
        parent = parent[int(part)] if part.isdigit() else getattr(parent, part)

    setattr(parent, leaf, module)


def load_model(model_path):
    """Build a model whose Linear layers are GPTQ ``QuantLinear`` modules and load its weights.

    Steps, in order:

    1. Read the config from *model_path* and take ``bits`` and ``group_size``
       from its ``quantization_config`` section.
    2. Remove that section from the config and instantiate the model from the
       config under ``init_empty_weights()``.
    3. For each entry of ``model.model.layers``, replace every
       ``torch.nn.Linear`` with ``qlinear_triton.QuantLinear`` (2 or 4 bits)
       or ``qlinear_cuda.QuantLinear`` (3 bits).
    4. Tie weights, infer a device map and load the checkpoint found at
       *model_path* into the model.

    NOTE(review): *model_path* is handed unchanged to
    ``load_checkpoint_in_model`` as ``checkpoint``; presumably this requires a
    local file or directory rather than a Hub repository id - confirm.

    Raises:
        AttributeError: the config has no ``quantization_config`` attribute.
        KeyError: ``quantization_config`` lacks ``bits`` or ``group_size``.
        NotImplementedError: ``bits`` is not 2, 3 or 4 (raised when the first
            Linear layer is about to be replaced).
    """
    # Based on: https://github.com/OpenGVLab/OmniQuant/blob/main/runing_quantized_mixtral_7bx8.ipynb

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    if not hasattr(config, 'quantization_config'):
        raise AttributeError(
            f'No quantization info found in model config "{model_path}"'
            f' (`quantization_config` section is missing).'
        )

    quant_info = config.quantization_config
    wbits, group_size = quant_info['bits'], quant_info['group_size']

    # An ordinary model is created first; its Linears are swapped for
    # QuantLinears by hand below, so the config must not mention quantization.
    del config.quantization_config

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(
            config=config,
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )

    for layer in tqdm(model.model.layers):
        for name, linear in get_named_linears(layer).items():
            quant_args = (
                wbits,
                group_size,
                linear.in_features,
                linear.out_features,
                linear.bias is not None,
            )

            if wbits == 3:
                quant_cls = qlinear_cuda.QuantLinear
            elif wbits in (2, 4):
                quant_cls = qlinear_triton.QuantLinear
            else:
                raise NotImplementedError("Only 2, 3 and 4 bits are supported.")

            replacement = quant_cls(*quant_args)
            # Keep the new module on the same device as the layer's parameters.
            replacement.to(next(layer.parameters()).device)
            set_module(layer, name, replacement)

    torch.cuda.empty_cache()
    gc.collect()

    model.tie_weights()
    device_map = infer_auto_device_map(model)

    print("Loading pre-computed quantized weights...")

    load_checkpoint_in_model(
        model,
        checkpoint=model_path,
        device_map=device_map,
        offload_state_dict=True,
    )

    print("Model loaded successfully!")

    return model

Inference

# Location of the quantized checkpoint (also used to load config and tokenizer).
model_path = "compressa-ai/Llama-3-8B-Instruct-OmniQuant"

model = load_model(model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(
    model_path, use_fast=False, trust_remote_code=True
)

# Llama 3 "specifics"
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/4
# Generation must stop on either of the two end tokens.
terminators = [
    tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

system_message = "You are a friendly chatbot who always responds in the style of a pirate."
user_message = "Where are we going, Captain?"
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# The chat template has already written the special tokens into `prompt`,
# so the tokenizer must not add them a second time.
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
inputs = {k: v.cuda() for k, v in inputs.items()}

outputs = model.generate(
    **inputs, max_new_tokens=512,
    do_sample=True, temperature=0.7, top_p=0.95,
    eos_token_id=terminators,
)

# Full decoded sequence (prompt + continuation, special tokens included).
response = tokenizer.decode(outputs[0])

# Take the continuation from the token ids, not by string-stripping `response`:
# prefix/suffix stripping silently does nothing when the decoded text does not
# start with `prompt` byte-for-byte or ends with a terminator other than
# `tokenizer.eos_token`.
prompt_length = inputs["input_ids"].shape[-1]
continuation = tokenizer.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')

Inference Using Pipeline

# Generation settings passed to the pipeline at construction time.
generation_kwargs = {
    "eos_token_id": terminators,
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.95,
}

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
    **generation_kwargs,
)

# Render the chat messages into a single prompt string with the pipeline's tokenizer.
prompt = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

outputs = pipe(prompt)

# Drop the leading prompt text so that only the model's reply remains.
response = outputs[0]["generated_text"]
continuation = response.removeprefix(prompt)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')