from datasets import load_dataset
from transformers import AutoTokenizer
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (
    calculate_offload_device_map,
)
recipe = """
quant_stage:
quant_modifiers:
QuantizationModifier:
ignore: ["lm_head"]
config_groups:
group_0:
weights:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
input_activations:
num_bits: 8
type: float
strategy: tensor
dynamic: false
symmetric: true
targets: ["Linear"]
"""
model_stub = "southfreebird/Qwen2.5-0.5B-Instruct"
model_name = model_stub.split("/")[-1]
device_map = calculate_offload_device_map(
    model_stub, reserve_for_hessians=False, num_gpus=2, torch_dtype="auto"
)
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype="auto", device_map=device_map
)
tokenizer = AutoTokenizer.from_pretrained(model_stub)
output_dir = f"/{model_name}-FP8"
DATASET_ID = "neuralmagic/LLM_compression_calibration"
DATASET_SPLIT = "train"
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 8192

def preprocess_fn(example):
    # Render each chat transcript into a single calibration string via the model's chat template.
    return {"text": tokenizer.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle().select(range(NUM_CALIBRATION_SAMPLES))
ds = ds.map(preprocess_fn)
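
# Optional spot check, a sketch not in the original script: print one rendered calibration
# sample and its token count to confirm the chat-template output looks sane and fits within
# MAX_SEQUENCE_LENGTH.
sample_text = ds[0]["text"]
print(f"First calibration sample ({len(tokenizer(sample_text).input_ids)} tokens):")
print(sample_text[:500])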

# Run one-shot calibration with the recipe above and save the compressed FP8 checkpoint.
oneshot(
    model=model,
    output_dir=output_dir,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=True,
)
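
# Optional sanity check, a minimal sketch not in the original script: run a short greedy
# generation with the quantized in-memory model to confirm it still produces coherent text.
# Assumes the model's first shard (embeddings) sits on "cuda".
print("========== SAMPLE GENERATION ==============")
sample_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids.to("cuda")
generated = model.generate(sample_ids, max_new_tokens=20)
print(tokenizer.decode(generated[0]))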