"""
Quantize Qwen/Qwen3-235B-A22B (MoE) to INT4 W4A16 on a CPU-only machine,
using llmcompressor's data-free QuantizationModifier.

Every Linear layer gets 4-bit weights with 16-bit activations; the MoE router
gates and lm_head are kept in BF16.

Output: Qwen3-235B-A22B-INT4-W4A16
"""

import os
import warnings

import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Hide all GPUs so torch/accelerate treat this as a CPU-only host, and silence
# the NVML warning torch emits when no NVIDIA driver is installed.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
warnings.filterwarnings("ignore", message="Can't initialize NVML")

model_id = "Qwen/Qwen3-235B-A22B"
output_dir = "Qwen3-235B-A22B-INT4-W4A16"

# Instantiate the model skeleton on the meta device (no weights materialized)
# just to derive a module -> device map that respects the no-split blocks.
with init_empty_weights():
    dummy = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, trust_remote_code=True
    )

device_map = infer_auto_device_map(
    dummy, no_split_module_classes=dummy._no_split_modules
)
del dummy

# Pin every module to the CPU: with no max_memory cap, infer_auto_device_map
# can still assign modules to "disk" when the checkpoint exceeds free RAM.
device_map = {name: "cpu" for name in device_map}
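
# Design note: passing device_map="cpu" directly to from_pretrained should be
# an equivalent shortcut; the explicit map is kept here to make the no-split
# handling visible. Treat the shortcut as a sketch that has not been verified
# on this setup.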

# Materialize the full BF16 model in RAM. Rough footprint: 235B params at
# 2 bytes each is ~470 GB, so this needs a very large-memory host (or swap).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Data-free W4A16 recipe: round-to-nearest 4-bit weights, 16-bit activations.
# The MoE router gates (*.mlp.gate) and lm_head stay in BF16: they are small
# but disproportionately sensitive, since the gates decide expert routing.
# (dampening_frac is a GPTQModifier argument and is not accepted here.)
recipe = QuantizationModifier(
    targets="Linear",
    scheme="W4A16",
    ignore=[
        "lm_head",
        r"re:.*\.mlp\.gate$",
    ],
)
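
# For reference, the "W4A16" preset corresponds to group-wise symmetric int4
# weights (group size 128) with activations left in 16-bit. Spelled out via
# compressed-tensors it would look roughly like the sketch below; the field
# names are recalled from the library, not taken from this script:
#
#   from compressed_tensors.quantization import (
#       QuantizationArgs, QuantizationScheme, QuantizationStrategy,
#   )
#   w4a16 = QuantizationScheme(
#       targets=["Linear"],
#       weights=QuantizationArgs(
#           num_bits=4, symmetric=True,
#           strategy=QuantizationStrategy.GROUP, group_size=128,
#       ),
#   )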

# Apply the recipe in one shot (no calibration data needed for weight-only
# RTN) and write the compressed-tensors checkpoint to output_dir.
oneshot(
    model=model,
    recipe=recipe,
    output_dir=output_dir,
)
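
# oneshot was not given a tokenizer here, so none is saved with the model.
# A minimal sketch to bundle the stock Qwen3 tokenizer into the output dir,
# assuming that is the tokenizer you want to ship:
from transformers import AutoTokenizer

AutoTokenizer.from_pretrained(model_id, trust_remote_code=True).save_pretrained(
    output_dir
)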

print(f"\n✅ Quantized model written to: {output_dir}")
print("   (router gates & lm_head remain in BF16; everything else INT4 W4A16)")
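
# Hedged smoke test for the result: compressed-tensors W4A16 checkpoints load
# directly in vLLM. The prompt and settings below are illustrative only and
# are not executed as part of this script:
#
#   from vllm import LLM, SamplingParams
#   llm = LLM(model=output_dir)
#   out = llm.generate(["Hello, world"], SamplingParams(max_tokens=32))
#   print(out[0].outputs[0].text)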
|