|
import os

from transformers import AutoConfig, AutoModel

from accelerate import Accelerator, init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
|
|
|
|
|
# Quantize a local MiniLM checkpoint to int8 with bitsandbytes and save it.
#
# Strategy: instantiate the model on the "meta" device (no weight storage),
# then let load_and_quantize_model stream the fp32 weights from disk and
# quantize them layer by layer -- the full-precision model never has to fit
# in memory all at once.

os.environ["TRANSFORMERS_OFFLINE"] = "1"  # local files only; never hit the Hub

# BUG FIX: the original called AutoModel.from_pretrained() (materializing the
# whole fp32 model) and only *aliased* it inside init_empty_weights(). The
# context manager affects modules instantiated inside it, so the alias did
# nothing and the memory benefit was lost. Build a weightless skeleton from
# the config inside the context instead.
config = AutoConfig.from_pretrained("./models/all-MiniLM-L6-v2")
with init_empty_weights():
    empty_model = AutoModel.from_config(config)

# NOTE(review): this directory name carries an "-unquantized" suffix while the
# config above is read from "./models/all-MiniLM-L6-v2" -- confirm both refer
# to the same checkpoint.
weights_location = "./models/all-MiniLM-L6-v2-unquantized/pytorch_model.bin"

# 8-bit quantization (LLM.int8() scheme): activations whose magnitude exceeds
# llm_int8_threshold are kept in higher precision to preserve accuracy.
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold=6)

# Fill the empty skeleton with quantized weights streamed from disk,
# dispatching layers across available devices.
quantized_model = load_and_quantize_model(
    empty_model,
    weights_location=weights_location,
    bnb_quantization_config=bnb_quantization_config,
    device_map="auto",
)

# Persist the quantized model (sharded automatically if large).
accelerator = Accelerator()
new_weights_location = "./models/all-MiniLM-L6-v2-unquantized-q8"
accelerator.save_model(quantized_model, new_weights_location)
|
|