import os

from transformers import AutoConfig, AutoModel
from accelerate import Accelerator, init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model

# Make sure transformers works offline
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# 1. Initialize the empty (meta-device) model. init_empty_weights() only
#    affects modules instantiated *inside* the context, so the model must be
#    built there from its config -- loading it with from_pretrained() first
#    would materialize all the fp32 weights and defeat the purpose.
config = AutoConfig.from_pretrained("./models/all-MiniLM-L6-v2")
with init_empty_weights():
    empty_model = AutoModel.from_config(config)

# 2. Get the path to the weights of your model. For now, we'll assume they
#    sit in the same folder as the config.
weights_location = "./models/all-MiniLM-L6-v2/pytorch_model.bin"

# 3. Set the quantization configuration (8-bit for this example)
bnb_quantization_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold=6.0)

# 4. Load the real weights into the empty model, quantizing them on the fly
quantized_model = load_and_quantize_model(
    empty_model,
    weights_location=weights_location,
    bnb_quantization_config=bnb_quantization_config,
    device_map="auto",
)

# 5. Save the quantized model
accelerator = Accelerator()
new_weights_location = "./models/all-MiniLM-L6-v2-q8"
accelerator.save_model(quantized_model, new_weights_location)
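
# Optional: reloading the saved quantized model later. This is a sketch, not
# part of the original snippet -- it assumes the reload pattern from the
# accelerate bnb-quantization docs: rebuild the empty model from the config,
# then point load_and_quantize_model at the directory we just saved to.
with init_empty_weights():
    empty_model = AutoModel.from_config(config)

quantized_model_from_saved = load_and_quantize_model(
    empty_model,
    weights_location=new_weights_location,  # the folder written by save_model above
    bnb_quantization_config=bnb_quantization_config,
    device_map="auto",
)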