"""Quantize ELYZA-japanese-Llama-2-13b-fast-instruct to 4-bit GPTQ.

Downloads the full-precision model, calibrates GPTQ quantization on the
"c4" dataset, and writes the quantized model plus tokenizer to
``./quantized``.  Requires a CUDA GPU and the ``optimum``/``auto-gptq``
backends that transformers' GPTQConfig delegates to.
"""
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "elyza/ELYZA-japanese-Llama-2-13b-fast-instruct"

# Tokenizer is needed both for later inference and by GPTQ itself to
# tokenize the calibration dataset.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 4-bit weights; group_size=64 trades a little extra metadata for better
# accuracy than the default 128. "c4" is used as the calibration corpus.
gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer, group_size=64)

# Passing quantization_config triggers quantization during load; this is
# slow (calibration runs on the GPU selected by device_map).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    quantization_config=gptq_config,
)

# Persist both artifacts side by side so the directory is directly
# loadable with from_pretrained("./quantized").
tokenizer.save_pretrained("./quantized")
model.save_pretrained("./quantized")