import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda:0"

# Load the tokenizer from the GLM-4-Voice checkpoint.
tokenizer = AutoTokenizer.from_pretrained("glm-4-voice-9b", trust_remote_code=True)

# Assign a minimal role/content chat template used for prompt construction.
tokenizer.chat_template = "{{role}}: {{content}}"

query = "你好"  # "Hello"

# Build tokenized chat inputs and move them to the GPU.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": query}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
)
inputs = inputs.to(device)

# Load the model with its weights quantized to 4 bits (requires the bitsandbytes package).
model = AutoModelForCausalLM.from_pretrained(
    "glm-4-voice-9b",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    load_in_4bit=True,
).eval()

# Save the quantized weights and the tokenizer to a new directory.
model.save_pretrained("glm-4-voice-9b-int4")
tokenizer.save_pretrained("glm-4-voice-9b-int4")
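
# A minimal sketch (an assumption, not part of the original snippet) of how the
# saved int4 checkpoint can be reloaded later. 4-bit serialization needs a
# recent transformers/bitsandbytes, and the quantization settings are read back
# from the saved config, so load_in_4bit does not need to be passed again.
model = AutoModelForCausalLM.from_pretrained(
    "glm-4-voice-9b-int4",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map=device,
).eval()
tokenizer = AutoTokenizer.from_pretrained("glm-4-voice-9b-int4", trust_remote_code=True)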