A MoE (Mixture of Experts) model built on top of Qwen1.5-7B-Chat, Qwen1.5-7B, and Crystalcareai/CrystalQwen-1.5-7B.
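As background, the sketch below shows how a sparse MoE layer routes each token to a subset of expert feed-forward networks. It is a generic, Mixtral-style illustration only: the class and parameter names (`MoELayer`, `num_experts`, `top_k`) are hypothetical, and the defaults (3 experts, top-2 routing) are an assumption based on the three base models and the "2x3" in the repository name, not taken from this model's code.

```python
# Illustrative only: a generic top-2 sparse MoE feed-forward layer.
# None of these names come from this repository's code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoELayer(nn.Module):
    def __init__(self, hidden_size: int, ffn_size: int,
                 num_experts: int = 3, top_k: int = 2):  # assumed values
        super().__init__()
        self.top_k = top_k
        self.gate = nn.Linear(hidden_size, num_experts, bias=False)  # router
        self.experts = nn.ModuleList(
            nn.Sequential(
                nn.Linear(hidden_size, ffn_size),
                nn.SiLU(),
                nn.Linear(ffn_size, hidden_size),
            )
            for _ in range(num_experts)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (num_tokens, hidden_size)
        logits = self.gate(x)                           # (tokens, experts)
        weights, idx = logits.topk(self.top_k, dim=-1)  # top-k experts per token
        weights = F.softmax(weights, dim=-1)            # normalize over chosen experts
        out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, k] == e                   # tokens routed to expert e in slot k
                if mask.any():
                    out[mask] += weights[mask, k, None] * expert(x[mask])
        return out
```

Because only the top-k experts run for each token, a model assembled from several checkpoints can decode with far less compute per token than a dense model with the same total parameter count.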
To run the model with Hugging Face `transformers` (4-bit loading requires the `bitsandbytes` package and a CUDA GPU):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "mzbac/qwen-1.5-2x3-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    load_in_4bit=True,  # quantize weights to 4-bit via bitsandbytes
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

chat = [
    {"role": "user", "content": "How does backpropagation work?"},
]
# add_generation_prompt appends the assistant header so the model starts
# a fresh reply instead of continuing a closed, empty assistant turn.
text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")

generate_kwargs = dict(
    input_ids=inputs,
    temperature=0.6,
    max_new_tokens=500,
    do_sample=True,
)
outputs = model.generate(**generate_kwargs)
print(tokenizer.decode(outputs[0]))
```
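Note that `outputs[0]` contains the prompt tokens followed by the generated ones. If you only want the model's reply, one option (an assumed follow-up, not part of the original card) is to slice the prompt off before decoding:

```python
# Drop the echoed prompt and decode just the newly generated tokens.
reply = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
print(reply)
```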