# lyraChatGLM / demo.py
# moyanwang
# update demo
# 5f232f5
# raw
# history blame
# 1.04 kB
# coding=utf-8
from transformers import AutoTokenizer
from faster_chat_glm import GLM6B, FasterChatGLM
# Demo: tokenize one prompt, run it through the FasterChatGLM wrapper backed
# by a GLM6B kernel, and print the decoded text.

# --- configuration ---------------------------------------------------------
# MAX_OUT_LEN is used twice below: as the kernel's max_seq_len and as the
# max_length passed to generate().
MAX_OUT_LEN = 100
chatglm6b_dir = './models'
# NOTE(review): the file name says "bs8" while the kernel below is built with
# batch_size=1 -- confirm the plan file is meant to be used this way.
plan_path = './models/glm6b-bs8.ftm'

# --- tokenize the prompt ---------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
# Prompt (Chinese): "Why do we need to accelerate deep learning models?"
input_str = ["为什么我们需要对深度学习模型加速?"]
encoded = tokenizer(input_str, return_tensors="pt", padding=True)
input_ids = encoded.input_ids.to('cuda:0')

# --- kernel for chat model -------------------------------------------------
# The numeric settings presumably mirror the ChatGLM-6B checkpoint stored in
# ./models -- TODO confirm against the model config.
kernel_options = dict(
    plan_path=plan_path,
    batch_size=1,
    num_beams=1,
    use_cache=True,
    num_heads=32,
    emb_size_per_heads=128,
    decoder_layers=28,
    vocab_size=150528,
    max_seq_len=MAX_OUT_LEN,
)
kernel = GLM6B(**kernel_options)

# model_dir repeats the same './models' value held by chatglm6b_dir.
chat = FasterChatGLM(model_dir="./models", kernel=kernel)
chat = chat.half()
chat = chat.cuda()

# --- generate --------------------------------------------------------------
sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)

# --- de-tokenize model output to text --------------------------------------
# Only the first returned sequence is decoded; the demo feeds a single prompt.
first_sequence = sample_output[0]
res = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(res)