# -*- coding: utf-8 -*-
"""Fujisaki_CPU.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Damnr0Ha4zZAlKFvne9cu76uuElLNYus
李萌萌的电子骨灰盒
----
这是一个通过ChatGLM模型训练的李萌萌的数字分身,你可以在问题栏目填入内容,或者什么都不填,来观察李萌萌到底会说些什么。
T4级别的GPU已经可以很胜任这个任务了。
### 安装依赖
"""
from modeling_chatglm import ChatGLMForConditionalGeneration
import torch
import sys
from transformers import AutoTokenizer, GenerationConfig
# Load the base ChatGLM-6B checkpoint and cast its parameters to float32
# (``.float()``) before any adapter is attached.
model = ChatGLMForConditionalGeneration.from_pretrained("THUDM/chatglm-6b").float()
# The tokenizer ships custom code with the checkpoint, hence trust_remote_code.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
# Hub id (or local path) of the fine-tuned PEFT adapter; change it to your own.
peft_path = 'ljsabc/Fujisaki_GLM' # change it to your own
# Wrap the base model with the adapter weights, also requested in float32.
model = PeftModel.from_pretrained(
model,
peft_path,
torch_dtype=torch.float,
)
# dump a log to ensure everything works well
print(model.peft_config)
# We have to use full precision, as some tokens are >65535
# Switch to inference mode; evaluate() and evaluate_stream() below only generate.
model.eval()
# Make float32 CPU tensors the default for tensors created from here on.
# NOTE(review): torch.set_default_tensor_type is deprecated in recent PyTorch
# releases in favour of torch.set_default_dtype -- confirm before upgrading.
torch.set_default_tensor_type(torch.FloatTensor)
def evaluate(context, temperature, top_p, top_k):
    """Sample a single answer for ``context`` from the module-level model.

    The prompt sent to the model is ``"Context: {context}Answer: "``. The
    generated sequence is decoded and the text following the first
    ``"Answer: "`` marker (up to the next marker, if any) is returned.

    Args:
        context: Text placed in the ``Context:`` slot of the prompt.
        temperature: Sampling temperature. Coerced to a float and clamped to
            a small positive value, see the note in the body.
        top_p: Nucleus-sampling threshold passed to ``GenerationConfig``.
        top_k: Top-k cutoff passed to ``GenerationConfig``.

    Returns:
        The answer text. If the marker cannot be found in the decoded output,
        the whole decoded string is returned instead of raising.
    """
    # NOTE(review): the sampling warpers in transformers are understood to
    # reject a temperature that is not a strictly positive float -- confirm
    # for the pinned transformers version. Clamping keeps 0 usable.
    safe_temperature = max(float(temperature), 1e-2)

    generation_config = GenerationConfig(
        temperature=safe_temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=1,
        do_sample=True,
    )

    input_text = f"Context: {context}Answer: "
    ids = tokenizer.encode(input_text)
    input_ids = torch.tensor([ids], dtype=torch.long, device="cpu")

    with torch.no_grad():
        out = model.generate(
            input_ids=input_ids,
            max_length=160,
            generation_config=generation_config,
        )

    decoded = tokenizer.decode(out[0])
    parts = decoded.split("Answer: ")
    # The marker is not guaranteed to survive the encode/decode round trip;
    # fall back to the full text rather than failing with IndexError.
    if len(parts) < 2:
        return decoded
    return parts[1]
def _strip_line_breaks(text):
    """Return ``text`` with newline characters and ``<br>`` tags removed."""
    # NOTE(review): the line-break literal was garbled in the source this was
    # restored from. Both the raw newline and the HTML ``<br>`` form are
    # stripped here, on the assumption that the chat widget may hand messages
    # back with ``<br>`` in place of newlines -- confirm against the Gradio
    # version in use.
    return text.replace("\n", "").replace("<br>", "")


def evaluate_stream(msg, history, temperature, top_p):
    """Stream the model's reply to ``msg`` into the chat history.

    This is a generator: each partial response produced by
    ``model.stream_chat`` is written into the last history entry and the
    updated history is yielded together with an empty string.

    Args:
        msg: The new user message.
        history: List of ``[user_message, reply]`` pairs; mutated in place.
            ``None`` is treated as an empty history.
        temperature: Sampling temperature. Coerced to a float and clamped to
            a small positive value, see the note in the body.
        top_p: Nucleus-sampling threshold forwarded to ``stream_chat``.

    Yields:
        ``(history, "")`` after every streamed update.
    """
    # The clear-button handler stores None as the chatbot value, so a missing
    # history must not crash the next submission.
    if history is None:
        history = []

    # NOTE(review): the sampling warpers in transformers are understood to
    # reject a temperature that is not a strictly positive float, while the
    # UI slider starts at 0 -- confirm for the pinned transformers version.
    temperature = max(float(temperature), 1e-2)

    history.append([msg, None])
    # Keep at most four turns, the new one included.
    if len(history) > 4:
        history.pop(0)

    # User messages are cleaned in place, so the yielded history shows them
    # without line breaks as well.
    for turn in history:
        turn[0] = _strip_line_breaks(turn[0])

    # concatenate context: earlier turns become "question||reply||" and the
    # new message goes last. A turn whose reply never arrived (None)
    # contributes an empty reply instead of raising TypeError.
    earlier = "".join(turn[0] + "||" + (turn[1] or "") + "||" for turn in history[:-1])
    context = _strip_line_breaks(earlier + history[-1][0])

    # TODO: find a better way to keep the prompt from getting too long.
    CUTOFF = 224
    # Drop 15 characters at a time from the oldest end of the context until
    # the encoded prompt is at most CUTOFF tokens.
    # NOTE(review): the same CUTOFF is passed as max_length below; if that
    # limit also counts prompt tokens, a prompt close to CUTOFF leaves almost
    # no room for the reply -- confirm against stream_chat.
    while context and len(tokenizer.encode(context)) > CUTOFF:
        context = context[15:]

    print("History:", history)
    print("Context:", context)
    # The model always gets a fresh, empty chat history: every earlier turn
    # is already folded into ``context``.
    for response, _ in model.stream_chat(tokenizer, context, [], max_length=CUTOFF, top_p=top_p, temperature=temperature):
        history[-1][1] = response
        yield history, ""
import gradio as gr
# Text shown above the chat UI: a hint that declarative context tends to give
# better results than questions.
title = (
    "\n"
    "可能是因为数据的原因,相比于提问,陈述性的上下文更容易跑出更好的结果。\n"
)

# Text shown below the chat UI: project credits and a quote.
footer = (
    "项目在GitHub上托管,基于清华的THUDM/chatglm-6b项目。\n"
    "\"I'm... a boy.\" --Chihiro Fujisaki\n"
)

# NOTE(review): the nesting below was reconstructed from a flattened source;
# confirm the layout (in particular which column holds the clear button).
with gr.Blocks() as demo:
    gr.HTML(title)
    state = gr.State()

    with gr.Row():
        # Left column: sampling controls. No top-k slider is exposed;
        # evaluate_stream only takes temperature and top_p.
        with gr.Column(scale=2):
            temp = gr.components.Slider(
                label="Temperature",
                minimum=0,
                maximum=1.1,
                value=0.8,
                info="温度参数,越高的温度生成的内容越丰富,但是有可能出现语法问题。小的温度也能帮助生成更相关的回答。",
            )
            top_p = gr.components.Slider(
                label="Top-p",
                minimum=0.5,
                maximum=1.0,
                value=0.975,
                info="top-p参数,只输出前p>top-p的文字,越大生成的内容越丰富,但也可能出现语法问题。数字越小似乎上下文的衔接性越好。",
            )

        # Right column: conversation view, input box and reset button.
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="聊天框", info="")
            msg = gr.Textbox(
                label="输入框",
                placeholder="最近过得怎么样?",
                info="输入你的内容,按[Enter]发送。也可以什么都不填写生成随机数据。对话一般不能太长,否则就复读机了,建议清除数据。",
            )
            clear = gr.Button("清除聊天")

        # Submitting the textbox streams a reply into the chatbot and empties
        # the textbox (evaluate_stream yields "" for it).
        msg.submit(
            fn=evaluate_stream,
            inputs=[msg, chatbot, temp, top_p],
            outputs=[chatbot, msg],
        )
        # Clearing writes None into the chatbot without going through the queue.
        clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

    gr.HTML(footer)

# Queuing is enabled before launch; evaluate_stream is a generator handler.
demo.queue()
demo.launch(debug=False)