#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import List, Tuple
from threading import Thread
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.streamers import TextIteratorStreamer
import torch
from project_settings import project_path
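# `greet` is a simple echo handler; it is not wired into the Gradio UI built in main() below.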
def greet(question: str, history: List[Tuple[str, str]]):
answer = "Hello " + question + "!"
result = history + [(question, answer)]
return result
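# cache of loaded checkpoints: maps pretrained_model_name_or_path -> {"model": ..., "tokenizer": ...}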
model_map: dict = dict()
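# load the requested checkpoint (or reuse it from the cache); switching to a new
# checkpoint first evicts whatever was loaded before, so only one model stays in memory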
def init_model(pretrained_model_name_or_path: str):
global model_map
if pretrained_model_name_or_path not in model_map:
# clear: drop references to any previously loaded checkpoint so it can be garbage collected
for k1, v1 in model_map.items():
for k2, v2 in v1.items():
del v2
model_map = dict()
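# optional hint to PyTorch to release cached GPU memory before the next load
# (torch.cuda.empty_cache() is a no-op when CUDA has not been initialised)
torch.cuda.empty_cache()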
# build model
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
# torch_dtype=torch.bfloat16,
torch_dtype=torch.float16,
device_map="auto",
offload_folder="./offload",
offload_state_dict=True,
# load_in_4bit=True,
)
if model.config.model_type == "chatglm":
model = model.eval()
else:
# model = model.bfloat16().eval()
model = model.eval()
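# build tokenizer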
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=True,
# the llama tokenizer does not support the fast implementation
use_fast=False if model.config.model_type == "llama" else True,
padding_side="left"
)
# QWenTokenizer is special: pad_token_id, bos_token_id and eos_token_id are all None;
# eod_id corresponds to the <|endoftext|> token, so it is reused for all three
if tokenizer.__class__.__name__ == "QWenTokenizer":
tokenizer.pad_token_id = tokenizer.eod_id
tokenizer.bos_token_id = tokenizer.eod_id
tokenizer.eos_token_id = tokenizer.eod_id
model_map[pretrained_model_name_or_path] = {
"model": model,
"tokenizer": tokenizer,
}
else:
model = model_map[pretrained_model_name_or_path]["model"]
tokenizer = model_map[pretrained_model_name_or_path]["tokenizer"]
return model, tokenizer
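# non-streaming variant: encode the history plus the new question, generate the full
# answer in one call, and return the updated history (the UI in main() uses the
# streaming variant below)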
def chat_with_llm_non_stream(question: str,
history: List[Tuple[str, str]],
pretrained_model_name_or_path: str,
max_new_tokens: int, top_p: float, temperature: float, repetition_penalty: float,
history_max_len: int,
):
device: str = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = init_model(pretrained_model_name_or_path)
# input_ids: ChatGLM prompts start empty, other models start with the BOS token
if model.config.model_type == "chatglm":
input_ids = []
else:
input_ids = [tokenizer.bos_token_id]
# history
utterances = list()
for idx, (h_question, h_answer) in enumerate(history):
if model.config.model_type == "chatglm":
h_question = "[Round {}]\n\n问:{}\n\n答:".format(idx, h_question)
utterances.append(h_question)
utterances.append(h_answer)
utterances.append(question)
encoded_utterances = tokenizer(utterances, add_special_tokens=False)
encoded_utterances = encoded_utterances["input_ids"]
for encoded_utterance in encoded_utterances:
input_ids.extend(encoded_utterance)
if model.config.model_type == "chatglm":
input_ids.append(tokenizer.eos_token_id)
input_ids = torch.tensor([input_ids], dtype=torch.long)
input_ids = input_ids[:, -history_max_len:].to(device)
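# generate the full answer in a single pass (no streaming)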
with torch.no_grad():
outputs = model.generate(
input_ids=input_ids,
max_new_tokens=max_new_tokens,
do_sample=True,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
eos_token_id=tokenizer.eos_token_id
)
outputs = outputs.tolist()[0][len(input_ids[0]):]
answer = tokenizer.decode(outputs)
answer = answer.strip().replace(tokenizer.eos_token, "").strip()
result = history + [(question, answer)]
return result
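# streaming variant wired to the UI: same prompt construction, but generation runs in a
# background thread and partial answers are yielded as they are produced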
def chat_with_llm_streaming(question: str,
history: List[Tuple[str, str]],
pretrained_model_name_or_path: str,
max_new_tokens: int, top_p: float, temperature: float, repetition_penalty: float,
history_max_len: int,
):
device: str = "cuda" if torch.cuda.is_available() else "cpu"
model, tokenizer = init_model(pretrained_model_name_or_path)
# history
utterances = list()
for idx, (h_question, h_answer) in enumerate(history):
if model.config.model_type == "chatglm":
h_question = "[Round {}]\n\n问:{}\n\n答:".format(idx, h_question)
elif model.config.model_type == "llama":
h_question = "Question: {}\n\nAnswer: ".format(h_question)
utterances.append(h_question)
utterances.append(h_answer)
utterances.append(question)
encoded_utterances = tokenizer(utterances, add_special_tokens=False)
encoded_utterances = encoded_utterances["input_ids"]
# input_ids: ChatGLM prompts start empty, other models start with the BOS token
if model.config.model_type == "chatglm":
input_ids = []
else:
input_ids = [tokenizer.bos_token_id]
for encoded_utterance in encoded_utterances:
input_ids.extend(encoded_utterance)
if model.config.model_type != "chatglm":
input_ids.append(tokenizer.eos_token_id)
input_ids = torch.tensor([input_ids], dtype=torch.long)
input_ids = input_ids[:, -history_max_len:].to(device)
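# stream decoded text as it is generated; skip_prompt is left at its default, so the
# prompt text is streamed back as well and the loop below strips the question from it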
streamer = TextIteratorStreamer(tokenizer=tokenizer)
generation_kwargs = dict(
inputs=input_ids,
max_new_tokens=max_new_tokens,
do_sample=True,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
streamer=streamer,
)
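# run generate() in a background thread so the streamed chunks can be consumed here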
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
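# consume the stream: drop the echoed question and EOS marker from each chunk,
# accumulate the rest, and yield the updated history so the Chatbot updates live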
answer = ""
for output_ in streamer:
output_ = output_.replace(question, "")
output_ = output_.replace(tokenizer.eos_token, "")
answer += output_
result = [(question, answer)]
yield history + result
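# build the Gradio Blocks UI: a chatbot, generation-parameter sliders, a model dropdown,
# and submit/clear controls wired to chat_with_llm_streaming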
def main():
description = """
Chat with open-source LLMs (Qwen-7B-Chat, ChatGLM2-6B, Baichuan2-7B-Chat, and an SFT LLaMA 2 model).
"""
with gr.Blocks() as blocks:
gr.Markdown(value=description)
chatbot = gr.Chatbot([], elem_id="chatbot", height=400)
with gr.Row():
with gr.Column(scale=4):
text_box = gr.Textbox(show_label=False, placeholder="Enter text and press enter", container=False)
with gr.Column(scale=1):
submit_button = gr.Button("💬Submit")
with gr.Column(scale=1):
clear_button = gr.Button(
'🗑️Clear',
variant='secondary',
)
with gr.Row():
with gr.Column(scale=1):
max_new_tokens = gr.Slider(minimum=0, maximum=512, value=512, step=1, label="max_new_tokens")
with gr.Column(scale=1):
top_p = gr.Slider(minimum=0, maximum=1, value=0.85, step=0.01, label="top_p")
with gr.Column(scale=1):
temperature = gr.Slider(minimum=0, maximum=1, value=0.35, step=0.01, label="temperature")
with gr.Column(scale=1):
repetition_penalty = gr.Slider(minimum=0, maximum=2, value=1.2, step=0.01, label="repetition_penalty")
with gr.Column(scale=1):
history_max_len = gr.Slider(minimum=0, maximum=4096, value=1024, step=1, label="history_max_len")
with gr.Row():
with gr.Column(scale=1):
model_name = gr.Dropdown(
choices=[
"Qwen/Qwen-7B-Chat",
"THUDM/chatglm2-6b",
"baichuan-inc/Baichuan2-7B-Chat",
"qgyd2021/sft_llama2_stack_exchange"
],
value="Qwen/Qwen-7B-Chat",
label="model_name",
)
gr.Examples(examples=["你好"], inputs=text_box)
inputs = [
text_box, chatbot, model_name,
max_new_tokens, top_p, temperature, repetition_penalty,
history_max_len
]
outputs = [
chatbot
]
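# pressing Enter in the textbox or clicking Submit both call the streaming handler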
text_box.submit(chat_with_llm_streaming, inputs, outputs)
submit_button.click(chat_with_llm_streaming, inputs, outputs)
clear_button.click(
fn=lambda: ('', []),
outputs=[text_box, chatbot],
queue=False,
api_name=False,
)
blocks.queue().launch()
return
if __name__ == '__main__':
main()