Spaces: Runtime error
NewBreaker committed
Commit: 47b54c6
Parent(s): 6647827
first
Browse files
- 1.py +14 -0
- 1wandb.py +7 -0
- 2 +1 -0
- PROJECT.md +18 -0
- README.md +0 -12
- api.py +60 -0
- api_use.py +14 -0
- app.py +35 -0
- cli_demo.py +57 -0
- demo_app.py +18 -0
- demo_mult_chats.py +67 -0
- demo_single_chat.py +52 -0
- requirements.txt +8 -0
- utils.py +54 -0
- web_demo.py +104 -0
- web_demo2.py +69 -0
- web_demo_old.py +45 -0
1.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="")
+model = AutoModel.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+kernel_file = ".\\models\\chatglm-6b-int4\\quantization_kernels.so"
+
+model = model.quantize(bits=4, kernel_file=kernel_file)
+model = model.eval()
+
+
+response, history = model.chat(tokenizer, "你好", history=[])
+print(response)
+
+
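Side note on 1.py: the `history` returned by `model.chat` can be fed back in for a follow-up turn, which is what demo_mult_chats.py below does in a loop. A minimal sketch (the second prompt is illustrative, not part of the commit):

response, history = model.chat(tokenizer, "Thanks, now please answer in English", history=history)
print(response)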
1wandb.py
ADDED
@@ -0,0 +1,7 @@
+import wandb
+
+wandb.login(key="b88cdc5f017d4e8c7b6a07aec184f577942139de")
+wandb.init(project="chatglm")
+print(1111)
+# import wandb
+#
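A minimal sketch of how the wandb run opened above could be used; the logged metric is illustrative, not something 1wandb.py actually records:

wandb.log({"demo_metric": 1})  # illustrative metric name
wandb.finish()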
2
ADDED
@@ -0,0 +1 @@
+你好
PROJECT.md
ADDED
@@ -0,0 +1,18 @@
+# Friendly Links
+
+The following are some open-source projects built on this repository:
+* [SwissArmyTransformer](https://github.com/THUDM/SwissArmyTransformer): a unified Transformer programming framework; ChatGLM-6B is implemented in SAT and supports P-tuning fine-tuning.
+* [ChatGLM-MNN](https://github.com/wangzhaode/ChatGLM-MNN): a C++ inference implementation of ChatGLM-6B based on MNN that automatically splits computation between GPU and CPU according to available GPU memory.
+* [ChatGLM-Tuning](https://github.com/mymusise/ChatGLM-Tuning): fine-tuning ChatGLM-6B with LoRA. A similar project is [Humanable ChatGLM/GPT Fine-tuning | ChatGLM 微调](https://github.com/hscspring/hcgf).
+* [langchain-ChatGLM](https://github.com/imClumsyPanda/langchain-ChatGLM): a ChatGLM application over local knowledge bases, built on LangChain.
+* [bibliothecarius](https://github.com/coderabbit214/bibliothecarius): quickly build services that integrate your local data with AI models; supports locally deployed models such as ChatGLM.
+* [闻达 (Wenda)](https://github.com/l15y/wenda): a large-language-model invocation platform that implements ChatPDF-like features on top of ChatGLM-6B.
+* [JittorLLMs](https://github.com/Jittor/JittorLLMs): runs ChatGLM-6B FP16 with as little as 3 GB of GPU memory, or even without a GPU; supports deployment on Linux, Windows, and macOS.
+* [ChatGLM-Finetuning](https://github.com/liucongg/ChatGLM-Finetuning): fine-tunes ChatGLM-6B on specific downstream tasks using Freeze, LoRA, P-tuning, etc., with experimental comparisons.
+* [InstructGLM](https://github.com/yanqiangmiffy/InstructGLM): instruction tuning for ChatGLM-6B; aggregates open-source Chinese and English instruction data, fine-tunes on it with LoRA, releases LoRA weights tuned on Alpaca and Belle, and fixes the web_demo repetition issue.
+* [ChatGLM-web](https://github.com/NCZkevin/chatglm-web): a ChatGLM demo website built with FastAPI and Vue3 (supports streaming output, adjusting model parameters from the front end, context selection, saving images, knowledge-base Q&A, and more).
+* [glm-bot](https://github.com/initialencounter/glm-bot): connects ChatGLM to Koishi so it can be called from major chat platforms.
+
+The following are some tutorials/documents for this project:
+* [Windows deployment guide](https://github.com/ZhangErling/ChatGLM-6B/blob/main/deployment_windows.md)
+* [ChatGLM-6B deployment and fine-tuning tutorial @ModelWhale platform](https://www.heywhale.com/mw/project/6436d82948f7da1fee2be59e)
README.md
DELETED
@@ -1,12 +0,0 @@
----
-title: Chatglm 6b Int4
-emoji: 📉
-colorFrom: purple
-colorTo: yellow
-sdk: gradio
-sdk_version: 3.27.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api.py
ADDED
@@ -0,0 +1,60 @@
+from fastapi import FastAPI, Request
+from transformers import AutoTokenizer, AutoModel
+import uvicorn, json, datetime
+import torch
+
+DEVICE = "cuda"
+DEVICE_ID = "0"
+CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
+
+
+def torch_gc():
+    if torch.cuda.is_available():
+        with torch.cuda.device(CUDA_DEVICE):
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+
+
+app = FastAPI()
+
+
+@app.post("/")
+async def create_item(request: Request):
+    global model, tokenizer
+    json_post_raw = await request.json()
+    json_post = json.dumps(json_post_raw)
+    json_post_list = json.loads(json_post)
+    prompt = json_post_list.get('prompt')
+    history = json_post_list.get('history')
+    max_length = json_post_list.get('max_length')
+    top_p = json_post_list.get('top_p')
+    temperature = json_post_list.get('temperature')
+    response, history = model.chat(tokenizer,
+                                   prompt,
+                                   history=history,
+                                   max_length=max_length if max_length else 2048,
+                                   top_p=top_p if top_p else 0.7,
+                                   temperature=temperature if temperature else 0.95)
+    now = datetime.datetime.now()
+    time = now.strftime("%Y-%m-%d %H:%M:%S")
+    answer = {
+        "response": response,
+        "history": history,
+        "status": 200,
+        "time": time
+    }
+    log = "[" + time + "] " + '", prompt:"' + prompt + '", response:"' + repr(response) + '"'
+    print(log)
+    torch_gc()
+    return answer
+
+
+if __name__ == '__main__':
+    # tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    # model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+    tokenizer = AutoTokenizer.from_pretrained("models/chatglm-6b-int4", trust_remote_code=True, revision="")
+    model = AutoModel.from_pretrained("models/chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+
+
+    model.eval()
+    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
api_use.py
ADDED
@@ -0,0 +1,14 @@
+import requests
+import json
+
+url = 'http://127.0.0.1:8000'
+headers = {
+    'Content-Type': 'application/json'
+}
+data = {
+    'prompt': '你好',
+    'history': []
+}
+
+response = requests.post(url=url, headers=headers, data=json.dumps(data))
+print(response.json())
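For reference, the handler in api.py builds its reply from the `answer` dict, so the request above should come back as JSON of roughly this shape (values are illustrative, not captured output):

{"response": "<model reply>", "history": [["你好", "<model reply>"]], "status": 200, "time": "2023-04-20 12:00:00"}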
app.py
ADDED
@@ -0,0 +1,35 @@
+from transformers import AutoTokenizer, AutoModel
+import gradio as gr
+
+tokenizer = AutoTokenizer.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="")
+model = AutoModel.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="").float().cpu()  # CPU inference: .cuda() fails without a GPU, and half precision is not supported on CPU
+
+
+# from transformers import AutoTokenizer, AutoModel
+# tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+# model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+# model = model.eval()
+
+
+# kernel_file = "./models/chatglm-6b-int4/quantization_kernels.so"
+# tokenizer = AutoTokenizer.from_pretrained("./models/chatglm-6b-int4", trust_remote_code=True, revision="")
+# model = AutoModel.from_pretrained("./models/chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+# model = AutoModel.from_pretrained("./models/chatglm-6b-int4", trust_remote_code=True, revision="").half()
+
+
+
+# model = model.quantize(bits=model_args.quantization_bit, kernel_file=kernel_file)
+
+model = model.eval()
+
+
+
+def chat(msg):
+    history = []
+    response, history = model.chat(tokenizer, msg, history=history)
+    print("response:", response)
+    return response
+
+
+iface = gr.Interface(fn=chat, inputs="text", outputs="text")
+iface.launch()
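If the Gradio interface in app.py needs to be reachable from outside the local machine (for example inside a container), `launch` also accepts a bind address and port; the values here are illustrative:

iface.launch(server_name="0.0.0.0", server_port=7860)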
cli_demo.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import platform
+import signal
+from transformers import AutoTokenizer, AutoModel
+
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+model = model.eval()
+
+os_name = platform.system()
+clear_command = 'cls' if os_name == 'Windows' else 'clear'
+stop_stream = False
+
+
+def build_prompt(history):
+    prompt = "欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序"
+    for query, response in history:
+        prompt += f"\n\n用户:{query}"
+        prompt += f"\n\nChatGLM-6B:{response}"
+    return prompt
+
+
+def signal_handler(signal, frame):
+    global stop_stream
+    stop_stream = True
+
+
+def main():
+    history = []
+    global stop_stream
+    print("欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
+    while True:
+        query = input("\n用户:")
+        if query.strip() == "stop":
+            break
+        if query.strip() == "clear":
+            history = []
+            os.system(clear_command)
+            print("欢迎使用 ChatGLM-6B 模型,输入内容即可进行对话,clear 清空对话历史,stop 终止程序")
+            continue
+        count = 0
+        for response, history in model.stream_chat(tokenizer, query, history=history):
+            if stop_stream:
+                stop_stream = False
+                break
+            else:
+                count += 1
+                if count % 8 == 0:
+                    os.system(clear_command)
+                    print(build_prompt(history), flush=True)
+                    signal.signal(signal.SIGINT, signal_handler)
+        os.system(clear_command)
+        print(build_prompt(history), flush=True)
+
+
+if __name__ == "__main__":
+    main()
demo_app.py
ADDED
@@ -0,0 +1,18 @@
+from transformers import AutoTokenizer, AutoModel
+import gradio as gr
+
+tokenizer = AutoTokenizer.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="")
+model = AutoModel.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+model = model.eval()
+
+
+
+def chat(msg):
+    history = []
+    response, history = model.chat(tokenizer, msg, history=history)
+    print("response:", response)
+    return response
+
+
+iface = gr.Interface(fn=chat, inputs="text", outputs="text")
+iface.launch()
demo_mult_chats.py
ADDED
@@ -0,0 +1,67 @@
+from transformers import AutoTokenizer, AutoModel
+tokenizer = AutoTokenizer.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="")
+model = AutoModel.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+model = model.eval()
+
+
+
+
+def parse_text(text):
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split('`')
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = f'<br></code></pre>'
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", "\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines)
+    return text
+
+def predict(input, chatbot, max_length, top_p, temperature, history):
+    chatbot.append((parse_text(input), ""))
+    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
+                                               temperature=temperature):
+        chatbot[-1] = (parse_text(input), parse_text(response))
+
+        yield chatbot, history
+
+response_new = ''
+history = []
+
+for i in range(3000):
+    length_history = len(history)
+    if (length_history > 5):  # if the conversation history gets too long, forget the earliest exchange
+        del history[0]
+        del history[0]
+    print('\033[1;31m{}\033[0m'.format('\nYou:'), end='')
+    msg = input()
+    print('\033[1;34m{}\033[0m'.format('ChatGLM:'), end='')
+
+    for chatbot, history in predict(input=msg, chatbot=[], max_length=10000, top_p=0.5, temperature=0.5, history=history):
+        response_old = response_new
+        response_new = chatbot[0][1]
+        new_single = response_new.replace(response_old, '')
+        print(new_single, end='')
+
+
+
demo_single_chat.py
ADDED
@@ -0,0 +1,52 @@
+from transformers import AutoTokenizer, AutoModel
+tokenizer = AutoTokenizer.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="")
+model = AutoModel.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+kernel_file = ".\\models\\chatglm-6b-int4\\quantization_kernels.so"  # same kernel path as in 1.py above
+model = model.quantize(bits=4, kernel_file=kernel_file)
+model = model.eval()
+
+def parse_text(text):
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split('`')
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = f'<br></code></pre>'
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", "\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines)
+    return text
+
+def predict(input, chatbot, max_length, top_p, temperature, history):
+    chatbot.append((parse_text(input), ""))
+    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
+                                               temperature=temperature):
+        chatbot[-1] = (parse_text(input), parse_text(response))
+
+        yield chatbot, history
+response_new = ''
+history = []
+for chatbot, history in predict('请写一篇1000字的散文', chatbot=[], max_length=10000, top_p=0.5, temperature=0.5, history=history):
+    response_old = response_new
+    response_new = chatbot[0][1]
+    new_single = response_new.replace(response_old, '')
+    print(new_single, end='')
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+protobuf
+transformers==4.27.1
+cpm_kernels
+torch>=1.10
+gradio
+mdtex2html
+sentencepiece
+accelerate
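The pinned dependencies above are installed the usual way:

pip install -r requirements.txt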
utils.py
ADDED
@@ -0,0 +1,54 @@
+import os
+from typing import Dict, Tuple, Union, Optional
+
+from torch.nn import Module
+from transformers import AutoModel
+
+
+def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
+    # transformer.word_embeddings takes up 1 layer
+    # transformer.final_layernorm and lm_head take up 1 layer
+    # transformer.layers take up 28 layers
+    # 30 layers in total, distributed across num_gpus cards
+    num_trans_layers = 28
+    per_gpu_layers = 30 / num_gpus
+
+    # bugfix: on Linux, the weight and input passed to torch.embedding may sit on different devices, causing a RuntimeError
+    # on Windows, model.device is set to transformer.word_embeddings.device
+    # on Linux, model.device is set to lm_head.device
+    # when chat or stream_chat is called, input_ids is placed on model.device
+    # if transformer.word_embeddings.device differs from model.device, a RuntimeError is raised
+    # so transformer.word_embeddings, transformer.final_layernorm and lm_head are all kept on the first card
+    device_map = {'transformer.word_embeddings': 0,
+                  'transformer.final_layernorm': 0, 'lm_head': 0}
+
+    used = 2
+    gpu_target = 0
+    for i in range(num_trans_layers):
+        if used >= per_gpu_layers:
+            gpu_target += 1
+            used = 0
+        assert gpu_target < num_gpus
+        device_map[f'transformer.layers.{i}'] = gpu_target
+        used += 1
+
+    return device_map
+
+
+def load_model_on_gpus(checkpoint_path: Union[str, os.PathLike], num_gpus: int = 2,
+                       device_map: Optional[Dict[str, int]] = None, **kwargs) -> Module:
+    if num_gpus < 2 and device_map is None:
+        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half().cuda()
+    else:
+        from accelerate import dispatch_model
+
+        model = AutoModel.from_pretrained(checkpoint_path, trust_remote_code=True, **kwargs).half()
+
+        if device_map is None:
+            device_map = auto_configure_device_map(num_gpus)
+
+        model = dispatch_model(model, device_map=device_map)
+
+    return model
+
+
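A small usage sketch for the helpers in utils.py (the checkpoint name is illustrative). With `num_gpus=2`, `auto_configure_device_map` keeps `transformer.word_embeddings`, `transformer.final_layernorm` and `lm_head` on GPU 0 together with transformer layers 0-12, and places layers 13-27 on GPU 1:

from utils import auto_configure_device_map, load_model_on_gpus

print(auto_configure_device_map(2))  # shows the 2-GPU split described above
# model = load_model_on_gpus("THUDM/chatglm-6b", num_gpus=2)  # illustrative checkpoint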
web_demo.py
ADDED
@@ -0,0 +1,104 @@
+from transformers import AutoModel, AutoTokenizer
+import gradio as gr
+import mdtex2html
+
+# tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+# model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+tokenizer = AutoTokenizer.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="")
+model = AutoModel.from_pretrained(".\\models\\chatglm-6b-int4", trust_remote_code=True, revision="").half().cuda()
+
+model = model.eval()
+
+"""Override Chatbot.postprocess"""
+
+
+def postprocess(self, y):
+    if y is None:
+        return []
+    for i, (message, response) in enumerate(y):
+        y[i] = (
+            None if message is None else mdtex2html.convert((message)),
+            None if response is None else mdtex2html.convert(response),
+        )
+    return y
+
+
+gr.Chatbot.postprocess = postprocess
+
+
+def parse_text(text):
+    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split('`')
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = f'<br></code></pre>'
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", "\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines)
+    return text
+
+
+def predict(input, chatbot, max_length, top_p, temperature, history):
+    chatbot.append((parse_text(input), ""))
+    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
+                                               temperature=temperature):
+        chatbot[-1] = (parse_text(input), parse_text(response))
+
+        yield chatbot, history
+
+
+def reset_user_input():
+    return gr.update(value='')
+
+
+def reset_state():
+    return [], []
+
+
+with gr.Blocks() as demo:
+    gr.HTML("""<h1 align="center">ChatGLM</h1>""")
+
+    chatbot = gr.Chatbot()
+    with gr.Row():
+        with gr.Column(scale=4):
+            with gr.Column(scale=12):
+                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
+                    container=False)
+            with gr.Column(min_width=32, scale=1):
+                submitBtn = gr.Button("Submit", variant="primary")
+        with gr.Column(scale=1):
+            emptyBtn = gr.Button("Clear History")
+            max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
+            top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top P", interactive=True)
+            temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)
+
+    history = gr.State([])
+
+    submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], [chatbot, history],
+                    show_progress=True)
+    submitBtn.click(reset_user_input, [], [user_input])
+
+    emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)
+
+demo.queue().launch(share=False, inbrowser=True)
web_demo2.py
ADDED
@@ -0,0 +1,69 @@
+from transformers import AutoModel, AutoTokenizer
+import streamlit as st
+from streamlit_chat import message
+
+
+st.set_page_config(
+    page_title="ChatGLM-6b 演示",
+    page_icon=":robot:"
+)
+
+
+@st.cache_resource
+def get_model():
+    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+    model = model.eval()
+    return tokenizer, model
+
+
+MAX_TURNS = 20
+MAX_BOXES = MAX_TURNS * 2
+
+
+def predict(input, max_length, top_p, temperature, history=None):
+    tokenizer, model = get_model()
+    if history is None:
+        history = []
+
+    with container:
+        if len(history) > 0:
+            for i, (query, response) in enumerate(history):
+                message(query, avatar_style="big-smile", key=str(i) + "_user")
+                message(response, avatar_style="bottts", key=str(i))
+
+        message(input, avatar_style="big-smile", key=str(len(history)) + "_user")
+        st.write("AI正在回复:")
+        with st.empty():
+            for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
+                                                        temperature=temperature):
+                query, response = history[-1]
+                st.write(response)
+
+    return history
+
+
+container = st.container()
+
+# create a prompt text for the text generation
+prompt_text = st.text_area(label="用户命令输入",
+                           height=100,
+                           placeholder="请在这儿输入您的命令")
+
+max_length = st.sidebar.slider(
+    'max_length', 0, 4096, 2048, step=1
+)
+top_p = st.sidebar.slider(
+    'top_p', 0.0, 1.0, 0.6, step=0.01
+)
+temperature = st.sidebar.slider(
+    'temperature', 0.0, 1.0, 0.95, step=0.01
+)
+
+if 'state' not in st.session_state:
+    st.session_state['state'] = []
+
+if st.button("发送", key="predict"):
+    with st.spinner("AI正在思考,请稍等........"):
+        # text generation
+        st.session_state["state"] = predict(prompt_text, max_length, top_p, temperature, st.session_state["state"])
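web_demo2.py is a Streamlit app rather than a Gradio one, so it is started with the Streamlit CLI:

streamlit run web_demo2.py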
web_demo_old.py
ADDED
@@ -0,0 +1,45 @@
+from transformers import AutoModel, AutoTokenizer
+import gradio as gr
+
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
+model = model.eval()
+
+MAX_TURNS = 20
+MAX_BOXES = MAX_TURNS * 2
+
+
+def predict(input, max_length, top_p, temperature, history=None):
+    if history is None:
+        history = []
+    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
+                                               temperature=temperature):
+        updates = []
+        for query, response in history:
+            updates.append(gr.update(visible=True, value="用户:" + query))
+            updates.append(gr.update(visible=True, value="ChatGLM-6B:" + response))
+        if len(updates) < MAX_BOXES:
+            updates = updates + [gr.Textbox.update(visible=False)] * (MAX_BOXES - len(updates))
+        yield [history] + updates
+
+
+with gr.Blocks() as demo:
+    state = gr.State([])
+    text_boxes = []
+    for i in range(MAX_BOXES):
+        if i % 2 == 0:
+            text_boxes.append(gr.Markdown(visible=False, label="提问:"))
+        else:
+            text_boxes.append(gr.Markdown(visible=False, label="回复:"))
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter", lines=11).style(
+                container=False)
+        with gr.Column(scale=1):
+            max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
+            top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top P", interactive=True)
+            temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)
+            button = gr.Button("Generate")
+    button.click(predict, [txt, max_length, top_p, temperature, state], [state] + text_boxes)
+demo.queue().launch(share=False, inbrowser=True)