"""Test various models."""
# pylint: disable=invalid-name, line-too-long,broad-exception-caught, protected-access
import os
import time
from pathlib import Path

import gradio as gr
import torch
from loguru import logger
from transformers import AutoModel, AutoTokenizer

# ruff: noqa: E402
# os.system("pip install --upgrade torch transformers sentencepiece scipy cpm_kernels accelerate bitsandbytes loguru")

# os.system("pip install torch transformers sentencepiece loguru")


# fix timezone in Linux
os.environ["TZ"] = "Asia/Shanghai"
try:
    time.tzset()  # type: ignore # pylint: disable=no-member
except Exception:
    # time.tzset() is not available on Windows
    logger.warning("Windows: can't run time.tzset()")

model_name = "THUDM/chatglm2-6b-int4"  # 3.9G

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

has_cuda = torch.cuda.is_available()
# has_cuda = False  # force cpu

logger.debug("load")
if has_cuda:
    if model_name.endswith("int4"):
        # already-quantized int4 weights: load as-is, do not cast to half
        model = AutoModel.from_pretrained(model_name, trust_remote_code=True).cuda()
    else:
        model = (
            AutoModel.from_pretrained(model_name, trust_remote_code=True).half().cuda()
        )
else:
    model = (
        AutoModel.from_pretrained(model_name, trust_remote_code=True).float()
    )  # must use .float() for CPU inference

model = model.eval()
logger.debug("done load")

# tokenizer = AutoTokenizer.from_pretrained("openchat/openchat_v2_w")
# model = AutoModelForCausalLM.from_pretrained("openchat/openchat_v2_w", load_in_8bit_fp32_cpu_offload=True, load_in_8bit=True)

# locate model file cache
cache_loc = Path("~/.cache/huggingface/hub").expanduser()
model_cache_path = [
    elm
    for elm in cache_loc.rglob("*")
    if Path(model_name).name in elm.as_posix() and "pytorch_model.bin" in elm.as_posix()
]

logger.debug(f"{model_cache_path=}")

if model_cache_path:
    model_size_gb = model_cache_path[0].stat().st_size / 2**30
    logger.info(f"{model_name=} {model_size_gb=:.2f} GB")


def respond(message, chat_history):
    """Generate a response."""
    message = message.strip()
    # model.chat returns the updated history with the new (message, response)
    # pair already appended, so do not append it again here
    response, chat_history = model.chat(
        tokenizer,
        message,
        history=chat_history,
        temperature=0.7,
        repetition_penalty=1.2,
        max_length=128,
    )
    return message, chat_history
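
# a streaming sketch, assuming the remote-code model also exposes stream_chat
# (ChatGLM2 checkpoints do); stream_chat yields (partial_response, history)
# pairs, so a generator handler would let the Chatbot update incrementally
# (unused here, kept for reference)
def respond_stream(message, chat_history):
    """Stream a response, updating the chat history incrementally."""
    message = message.strip()
    for _response, history in model.stream_chat(
        tokenizer,
        message,
        history=chat_history,
        temperature=0.7,
        repetition_penalty=1.2,
        max_length=128,
    ):
        yield message, history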


theme = gr.themes.Soft(text_size="sm")
with gr.Blocks(theme=theme) as block:
    chatbot = gr.Chatbot()

    with gr.Row():
        with gr.Column(scale=12):
            msg = gr.Textbox()
        with gr.Column(scale=1, min_width=16):
            btn = gr.Button("Send")
        with gr.Column(scale=1, min_width=8):
            clear = gr.ClearButton([msg, chatbot])

    # clicking Send does not clear the prompt (Button fires on click, not submit)
    btn.click(respond, [msg, chatbot], [msg, chatbot])

    # pressing Enter clears the prompt (Textbox fires on submit, not click)
    msg.submit(lambda x, y: ("",) + respond(x, y)[1:], [msg, chatbot], [msg, chatbot])

    with gr.Accordion("Example inputs", open=True):
        etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
        examples = gr.Examples(
            examples=[
                ["Explain the plot of Cinderella in a sentence."],
                [
                    "How long does it take to become proficient in French, and what are the best methods for retaining information?"
                ],
                ["What are some common mistakes to avoid when writing code?"],
                ["Build a prompt to generate a beautiful portrait of a horse"],
                ["Suggest four metaphors to describe the benefits of AI"],
                ["Write a pop song about leaving home for the sandy beaches."],
                ["Write a summary demonstrating my ability to tame lions"],
                ["鲁迅和周树人什么关系"],
                ["从前有一头牛,这头牛后面有什么?"],
                ["正无穷大加一大于正无穷大吗?"],
                ["正无穷大加正无穷大大于正无穷大吗?"],
                ["-2的平方根等于什么"],
                ["树上有5只鸟,猎人开枪打死了一只。树上还有几只鸟?"],
                ["树上有11只鸟,猎人开枪打死了一只。树上还有几只鸟?提示:需考虑鸟可能受惊吓飞走。"],
                ["鲁迅和周树人什么关系 用英文回答"],
                ["以红楼梦的行文风格写一张委婉的请假条。不少于320字。"],
                [f"{etext} 翻成中文,列出3个版本"],
                [f"{etext} \n 翻成中文,保留原意,但使用文学性的语言。不要写解释。列出3个版本"],
                ["js 判断一个数是不是质数"],
                ["js 实现python 的 range(10)"],
                ["js 实现python 的 [*(range(10)]"],
                ["假定 1 + 2 = 4, 试求 7 + 8"],
                ["Erkläre die Handlung von Cinderella in einem Satz."],
                ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
            ],
            inputs=[msg],
            examples_per_page=60,
        )

block.queue().launch()
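# for a LAN-visible or public demo, queue/launch take options, e.g. (untested here)
# block.queue(max_size=16).launch(server_name="0.0.0.0", share=True)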