File size: 3,857 Bytes
c5d6bc2
baae243
 
16c80da
5e6b787
 
baae243
16c80da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e6b787
 
16c80da
baae243
6ddacd8
baae243
 
 
 
 
 
 
 
16c80da
 
5e6b787
 
 
 
16c80da
5e6b787
baae243
16c80da
baae243
16c80da
baae243
5e6b787
 
 
 
 
 
baae243
5e6b787
 
 
 
 
 
 
 
 
 
baae243
5e6b787
 
 
 
 
baae243
5e6b787
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
baae243
5e6b787
 
 
 
 
baae243
5e6b787
baae243
5e6b787
 
baae243
 
 
 
 
 
 
 
 
 
 
 
5e6b787
baae243
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import spaces
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import subprocess

# Shell commands executed once at import time, paired with the environment
# mapping passed to each one.
# NOTE(review): `env=` replaces the child's environment instead of extending
# it, so neither command inherits this process's variables (PATH included).
for _command, _environment in (
    # Remove everything under /data-nvme/zerogpu-offload/ (the shell expands
    # the glob, which is why shell=True is needed).
    ("rm -rf /data-nvme/zerogpu-offload/*", {}),
    # Install flash-attn with FLASH_ATTENTION_SKIP_CUDA_BUILD set to "TRUE".
    (
        "pip install flash-attn --no-build-isolation",
        {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    ),
):
    subprocess.run(_command, env=_environment, shell=True)

# Extra keyword arguments forwarded to the model's `from_pretrained` call.
# Empty by default; the commented-out line further down shows how a
# quantization config would be plugged in here.
kwargs = {}

"""
https://hugging-face.cn/docs/transformers/quantization/bitsandbytes
"""

# Optional 4-bit (NF4) quantization setup, currently disabled:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Optional 8-bit quantization setup, currently disabled:
# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     # llm_int8_enable_fp32_cpu_offload=True,
# )

# kwargs = { "quantization_config": quantization_config, "low_cpu_mem_usage": True }

# Hub repository that both the tokenizer and the model are loaded from.
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the weights in bfloat16 and move the model to the CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    **kwargs,
).cuda()

def _build_messages(message, history, system_message):
    """Assemble the role/content message list fed to the chat template.

    Args:
        message: The current user message (already defaulted if it was empty).
        history: Previous turns. Expected as ``(user, assistant)`` pairs, as
            annotated on ``respond``; ``{"role", "content"}`` dict entries are
            accepted too. May be ``None`` or empty.
        system_message: Optional system prompt; skipped when empty.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts ending with the
        current user message.
    """
    messages = []

    def _add(role, content):
        # Only plain, non-empty text can be rendered by the chat template;
        # anything else (missing reply, non-text payload) is skipped.
        if isinstance(content, str) and content:
            messages.append({"role": role, "content": content})

    _add("system", system_message)

    for turn in history or []:
        if isinstance(turn, dict):
            # NOTE(review): handles role/content dict history entries in case
            # the ChatInterface is switched to that format - confirm if used.
            if turn.get("role") in ("user", "assistant"):
                _add(turn["role"], turn.get("content"))
            continue
        _add("user", turn[0])
        _add("assistant", turn[1])

    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate the assistant's reply for one chat turn.

    The system message, the earlier turns in ``history`` and the new user
    message are rendered through the tokenizer's chat template, and the
    module-level ``model`` samples a continuation.

    Args:
        message: The user's new message. When empty, a default
            "quick sort" prompt is used instead.
        history: Earlier ``(user, assistant)`` turns of the conversation.
        system_message: System prompt from the UI; ignored when empty.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling probability mass passed to ``generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are sliced off), with special tokens removed.
    """
    if not message:
        message = "write a quick sort algorithm in python."

    messages = _build_messages(message, history, system_message)

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    outputs = model.generate(
        inputs,
        # The UI slider may hand over a float; generate() needs an integer.
        max_new_tokens=int(max_tokens),
        do_sample=True,
        temperature=temperature,
        top_k=50,
        top_p=top_p,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; keep only the completion.
    prompt_length = len(inputs[0])
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     if len(message) < 1:
#         message = "write a quick sort algorithm in python."

#     messages.append({"role": "user", "content": message})

#     response = ""

#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content

#         response += token
#         yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/main/docs/gradio/chatinterface
"""

# Custom stylesheet for the interface; the selector targets the element whose
# id is "msg_input", which is the message textbox created below.
css = """
#msg_input {
    flex-grow: 7;
}
"""

# Components are created in the same order as they are handed to the
# ChatInterface: the message box first, then the additional inputs.
message_box = gr.Textbox(
    elem_id="msg_input",
    placeholder="write a quick sort algorithm in python.",
)
system_message_box = gr.Textbox(
    value="You are a friendly Chatbot.",
    label="System message",
)
max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2048,
    value=512,
    step=1,
    label="Max new tokens",
)
temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat UI wired to `respond`; the additional inputs are listed in the order
# of respond's system_message, max_tokens, temperature and top_p parameters.
demo = gr.ChatInterface(
    fn=respond,
    textbox=message_box,
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
    ],
    css=css,
)


# Launch the Gradio app only when this file is run as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()