import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import threading
import os
import time

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"

# Lazily-loaded globals shared across requests; the lock serializes generation
# so concurrent Gradio requests never call model.generate() at the same time.
tokenizer = None
model = None
model_lock = threading.Lock()


def load_model():
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        os.makedirs(offload_dir, exist_ok=True)
        # 8-bit loading requires the bitsandbytes package; device_map="auto"
        # lets accelerate place layers on GPU/CPU and spill the rest to disk.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,
            device_map="auto",
            offload_folder=offload_dir,
            torch_dtype=torch.float16,
        )
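
# Note: recent transformers releases prefer passing quantization options via
# BitsAndBytesConfig rather than the bare load_in_8bit kwarg. A rough sketch of
# the equivalent call (assuming bitsandbytes is installed) would be:
#
#     from transformers import BitsAndBytesConfig
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         quantization_config=BitsAndBytesConfig(load_in_8bit=True),
#         device_map="auto",
#         offload_folder=offload_dir,
#     )
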
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []

    history.append({"role": "user", "content": message})

    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )

    # Prepend the system prompt, then replay the full conversation history.
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    reply = ""
    try:
        with model_lock:
            with torch.no_grad():
                start = time.time()
                generated_ids = model.generate(**model_inputs, max_new_tokens=256)
                # Soft timeout: generate() is not interrupted; the elapsed time
                # is only checked after it has already returned.
                if time.time() - start > 30:
                    reply = "[Response timed out]"
                else:
                    # Strip the prompt tokens so only newly generated text is decoded.
                    generated_ids = [
                        output_ids[len(input_ids):]
                        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
                    ]
                    reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        reply = f"[Error: {str(e)}]"

    history.append({"role": "assistant", "content": reply})
    # Return the updated history for the Chatbot and an empty string to clear the input box.
    return history, ""
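
# The 30-second check above cannot stop a generation that is already running.
# If a hard cap is wanted, transformers' generate() accepts a max_time argument
# (wall-clock seconds) that ends decoding early; a minimal sketch, assuming a
# partial reply is acceptable when the limit is hit:
#
#     generated_ids = model.generate(**model_inputs, max_new_tokens=256, max_time=30.0)
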
def keep_alive(msg="ping"):
    return "pong"


with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")

        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg],
        )

    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

demo.queue()

demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)