import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import threading
import os
import time
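
# Runtime requirements (assumed from the calls below): gradio, torch, and
# transformers, plus accelerate for device_map="auto" and bitsandbytes for
# load_in_8bit=True (8-bit bitsandbytes typically needs a CUDA GPU).
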
# Model config
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"
# Global variables
tokenizer = None
model = None
model_lock = threading.Lock()
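# The lock serializes generate() calls, since the model isn't safe to share
# across requests that Gradio may run on concurrent worker threads.
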
# Lazy-load the model with quantization & offloading
def load_model():
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        os.makedirs(offload_dir, exist_ok=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,
            device_map="auto",
            offload_folder=offload_dir,
            torch_dtype=torch.float16
        )
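        # Note (assumption about the installed version): passing load_in_8bit
        # directly is deprecated in recent transformers releases in favor of
        # quantization_config=BitsAndBytesConfig(load_in_8bit=True); both end
        # up building the same 8-bit config.
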
# Chatbot prediction function
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []
    # Append user message
    history.append({"role": "user", "content": message})
    # Build dynamic system prompt
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )
    # Prepare messages for Qwen
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
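    # apply_chat_template renders the system/user/assistant turns with Qwen's
    # chat template; add_generation_prompt=True appends the assistant header
    # so the model continues the conversation as the assistant.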
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
reply = ""
try:
with model_lock:
with torch.no_grad():
start = time.time()
generated_ids = model.generate(**model_inputs, max_new_tokens=256)
if time.time() - start > 30:
reply = "[Response timed out]"
else:
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
except Exception as e:
reply = f"[Error: {str(e)}]"
    # Append bot reply
    history.append({"role": "assistant", "content": reply})
    # Return updated history for the Chatbot and "" to clear the input box
    return history, ""

# Keep-alive endpoint
def keep_alive(msg="ping"):
    return "pong"
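# Presumably meant to be hit periodically (e.g., via gradio_client or the
# auto-generated API route) so a free Space is not idled out.
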
# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg]
        )
    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Enable request queue (multi-user safe)
demo.queue() # simple queue; compatible with current Gradio versions
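# queue() also accepts load-control arguments if traffic grows, e.g.
# demo.queue(max_size=16); defaults are fine for a single-user demo.
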
# Launch Space
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)