import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import threading
import os
import time

# Model config
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"

# Globals shared across requests
tokenizer = None
model = None
model_lock = threading.Lock()


# Lazy-load the model with 8-bit quantization & offloading.
# The lock prevents two concurrent requests from loading the model twice.
def load_model():
    global tokenizer, model
    with model_lock:
        if model is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            os.makedirs(offload_dir, exist_ok=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                # 8-bit loading via BitsAndBytesConfig (the bare
                # load_in_8bit kwarg is deprecated in recent transformers)
                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                device_map="auto",
                offload_folder=offload_dir,
                torch_dtype=torch.float16,
            )


# Chatbot prediction function
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []

    # Append the user message in the `messages` format gr.Chatbot expects
    history.append({"role": "user", "content": message})

    # Build a dynamic system prompt from the UI settings
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, "
        f"emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )

    # Prepend the system prompt to the conversation for Qwen's chat template
    messages = [{"role": "system", "content": system_prompt}] + history

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    reply = ""
    try:
        with model_lock:
            with torch.no_grad():
                start = time.time()
                generated_ids = model.generate(**model_inputs, max_new_tokens=256)
                # generate() is blocking, so this is a post-hoc check rather
                # than a true timeout: slow responses are simply discarded.
                if time.time() - start > 30:
                    reply = "[Response timed out]"
                else:
                    # Strip the prompt tokens so only the new completion is decoded
                    generated_ids = [
                        output_ids[len(input_ids):]
                        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
                    ]
                    reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        reply = f"[Error: {str(e)}]"

    # Append the bot reply; return "" to clear the input textbox
    # (returning the reply here would dump it into the user's input box)
    history.append({"role": "assistant", "content": reply})
    return history, ""


# Keep-alive endpoint
def keep_alive(msg="ping"):
    return "pong"


# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg],
        )
    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Enable the request queue (multi-user safe)
demo.queue()  # simple queue; compatible with current Gradio versions

# Launch the Space
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
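
# Assumed dependencies for this Space (not pinned in the original script):
#   pip install gradio transformers accelerate bitsandbytes torch
# `bitsandbytes` is required for 8-bit loading and `accelerate` for
# device_map="auto"; exact versions are left to the deployment environment.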