# JARVIS / app.py
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import threading
import os
import time
# Model config
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"
# Global variables
tokenizer = None
model = None
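# Lock to serialize access to the shared model so concurrent requests don't call generate() at the same time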
model_lock = threading.Lock()
# Lazy-load the model with quantization & offloading
def load_model():
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        os.makedirs(offload_dir, exist_ok=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,  # 8-bit quantization via bitsandbytes
            device_map="auto",
            offload_folder=offload_dir,  # spill weights that don't fit in memory to disk
            torch_dtype=torch.float16,
        )

# Chatbot prediction function
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []

    # Append user message
    history.append({"role": "user", "content": message})

    # Build dynamic system prompt
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )
    # Prepare messages for Qwen
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
reply = ""
try:
with model_lock:
with torch.no_grad():
start = time.time()
generated_ids = model.generate(**model_inputs, max_new_tokens=256)
if time.time() - start > 30:
reply = "[Response timed out]"
else:
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
except Exception as e:
reply = f"[Error: {str(e)}]"
    # Append bot reply and clear the input box
    history.append({"role": "assistant", "content": reply})
    return history, ""

# Keep-alive endpoint
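# Returns "pong" for the Ping button on the "Keep Alive" tab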
def keep_alive(msg="ping"):
return "pong"
# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg],
        )
    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Enable request queue (multi-user safe)
demo.queue() # simple queue; compatible with current Gradio versions
# Launch Space
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)