# JARVIS / app.py
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import threading
import os
import time
# Model config
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"
# Global variables
tokenizer = None
model = None
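# Lock to serialize access to the shared model so concurrent requests don't call generate() at the same time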
model_lock = threading.Lock()
# Lazy-load the model with quantization & offloading
def load_model():
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        os.makedirs(offload_dir, exist_ok=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,  # 8-bit quantization via bitsandbytes
            device_map="auto",
            offload_folder=offload_dir,  # spill weights that don't fit in memory to disk
            torch_dtype=torch.float16,
        )

# Chatbot prediction function
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []

    # Append user message
    history.append({"role": "user", "content": message})

    # Build dynamic system prompt
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )
    # Prepare messages for Qwen
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
reply = ""
try:
with model_lock:
with torch.no_grad():
start = time.time()
generated_ids = model.generate(**model_inputs, max_new_tokens=256)
if time.time() - start > 30:
reply = "[Response timed out]"
else:
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
except Exception as e:
reply = f"[Error: {str(e)}]"
    # Append bot reply and clear the input box
    history.append({"role": "assistant", "content": reply})
    return history, ""

# Keep-alive endpoint
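# Returns "pong" for the Ping button on the "Keep Alive" tab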
def keep_alive(msg="ping"):
return "pong"
# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg],
        )
    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Enable request queue (multi-user safe)
demo.queue() # simple queue; compatible with current Gradio versions
# Launch Space
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)