import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import threading
import os
import time
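
# Runtime requirements (assumed from the calls below): gradio, torch, and
# transformers, plus accelerate for device_map="auto" and bitsandbytes for
# load_in_8bit=True (8-bit bitsandbytes typically needs a CUDA GPU).
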
# Model config
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"
# Global variables
tokenizer = None
model = None
model_lock = threading.Lock()
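# The lock serializes generate() calls, since the model isn't safe to share
# across requests that Gradio may run on concurrent worker threads.
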
# Lazy-load the model with quantization & offloading
def load_model():
    global tokenizer, model
    if model is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        os.makedirs(offload_dir, exist_ok=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,
            device_map="auto",
            offload_folder=offload_dir,
            torch_dtype=torch.float16
        )
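        # Note (assumption about the installed version): passing load_in_8bit
        # directly is deprecated in recent transformers releases in favor of
        # quantization_config=BitsAndBytesConfig(load_in_8bit=True); both end
        # up building the same 8-bit config.
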
# Chatbot prediction function
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []
    # Append user message
    history.append({"role": "user", "content": message})
    # Build dynamic system prompt
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )
    # Prepare messages for Qwen
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
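    # apply_chat_template renders the system/user/assistant turns with Qwen's
    # chat template; add_generation_prompt=True appends the assistant header
    # so the model continues the conversation as the assistant.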
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
reply = ""
try:
with model_lock:
with torch.no_grad():
start = time.time()
generated_ids = model.generate(**model_inputs, max_new_tokens=256)
if time.time() - start > 30:
reply = "[Response timed out]"
else:
generated_ids = [
output_ids[len(input_ids):]
for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
except Exception as e:
reply = f"[Error: {str(e)}]"
    # Append bot reply
    history.append({"role": "assistant", "content": reply})
    # Return updated history for the Chatbot and "" to clear the input box
    return history, ""

# Keep-alive endpoint
def keep_alive(msg="ping"):
    return "pong"
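# Presumably meant to be hit periodically (e.g., via gradio_client or the
# auto-generated API route) so a free Space is not idled out.
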
# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg]
        )
    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Enable request queue (multi-user safe)
demo.queue() # simple queue; compatible with current Gradio versions
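# queue() also accepts load-control arguments if traffic grows, e.g.
# demo.queue(max_size=16); defaults are fine for a single-user demo.
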
# Launch Space
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)