import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import threading
import os
import time

# Model config
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
offload_dir = "offload"

# Globals shared across requests
tokenizer = None
model = None
model_lock = threading.Lock()


# Lazy-load the model with 8-bit quantization & offloading.
# The lock prevents two concurrent requests from loading the model twice.
def load_model():
    global tokenizer, model
    with model_lock:
        if model is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            os.makedirs(offload_dir, exist_ok=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                # 8-bit loading via BitsAndBytesConfig (the bare
                # load_in_8bit kwarg is deprecated in recent transformers)
                quantization_config=BitsAndBytesConfig(load_in_8bit=True),
                device_map="auto",
                offload_folder=offload_dir,
                torch_dtype=torch.float16,
            )


# Chatbot prediction function
def predict(history, message, bot_name="Bot", personality="wise AI", tone="friendly"):
    load_model()
    history = history or []

    # Append the user message in the `messages` format gr.Chatbot expects
    history.append({"role": "user", "content": message})

    # Build a dynamic system prompt from the UI settings
    system_prompt = (
        f"You are {bot_name}, a {personality}.\n"
        f"You express emotion, think logically, and talk like a wise, "
        f"emotional, intelligent human being.\n"
        f"Your tone is always {tone}."
    )

    # Prepend the system prompt to the conversation for Qwen's chat template
    messages = [{"role": "system", "content": system_prompt}] + history

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    reply = ""
    try:
        with model_lock:
            with torch.no_grad():
                start = time.time()
                generated_ids = model.generate(**model_inputs, max_new_tokens=256)
                # generate() is blocking, so this is a post-hoc check rather
                # than a true timeout: slow responses are simply discarded.
                if time.time() - start > 30:
                    reply = "[Response timed out]"
                else:
                    # Strip the prompt tokens so only the new completion is decoded
                    generated_ids = [
                        output_ids[len(input_ids):]
                        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
                    ]
                    reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        reply = f"[Error: {str(e)}]"

    # Append the bot reply; return "" to clear the input textbox
    # (returning the reply here would dump it into the user's input box)
    history.append({"role": "assistant", "content": reply})
    return history, ""


# Keep-alive endpoint
def keep_alive(msg="ping"):
    return "pong"


# Gradio UI
with gr.Blocks() as demo:
    with gr.Tab("Chatbot"):
        chatbot = gr.Chatbot(type="messages")
        msg = gr.Textbox(placeholder="Type your message here...")
        bot_name_input = gr.Textbox(label="Bot Name", value="Bot")
        personality_input = gr.Textbox(label="Personality", value="wise AI")
        tone_input = gr.Textbox(label="Tone", value="friendly")
        msg.submit(
            predict,
            inputs=[chatbot, msg, bot_name_input, personality_input, tone_input],
            outputs=[chatbot, msg],
        )
    with gr.Tab("Keep Alive"):
        box = gr.Textbox(label="Ping", value="ping", interactive=False)
        gr.Button("Ping").click(keep_alive, inputs=None, outputs=box)

# Enable the request queue (multi-user safe)
demo.queue()  # simple queue; compatible with current Gradio versions

# Launch the Space
demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
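
# Assumed dependencies for this Space (not pinned in the original script):
#   pip install gradio transformers accelerate bitsandbytes torch
# `bitsandbytes` is required for 8-bit loading and `accelerate` for
# device_map="auto"; exact versions are left to the deployment environment.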