import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import spaces

model_name = "hosseinhimself/ISANG-v1.0-8B"
base_model_name = "unsloth/Meta-Llama-3.1-8B"

# Load the tokenizer once at import time.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Cache the model so the 8B weights are loaded only once per worker,
# not on every request.
_model = None


@spaces.GPU
def load_model():
    """Load the base model and attach the ISANG PEFT adapter, caching the result."""
    global _model
    if _model is not None:
        return _model
    try:
        # Load the base model.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        # Apply the fine-tuned PEFT adapter on top of the base model.
        _model = PeftModel.from_pretrained(base_model, model_name)
        print(f"Model loaded successfully. Using device: {_model.device}")
        return _model
    except Exception as e:
        print(f"Error loading model: {e}")
        raise


@spaces.GPU
def generate_text(prompt):
    model = load_model()
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            num_return_sequences=1,
            do_sample=True,  # required for temperature to have any effect
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,  # Llama has no dedicated pad token
        )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


gradio_app = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(label="Enter your message", lines=3),
    outputs=gr.Textbox(label="Chatbot Response"),
    title="ISANG Chatbot",
    description=f"""This is a simple chatbot powered by the ISANG model,
fine-tuned from {base_model_name}. Enter your message and see how the
chatbot responds!""",
    examples=[
        ["سلام، چطوری؟"],  # "Hi, how are you?"
        ["برام یه داستان تعریف کن"],  # "Tell me a story"
        ["بهترین کتابی که خوندی چی بوده؟"],  # "What's the best book you've read?"
        ["توی اوقات فراغتت چی کار می‌کنی؟"],  # "What do you do in your free time?"
        ["نظرت درباره هوش مصنوعی چیه؟"],  # "What do you think about AI?"
    ],
)

if __name__ == "__main__":
    gradio_app.launch()
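
# A minimal sketch of calling this app programmatically with gradio_client
# (a separate `pip install gradio_client`; assumes the app is already running
# on Gradio's default local port — adjust the URL for a hosted Space):
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   reply = client.predict("سلام، چطوری؟", api_name="/predict")  # "Hi, how are you?"
#   print(reply)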