import spaces
import gradio as gr
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from pydantic import BaseModel

PERSIST_DIR = './storage'

# Configure the settings
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Pydantic config to avoid protected namespace warning
class Config(BaseModel):
    model_config = {'protected_namespaces': ()}


# @spaces.GPU(duration=240)
def setup():
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5", device=DEVICE)
    Settings.llm = HuggingFaceLLM(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        tokenizer_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        context_window=2048,
        max_new_tokens=256,
        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
        device_map="auto",
    )


setup()


# Load the existing index
# @spaces.GPU
def load_context():
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
    query_engine = index.as_query_engine()
    return query_engine


query_engine = None


def initialize_query_engine():
    global query_engine
    query_engine = load_context()


# Initialize query engine at the start
initialize_query_engine()


# Chatbot response function
@spaces.GPU
def chatbot_response(message, history):
    if query_engine is None:
        initialize_query_engine()
    response = query_engine.query(message)
    return str(response)


# Initialize Gradio interface
iface = gr.ChatInterface(
    fn=chatbot_response,
    title="UESP Lore Chatbot: CPU bound version of Phi-3-mini",
    description=(
        "Low quality and extremely slow version of the ones you can find on the github page: "
        "https://github.com/emarron/UESP-lore. I am not paying to have Llama3 on here."
    ),
    examples=["Who is Zaraphus?"],
    cache_examples=True,
)

if __name__ == "__main__":
    iface.launch()