# -*- coding: utf-8 -*-
"""Untitled8.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
"""

import os

import gradio as gr
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS  # FAISS lives in the langchain-community package
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Step 0: Retrieve the OpenAI API key (stored as the Space secret "tauhid")
openai_api_key = os.getenv("tauhid")
print(f"API key retrieved: {'[NOT FOUND]' if not openai_api_key else '[FOUND - first 4 chars: ' + openai_api_key[:4] + ']'}")
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found. Add the 'tauhid' secret to the Space settings.")

# Explicitly set the environment variable so downstream LangChain clients pick it up
os.environ["OPENAI_API_KEY"] = openai_api_key

# Step 1: Load the System Prompt
prompt_path = "system_prompt.txt"  # Ensure this file is in the same directory
if not os.path.exists(prompt_path):
    raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")

with open(prompt_path, "r") as file:
    system_prompt = file.read()

# Step 2: Load the Retrieval Database
csv_path = "retrievaldb.csv"  # Ensure this file is in the same directory
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")

df = pd.read_csv(csv_path)

# Step 3: Preprocess the Data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []

# Split each row's text into chunks and attach the row's metadata to every chunk
for _, row in df.iterrows():
    chunk_text = row.get("chunk_text", "")
    if pd.notna(chunk_text):
        for chunk in text_splitter.split_text(chunk_text):
            texts.append(chunk)
            metadatas.append({
                "source": row.get("content_source", "Unknown Source"),
                "title": row.get("document_name", "Unknown Document"),
                "page": row.get("page_number", "N/A"),
                "topic": row.get("main_topic", "N/A"),
                "week": row.get("metadata", "N/A"),
            })

if len(texts) != len(metadatas):
    raise ValueError("Mismatch between texts and metadata after preprocessing.")

# Step 4: Create the Vector Store
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)

# Step 5: Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key
)

# Step 6: Set Up the RetrievalQA Chain
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Concatenate retrieved chunks into the prompt context
    retriever=retriever,
    return_source_documents=False  # Do not include source documents in the response
)

# Step 7: Define Query Function
def query_bradgpt(user_input):
    # Prepend the system prompt to every query
    full_prompt = f"""
    {system_prompt}

    User: {user_input}
    Assistant:
    """
    response = qa_chain.invoke({"query": full_prompt})
    return response["result"]  # Return the main answer only


# Step 8: Gradio Interface
def respond(message):
    return query_bradgpt(message)


demo = gr.Interface(
    fn=respond,
    inputs=gr.Textbox(
        label="Your question",
        placeholder="Ask BradGPT anything about CPSC 183!",
        lines=3
    ),
    outputs=gr.Textbox(
        label="Response",
        lines=10
    ),
    title="BradGPT",
    description="Ask BradGPT questions about CPSC 183 course readings or topics.",
    theme="monochrome"
)

if __name__ == "__main__":
    demo.launch()
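
# --- Optional: quick smoke test without the Gradio UI ---
# A minimal sketch (assumption: run interactively in a notebook cell or REPL after the
# code above has executed); the question text below is purely illustrative.
#
#     answer = query_bradgpt("Summarize the main argument of this week's reading.")
#     print(answer)
#
# This exercises the full retrieval path (FAISS retriever -> RetrievalQA -> gpt-4o-mini)
# and helps verify the API key and vector store before launching the app.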