# -*- coding: utf-8 -*-
"""Untitled8.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
"""
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr
# Step 0: Retrieve the OpenAI API key (stored as the Space secret "tauhid")
openai_api_key = os.getenv("tauhid")
print(f"API key retrieved: {'[NOT FOUND]' if not openai_api_key else '[FOUND - first 4 chars: ' + openai_api_key[:4] + ']'}")
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found. Add it as a Space secret named 'tauhid'.")
# Explicitly set the environment variable so downstream LangChain components can pick it up
os.environ["OPENAI_API_KEY"] = openai_api_key
# Step 1: Load the System Prompt
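# The prompt file holds BradGPT's instructions; it is prepended to every user query in query_bradgpt() below.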
prompt_path = "system_prompt.txt" # Ensure this file is in the same directory
if not os.path.exists(prompt_path):
raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")
with open(prompt_path, "r", encoding="utf-8") as file:
system_prompt = file.read()
# Step 2: Load the Retrieval Database
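# retrievaldb.csv is expected to hold one chunk of course material per row, using the column names read in Step 3.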
csv_path = "retrievaldb.csv" # Ensure this file is in the same directory
if not os.path.exists(csv_path):
raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")
# Load the CSV
df = pd.read_csv(csv_path)
# Step 3: Preprocess the Data
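# Split each row's text into ~1000-character chunks with 100 characters of overlap so related context isn't cut off at chunk boundaries.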
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []
# Process each row to chunk text and attach metadata
for _, row in df.iterrows():
chunk_text = row.get("chunk_text", "")
if pd.notna(chunk_text):
chunks = text_splitter.split_text(chunk_text)
for chunk in chunks:
texts.append(chunk)
metadatas.append({
"source": row.get("content_source", "Unknown Source"),
"title": row.get("document_name", "Unknown Document"),
"page": row.get("page_number", "N/A"),
"topic": row.get("main_topic", "N/A"),
"week": row.get("metadata", "N/A")
})
if len(texts) != len(metadatas):
raise ValueError("Mismatch between texts and metadata after preprocessing.")
# Step 4: Create the Vector Store
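# Embed every chunk with OpenAI embeddings and index the vectors in FAISS so the retriever can run similarity search over them.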
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_texts(
texts=texts,
embedding=embeddings,
metadatas=metadatas
)
# Step 5: Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key
)
# Step 6: Set Up the RetrievalQA Chain
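# For each query, the retriever pulls the 5 most similar chunks and the "stuff" chain packs them into the LLM's context window.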
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff", # Concatenates retrieved chunks for context
retriever=retriever,
return_source_documents=False # Do not include source documents in the response
)
# Step 7: Define Query Function
def query_bradgpt(user_input):
    # Prepend the system prompt so every query carries BradGPT's instructions
    full_prompt = f"""
{system_prompt}
User: {user_input}
Assistant:
"""
    response = qa_chain.invoke({"query": full_prompt})
    return response["result"]  # Return the main answer only
# Step 8: Gradio Interface
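# A simple single-turn interface: each submitted question is passed to respond(), which runs the RetrievalQA chain.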
def respond(message):
    return query_bradgpt(message)
demo = gr.Interface(
fn=respond,
inputs=gr.Textbox(
label="Your question",
placeholder="Ask BradGPT anything about CPSC 183!",
lines=3
),
outputs=gr.Textbox(
label="Response",
lines=10
),
title="BradGPT",
description="Ask BradGPT questions about CPSC 183 course readings or topics.",
theme="monochrome"
)
if __name__ == "__main__":
demo.launch()