# -*- coding: utf-8 -*-
"""Untitled8.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
"""
import os

import gradio as gr
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS  # FAISS moved out of langchain.vectorstores upstream
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# Step 1: Configure the OpenAI API key
# The key is stored as a Space secret named "tauhid".
openai_api_key = os.getenv("tauhid")
print(f"API key retrieved: {'[NOT FOUND]' if not openai_api_key else '[FOUND]'}")  # avoid printing key material in Space logs
if not openai_api_key:
    raise ValueError("OpenAI API key not found. Add it as a Space secret named 'tauhid'.")

# Export the key so the langchain-openai clients pick it up automatically
os.environ["OPENAI_API_KEY"] = openai_api_key

# Create the embeddings client once; it is reused when building the vector store below
embeddings = OpenAIEmbeddings()
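# Note: with no model= argument, OpenAIEmbeddings falls back to the library's
# default embedding model (text-embedding-ada-002 at the time of writing);
# pass model= explicitly if you need a specific one.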
# Step 2: Load the system prompt
prompt_path = "system_prompt.txt"  # must sit in the same directory as app.py
if not os.path.exists(prompt_path):
    raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")
with open(prompt_path, "r") as file:
    system_prompt = file.read()
# Step 3: Load the retrieval database
csv_path = "retrievaldb.csv"  # must sit in the same directory as app.py
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")
df = pd.read_csv(csv_path)
# Step 4: Preprocess the data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []

# Chunk each row's text and attach that row's metadata to every chunk
for _, row in df.iterrows():
    chunk_text = row.get("chunk_text", "")
    if pd.notna(chunk_text):
        chunks = text_splitter.split_text(chunk_text)
        for chunk in chunks:
            texts.append(chunk)
            metadatas.append({
                "source": row.get("content_source", "Unknown Source"),
                "title": row.get("document_name", "Unknown Document"),
                "page": row.get("page_number", "N/A"),
                "topic": row.get("main_topic", "N/A"),
                "week": row.get("metadata", "N/A"),
            })

if len(texts) != len(metadatas):
    raise ValueError("Mismatch between texts and metadata after preprocessing.")
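# Each entry in texts now lines up 1:1 with its metadata dict, which is the
# shape FAISS.from_texts expects for its metadatas= argument below.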
# Step 5: Build the FAISS vector store, reusing the embeddings client from Step 1
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
)
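# Note: FAISS.from_texts builds the index in memory, so the Space re-embeds the
# whole CSV on every restart. A minimal persistence sketch, assuming the Space
# filesystem is writable and "faiss_index" is an illustrative folder name:
#
#     vector_store.save_local("faiss_index")
#     vector_store = FAISS.load_local(
#         "faiss_index", embeddings, allow_dangerous_deserialization=True
#     )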
# Step 6: Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key,
)
# Step 7: Set up the RetrievalQA chain
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" concatenates all retrieved chunks into a single context
    retriever=retriever,
    return_source_documents=False,  # return only the answer, not the source documents
)
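# Sizing note: with k=5 and 1000-character chunks, each query stuffs roughly
# 5,000 characters of retrieved context into the prompt; raising k widens
# recall at the cost of more tokens per call.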
# Step 8: Define the query function
def query_bradtgpt(user_input):
    # Prepend the system prompt to every query so the chain sees it as context
    full_prompt = f"""
{system_prompt}

User: {user_input}
Assistant:
"""
    response = qa_chain.invoke({"query": full_prompt})  # .invoke replaces the deprecated chain __call__
    return response["result"]  # return the main answer only
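# Quick smoke test (illustrative question, not taken from the course data;
# uncomment to run once before wiring up the UI):
# print(query_bradtgpt("What readings are assigned this week?"))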
# Step 9: Gradio interface
def respond(message):
    return query_bradtgpt(message)

demo = gr.Interface(
    fn=respond,
    inputs=gr.Textbox(
        label="Your question",
        placeholder="Ask BradGPT anything about CPSC 183!",
        lines=3,
    ),
    outputs=gr.Textbox(
        label="Response",
        lines=10,
    ),
    title="BradGPT",
    description="Ask BradGPT questions about CPSC 183 course readings or topics.",
    theme="monochrome",
)
if __name__ == "__main__":
    demo.launch()