# -*- coding: utf-8 -*-
"""Untitled8.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
"""
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr
# Step 0: Retrieve the OpenAI API key (stored as the Space secret "tauhid")
openai_api_key = os.getenv("tauhid")
print(f"API key retrieved: {'[NOT FOUND]' if not openai_api_key else '[FOUND - first 4 chars: ' + openai_api_key[:4] + ']'}")
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found. Add it as a Space secret named 'tauhid'.")
# Explicitly set the environment variable so downstream LangChain components can pick it up
os.environ["OPENAI_API_KEY"] = openai_api_key
# Step 1: Load the System Prompt
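# The prompt file holds BradGPT's instructions; it is prepended to every user query in query_bradgpt() below.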
prompt_path = "system_prompt.txt" # Ensure this file is in the same directory
if not os.path.exists(prompt_path):
raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")
with open(prompt_path, "r", encoding="utf-8") as file:
system_prompt = file.read()
# Step 2: Load the Retrieval Database
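# retrievaldb.csv is expected to hold one chunk of course material per row, using the column names read in Step 3.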
csv_path = "retrievaldb.csv" # Ensure this file is in the same directory
if not os.path.exists(csv_path):
raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")
# Load the CSV
df = pd.read_csv(csv_path)
# Step 3: Preprocess the Data
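# Split each row's text into ~1000-character chunks with 100 characters of overlap so related context isn't cut off at chunk boundaries.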
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []
# Process each row to chunk text and attach metadata
for _, row in df.iterrows():
chunk_text = row.get("chunk_text", "")
if pd.notna(chunk_text):
chunks = text_splitter.split_text(chunk_text)
for chunk in chunks:
texts.append(chunk)
metadatas.append({
"source": row.get("content_source", "Unknown Source"),
"title": row.get("document_name", "Unknown Document"),
"page": row.get("page_number", "N/A"),
"topic": row.get("main_topic", "N/A"),
"week": row.get("metadata", "N/A")
})
if len(texts) != len(metadatas):
raise ValueError("Mismatch between texts and metadata after preprocessing.")
# Step 4: Create the Vector Store
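# Embed every chunk with OpenAI embeddings and index the vectors in FAISS so the retriever can run similarity search over them.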
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_texts(
texts=texts,
embedding=embeddings,
metadatas=metadatas
)
# Step 5: Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key
)
# Step 6: Set Up the RetrievalQA Chain
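# For each query, the retriever pulls the 5 most similar chunks and the "stuff" chain packs them into the LLM's context window.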
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff", # Concatenates retrieved chunks for context
retriever=retriever,
return_source_documents=False # Do not include source documents in the response
)
# Step 7: Define Query Function
def query_bradgpt(user_input):
    # Prepend the system prompt so every query carries BradGPT's instructions
    full_prompt = f"""
{system_prompt}
User: {user_input}
Assistant:
"""
    response = qa_chain.invoke({"query": full_prompt})
    return response["result"]  # Return the main answer only
# Step 8: Gradio Interface
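# A simple single-turn interface: each submitted question is passed to respond(), which runs the RetrievalQA chain.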
def respond(message):
    return query_bradgpt(message)
demo = gr.Interface(
fn=respond,
inputs=gr.Textbox(
label="Your question",
placeholder="Ask BradGPT anything about CPSC 183!",
lines=3
),
outputs=gr.Textbox(
label="Response",
lines=10
),
title="BradGPT",
description="Ask BradGPT questions about CPSC 183 course readings or topics.",
theme="monochrome"
)
if __name__ == "__main__":
demo.launch()