# -*- coding: utf-8 -*-
"""Untitled8.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
"""

import os

import gradio as gr
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS  # FAISS lives in the langchain-community package
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Step 0: Retrieve the OpenAI API key (stored as the Space secret "tauhid")
openai_api_key = os.getenv("tauhid")
print(f"API key retrieved: {'[NOT FOUND]' if not openai_api_key else '[FOUND - first 4 chars: ' + openai_api_key[:4] + ']'}")
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found. Add the 'tauhid' secret to the Space settings.")

# Explicitly set the environment variable so downstream LangChain clients pick it up
os.environ["OPENAI_API_KEY"] = openai_api_key

# Step 1: Load the System Prompt
prompt_path = "system_prompt.txt"  # Ensure this file is in the same directory
if not os.path.exists(prompt_path):
    raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")

with open(prompt_path, "r") as file:
    system_prompt = file.read()

# Step 2: Load the Retrieval Database
csv_path = "retrievaldb.csv"  # Ensure this file is in the same directory
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")

df = pd.read_csv(csv_path)

# Step 3: Preprocess the Data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []

# Split each row's text into chunks and attach the row's metadata to every chunk
for _, row in df.iterrows():
    chunk_text = row.get("chunk_text", "")
    if pd.notna(chunk_text):
        for chunk in text_splitter.split_text(chunk_text):
            texts.append(chunk)
            metadatas.append({
                "source": row.get("content_source", "Unknown Source"),
                "title": row.get("document_name", "Unknown Document"),
                "page": row.get("page_number", "N/A"),
                "topic": row.get("main_topic", "N/A"),
                "week": row.get("metadata", "N/A"),
            })

if len(texts) != len(metadatas):
    raise ValueError("Mismatch between texts and metadata after preprocessing.")

# Step 4: Create the Vector Store
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)

# Step 5: Initialize the LLM
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key
)

# Step 6: Set Up the RetrievalQA Chain
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Concatenate retrieved chunks into the prompt context
    retriever=retriever,
    return_source_documents=False  # Do not include source documents in the response
)

# Step 7: Define Query Function
def query_bradgpt(user_input):
    # Prepend the system prompt to every query
    full_prompt = f"""
    {system_prompt}

    User: {user_input}
    Assistant:
    """
    response = qa_chain.invoke({"query": full_prompt})
    return response["result"]  # Return the main answer only


# Step 8: Gradio Interface
def respond(message):
    return query_bradgpt(message)


demo = gr.Interface(
    fn=respond,
    inputs=gr.Textbox(
        label="Your question",
        placeholder="Ask BradGPT anything about CPSC 183!",
        lines=3
    ),
    outputs=gr.Textbox(
        label="Response",
        lines=10
    ),
    title="BradGPT",
    description="Ask BradGPT questions about CPSC 183 course readings or topics.",
    theme="monochrome"
)

if __name__ == "__main__":
    demo.launch()
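
# --- Optional: quick smoke test without the Gradio UI ---
# A minimal sketch (assumption: run interactively in a notebook cell or REPL after the
# code above has executed); the question text below is purely illustrative.
#
#     answer = query_bradgpt("Summarize the main argument of this week's reading.")
#     print(answer)
#
# This exercises the full retrieval path (FAISS retriever -> RetrievalQA -> gpt-4o-mini)
# and helps verify the API key and vector store before launching the app.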