File size: 3,867 Bytes
bc94c3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2c8048
 
 
 
01dd91b
 
 
 
 
 
 
bc94c3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee69ae8
b2c8048
ee69ae8
bc94c3a
 
 
 
 
 
 
ee69ae8
bc94c3a
b0c1ef9
bc94c3a
 
 
 
 
 
b0c1ef9
 
 
 
bc94c3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c040b5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""Untitled8.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1krY-kSVbf8NSdFeA5eZ_1vvYGLuuSv7I
"""

import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import gradio as gr

# Step 5: Initialize the LLM
# The OpenAI API key is stored in the "tauhid" environment variable / Space secret.
openai_api_key = os.getenv("tauhid")
print(f"API key retrieved: {'[NOT FOUND]' if not openai_api_key else '[FOUND - first 4 chars: ' + openai_api_key[:4] + ']'}")

# Fail fast with an actionable message. Without this guard, a missing key
# makes `os.environ[...] = None` below raise an opaque TypeError.
if not openai_api_key:
    raise EnvironmentError(
        "The 'tauhid' secret/environment variable is not set. "
        "Add your OpenAI API key before running this app."
    )

# Add this line to explicitly set the environment variable so that every
# langchain component that reads OPENAI_API_KEY picks up the same key.
os.environ["OPENAI_API_KEY"] = openai_api_key

# Then create embeddings
embeddings = OpenAIEmbeddings()

# Step 1: Load the System Prompt
prompt_path = "system_prompt.txt"  # Ensure this file is in the same directory
if not os.path.exists(prompt_path):
    raise FileNotFoundError(f"The file '{prompt_path}' is missing. Please upload it to the Space.")

# Read explicitly as UTF-8: the default is the platform locale encoding
# (e.g. cp1252 on Windows), which could mangle non-ASCII prompt text.
with open(prompt_path, "r", encoding="utf-8") as file:
    system_prompt = file.read()

# Step 2: Load the Retrieval Database
csv_path = "retrievaldb.csv"  # Expected to sit alongside this script
csv_is_present = os.path.exists(csv_path)
if not csv_is_present:
    raise FileNotFoundError(f"The file '{csv_path}' is missing. Please upload it to the Space.")

# Parse the retrieval database into a DataFrame
df = pd.read_csv(csv_path)

# Step 3: Preprocess the Data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = []
metadatas = []

# Process each row: chunk the row's text and attach one metadata dict per chunk,
# keeping `texts` and `metadatas` aligned index-for-index.
for _, row in df.iterrows():
    chunk_text = row.get("chunk_text", "")
    # Skip NaN cells; coerce everything else to str so the splitter never
    # receives a non-string value (e.g. a purely numeric CSV cell).
    if pd.notna(chunk_text):
        for chunk in text_splitter.split_text(str(chunk_text)):
            texts.append(chunk)
            metadatas.append({
                "source": row.get("content_source", "Unknown Source"),
                "title": row.get("document_name", "Unknown Document"),
                "page": row.get("page_number", "N/A"),
                "topic": row.get("main_topic", "N/A"),
                "week": row.get("metadata", "N/A")
            })

# Sanity check: the vector store requires one metadata dict per text chunk.
if len(texts) != len(metadatas):
    raise ValueError("Mismatch between texts and metadata after preprocessing.")




# Step 4: Create the Vector Store
# Reuse the OpenAIEmbeddings instance created earlier instead of constructing
# a duplicate one — the second instantiation added nothing.
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)


# Initialize the LLM used to answer questions over the retrieved context.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.7,
    api_key=openai_api_key
)
# NOTE: a third OpenAIEmbeddings(...) was previously rebound here; it was
# never used (the vector store is already built above), so it was removed.


# Step 6: Set Up the RetrievalQA Chain
# Fetch the 5 most similar chunks from the vector store per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # Concatenates retrieved chunks for context
    return_source_documents=False  # Answer text only; no source docs in response
)

# Step 7: Define Query Function
def query_bradtgpt(user_input):
    """Answer one question via the RetrievalQA chain.

    Prepends the course system prompt to the user's message, runs the chain,
    and returns only the generated answer string.
    """
    # Built via concatenation; byte-identical to the original triple-quoted
    # f-string, including its leading indentation.
    full_prompt = (
        "\n    " + system_prompt
        + "\n\n    User: " + user_input
        + "\n    Assistant:\n    "
    )
    response = qa_chain({"query": full_prompt})
    # The chain returns a dict; "result" holds the answer text.
    return response["result"]

# Step 8: Gradio Interface
def respond(message):
    """Gradio callback: delegate the incoming message to the QA pipeline."""
    answer = query_bradtgpt(message)
    return answer

# Build the input/output widgets first, then assemble the interface.
question_box = gr.Textbox(
    label="Your question",
    placeholder="Ask BradGPT anything about CPSC 183!",
    lines=3
)
answer_box = gr.Textbox(
    label="Response",
    lines=10
)
demo = gr.Interface(
    fn=respond,
    inputs=question_box,
    outputs=answer_box,
    title="BradGPT",
    description="Ask BradGPT questions about CPSC 183 course readings or topics.",
    theme="monochrome"
)

# Launch the Gradio app only when this file is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    demo.launch()