Updated app code

- Dockerfile +0 -2
- app.py +43 -36
- chainlit.md +0 -10
- requirements.txt +4 -4
Dockerfile
CHANGED
@@ -2,7 +2,6 @@ FROM python:3.9
 
 RUN pip install --upgrade pip
 
-# Create a user and set up the environment
 RUN useradd -m -u 1000 user
 USER user
 
@@ -11,7 +10,6 @@ ENV HOME=/home/user \
 
 WORKDIR $HOME/app
 
-# Add this line to copy the data directory
 COPY ./data /home/user/app/data
 
 # Copy only requirements.txt first to leverage Docker cache
app.py
CHANGED
@@ -1,42 +1,39 @@
 #-----Import Required Libraries-----#
 import os
-
-
+import chainlit as cl
+import tiktoken
 import openai
-import fitz
+import fitz
 import pandas as pd
+from dotenv import load_dotenv
 from transformers import pipeline
 from qdrant_client import QdrantClient
 from qdrant_client.http import models as qdrant_models
-import chainlit as cl
-import tiktoken
-
-# Specific imports from the libraries
 from langchain.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import OpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import Qdrant
 from langchain.prompts import ChatPromptTemplate
-from langchain.chat_models import ChatOpenAI
+from langchain.chat_models import ChatOpenAI
 from operator import itemgetter
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 
-
+# Set environment variables
 load_dotenv()
 
 # Load environment variables
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
-# Initialize OpenAI
+# Initialize OpenAI
 openai.api_key = OPENAI_API_KEY
 
-
+# Load embedding model
+embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+
 loader = PyMuPDFLoader("./data/Airbnb-10k.pdf")
 documents = loader.load()
 
-#Note: I changed the loader file path from one that worked locally only to one that worked with Docker. The old file path is loader = PyMuPDFLoader("/Users/sampazar/AIE3-Midterm/data/airbnb_q1_2024.pdf")
-
 def tiktoken_len(text):
     tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
     return len(tokens)
@@ -47,37 +44,43 @@ text_splitter = RecursiveCharacterTextSplitter(
     length_function = tiktoken_len
 )
 
-
-
-#-----Embedding and Vector Store Setup-----#
-
-# Load OpenAI Embeddings Model
-embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+split_documents = text_splitter.split_documents(documents)
 
 # Creating a Qdrant Vector Store
 qdrant_vector_store = Qdrant.from_documents(
-    documents,
+    split_documents,
     embeddings,
     location=":memory:",
-    collection_name="
+    collection_name="Airbnb-10k",
 )
 
 # Create a Retriever
 retriever = qdrant_vector_store.as_retriever()
 
-
-
-
+# -- AUGMENTED -- #
+"""
+1. Define a String Template
+2. Create a Prompt Template from the String Template
+"""
+### 1. DEFINE STRING TEMPLATE
+RAG_PROMPT_TEMPLATE = """\
+<|start_header_id|>system<|end_header_id|>
+You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context,\
+say you don't know.<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+User Query:
+{query}
 Context:
-{context}
-
-{question}
+{context}<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
 """
+# Note that we do not have the response here. We have the assistant header; we ONLY start it, and it is not followed by <|eot_id|> because we do not have a response YET.
+
+rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
 
-prompt = ChatPromptTemplate.from_template(template)
 
-# Define the
-
+# Define the LLM
+llm = ChatOpenAI(model_name="gpt-4o")
 
 #-----Creating a Retrieval Augmented Generation (RAG) Chain-----#
 # The RAG chain:
@@ -96,20 +99,24 @@ retrieval_augmented_qa_chain = (
 # "response" : the "context" and "question" values are used to format our prompt object and then piped
 # into the LLM and stored in a key called "response"
 # "context" : populated by getting the value of the "context" key from the previous step
-    | {"response":
+    | {"response": rag_prompt | llm, "context": itemgetter("context")}
 )
 
-#-----Chainlit Integration-----#
 # Sets initial chat settings at the start of a user session
 @cl.on_chat_start
 async def start_chat():
+    """
+    This function will be called at the start of every user session.
+    We will build our LCEL RAG chain here, and store it in the user session.
+    The user session is a dictionary that is unique to each user session, and is stored in the memory of the server.
+    """
     settings = {
         "model": "gpt-4o",
         "temperature": 0,
         "max_tokens": 500,
-        "top_p": 1,
         "frequency_penalty": 0,
-        "
+        "top_p": 1,
+
     }
     cl.user_session.set("settings", settings)
 
@@ -127,6 +134,6 @@ async def handle_message(message: cl.Message):
 
     # Extracting and sending just the content
     content = response["response"].content
-    pretty_content = content.strip()
+    pretty_content = content.strip()
 
     await cl.Message(content=pretty_content).send()
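For readers following the app.py hunks above, here is a minimal sketch of how the pieces shown there plausibly fit together. Only the last line of the chain and the tail of the message handler appear in this commit, so the first two chain steps, the `handle_message` body, and the `"query"` input key are assumptions inferred from the `{query}` placeholder in `RAG_PROMPT_TEMPLATE`, not the repository's actual code.

```python
# Hypothetical sketch only -- steps not shown in the diff are assumptions.
from operator import itemgetter

import chainlit as cl
from langchain.schema.runnable import RunnablePassthrough

# retriever, rag_prompt, and llm are the objects defined earlier in app.py.

retrieval_augmented_qa_chain = (
    # Retrieve context for the incoming query and pass the query through unchanged.
    {"context": itemgetter("query") | retriever, "query": itemgetter("query")}
    # Keep the retrieved context available for the final output dictionary.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Format the prompt, call the LLM, and return both the response and the context used.
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)


@cl.on_message
async def handle_message(message: cl.Message):
    # Invoke the chain with the user's message as the query.
    response = retrieval_augmented_qa_chain.invoke({"query": message.content})

    # Extracting and sending just the content (these lines do appear in the diff).
    content = response["response"].content
    pretty_content = content.strip()

    await cl.Message(content=pretty_content).send()
```

The final dictionary step matches the line shown in the hunk (`| {"response": rag_prompt | llm, "context": itemgetter("context")}`), which is why the handler reads `response["response"].content` rather than a plain string.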
chainlit.md
CHANGED
@@ -4,16 +4,6 @@ Welcome to the Airbnb 10k 2024 RAG application!
 
 This RAG (retrieval-augmented generation) application allows you to query the Airbnb 10k 2024 filing dataset. It utilizes a generalized LLM and uses RAG techniques to retrieve and respond to user queries specific to knowledge of the Airbnb 10k 2024 filing dataset.
 
-Build 🏗️
-
-Data: Airbnb 10-k Filings from Q1, 2024
-LLM: OpenAI
-Embedding Model: OpenAI Embeddings (model="text-embedding-3-small")
-Infrastructure: LangChain
-Vector Store: QDrant
-Deployment: Chainlit, Hugging Face
-
-Ship 🚢
 
 Evaluate your answers to the following questions
 Q1 "What is Airbnb's 'Description of Business'?"
requirements.txt
CHANGED
@@ -3,10 +3,10 @@ langchain==0.2.5
 langchain_community==0.2.5
 langchain_core==0.2.9
 langchain_text_splitters==0.2.1
+PyMuPDF==1.24.5
 python-dotenv==1.0.1
-openai==1.35.3
-qdrant-client==1.9.2
-PyMuPDF==1.24.5 #Be sure to use the latest version 'pip show pymupdf'
-tiktoken==0.7.0
+openai==1.35.3
+qdrant-client==1.9.2
 transformers==4.37.0
 pandas==2.0.3
+tiktoken==0.7.0