Updated app code

- Dockerfile +0 -2
- app.py +43 -36
- chainlit.md +0 -10
- requirements.txt +4 -4
Dockerfile
CHANGED
@@ -2,7 +2,6 @@ FROM python:3.9
 
 RUN pip install --upgrade pip
 
-# Create a user and set up the environment
 RUN useradd -m -u 1000 user
 USER user
 
@@ -11,7 +10,6 @@ ENV HOME=/home/user \
 
 WORKDIR $HOME/app
 
-# Add this line to copy the data directory
 COPY ./data /home/user/app/data
 
 # Copy only requirements.txt first to leverage Docker cache
app.py
CHANGED
@@ -1,42 +1,39 @@
 #-----Import Required Libraries-----#
 import os
-
-
+import chainlit as cl
+import tiktoken
 import openai
-import fitz
+import fitz
 import pandas as pd
+from dotenv import load_dotenv
 from transformers import pipeline
 from qdrant_client import QdrantClient
 from qdrant_client.http import models as qdrant_models
-import chainlit as cl
-import tiktoken
-
-# Specific imports from the libraries
 from langchain.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import OpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import Qdrant
 from langchain.prompts import ChatPromptTemplate
-from langchain.chat_models import ChatOpenAI
+from langchain.chat_models import ChatOpenAI
 from operator import itemgetter
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 
-
+# Set environment variables
 load_dotenv()
 
 # Load environment variables
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
-# Initialize OpenAI
+# Initialize OpenAI
 openai.api_key = OPENAI_API_KEY
 
-
+# Load embedding model
+embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+
 loader = PyMuPDFLoader("./data/Airbnb-10k.pdf")
 documents = loader.load()
 
-#Note: I changed the loader file path from one that worked locally only to one that worked with Docker. The old file path is loader = PyMuPDFLoader("/Users/sampazar/AIE3-Midterm/data/airbnb_q1_2024.pdf")
-
 def tiktoken_len(text):
     tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
     return len(tokens)
@@ -47,37 +44,43 @@ text_splitter = RecursiveCharacterTextSplitter(
     length_function = tiktoken_len
 )
 
-
-
-#-----Embedding and Vector Store Setup-----#
-
-# Load OpenAI Embeddings Model
-embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+split_documents = text_splitter.split_documents(documents)
 
 # Creating a Qdrant Vector Store
 qdrant_vector_store = Qdrant.from_documents(
-    documents,
+    split_documents,
     embeddings,
     location=":memory:",
-    collection_name="
+    collection_name="Airbnb-10k",
 )
 
 # Create a Retriever
 retriever = qdrant_vector_store.as_retriever()
 
-
-
-
+# -- AUGMENTED -- #
+"""
+1. Define a String Template
+2. Create a Prompt Template from the String Template
+"""
+### 1. DEFINE STRING TEMPLATE
+RAG_PROMPT_TEMPLATE = """\
+<|start_header_id|>system<|end_header_id|>
+You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context,\
+say you don't know.<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+User Query:
+{query}
 Context:
-{context}
-
-{question}
+{context}<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
 """
+# Note that we do not have the response here. We have the assistant header; we ONLY start it, and it is not followed by <|eot_id|> because we do not have a response YET.
+
+rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
 
-prompt = ChatPromptTemplate.from_template(template)
 
-# Define the
-
+# Define the LLM
+llm = ChatOpenAI(model_name="gpt-4o")
 
 #-----Creating a Retrieval Augmented Generation (RAG) Chain-----#
 # The RAG chain:
@@ -96,20 +99,24 @@ retrieval_augmented_qa_chain = (
 # "response" : the "context" and "question" values are used to format our prompt object and then piped
 # into the LLM and stored in a key called "response"
 # "context" : populated by getting the value of the "context" key from the previous step
-    | {"response":
+    | {"response": rag_prompt | llm, "context": itemgetter("context")}
 )
 
-#-----Chainlit Integration-----#
 # Sets initial chat settings at the start of a user session
 @cl.on_chat_start
 async def start_chat():
+    """
+    This function will be called at the start of every user session.
+    We will build our LCEL RAG chain here, and store it in the user session.
+    The user session is a dictionary that is unique to each user session, and is stored in the memory of the server.
+    """
     settings = {
         "model": "gpt-4o",
         "temperature": 0,
         "max_tokens": 500,
-        "top_p": 1,
         "frequency_penalty": 0,
-        "
+        "top_p": 1,
+
     }
     cl.user_session.set("settings", settings)
 
@@ -127,6 +134,6 @@ async def handle_message(message: cl.Message):
 
     # Extracting and sending just the content
     content = response["response"].content
-    pretty_content = content.strip()
+    pretty_content = content.strip()
 
     await cl.Message(content=pretty_content).send()
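For readers following the app.py hunks above, here is a minimal sketch of how the pieces shown there plausibly fit together. Only the last line of the chain and the tail of the message handler appear in this commit, so the first two chain steps, the `handle_message` body, and the `"query"` input key are assumptions inferred from the `{query}` placeholder in `RAG_PROMPT_TEMPLATE`, not the repository's actual code.

```python
# Hypothetical sketch only -- steps not shown in the diff are assumptions.
from operator import itemgetter

import chainlit as cl
from langchain.schema.runnable import RunnablePassthrough

# retriever, rag_prompt, and llm are the objects defined earlier in app.py.

retrieval_augmented_qa_chain = (
    # Retrieve context for the incoming query and pass the query through unchanged.
    {"context": itemgetter("query") | retriever, "query": itemgetter("query")}
    # Keep the retrieved context available for the final output dictionary.
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Format the prompt, call the LLM, and return both the response and the context used.
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)


@cl.on_message
async def handle_message(message: cl.Message):
    # Invoke the chain with the user's message as the query.
    response = retrieval_augmented_qa_chain.invoke({"query": message.content})

    # Extracting and sending just the content (these lines do appear in the diff).
    content = response["response"].content
    pretty_content = content.strip()

    await cl.Message(content=pretty_content).send()
```

The final dictionary step matches the line shown in the hunk (`| {"response": rag_prompt | llm, "context": itemgetter("context")}`), which is why the handler reads `response["response"].content` rather than a plain string.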
chainlit.md
CHANGED
@@ -4,16 +4,6 @@ Welcome to the Airbnb 10k 2024 RAG application!
 
 This RAG (retrieval-augmented generation) application allows you to query the Airbnb 10k 2024 filing dataset. It utilizes a generalized LLM and uses RAG techniques to retrieve and respond to user queries specific to knowledge of the Airbnb 10k 2024 filing dataset.
 
-Build 🏗️
-
-Data: Airbnb 10-k Filings from Q1, 2024
-LLM: OpenAI
-Embedding Model: OpenAI Embeddings (model="text-embedding-3-small")
-Infrastructure: LangChain
-Vector Store: QDrant
-Deployment: Chainlit, Hugging Face
-
-Ship 🚢
 
 Evaluate your answers to the following questions
 Q1 "What is Airbnb's 'Description of Business'?"
requirements.txt
CHANGED
@@ -3,10 +3,10 @@ langchain==0.2.5
 langchain_community==0.2.5
 langchain_core==0.2.9
 langchain_text_splitters==0.2.1
+PyMuPDF==1.24.5
 python-dotenv==1.0.1
-openai==1.35.3
-qdrant-client==1.9.2
-PyMuPDF==1.24.5 #Be sure to use the latest version 'pip show pymupdf'
-tiktoken==0.7.0
+openai==1.35.3
+qdrant-client==1.9.2
 transformers==4.37.0
 pandas==2.0.3
+tiktoken==0.7.0