Updated code
Files changed:
- Dockerfile (+20 -6)
- app.py (+97 -109)
- chainlit.md (+1 -1)
- requirements.txt (+8 -10)
Dockerfile
CHANGED
@@ -1,13 +1,27 @@
 FROM python:3.9
+
+RUN pip install --upgrade pip
+
+# Create a user and set up the environment
 RUN useradd -m -u 1000 user
 USER user
+
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH
+
 WORKDIR $HOME/app
+
+# Add this line to copy the data directory
+COPY ./data /home/user/app/data
+
+# Copy only requirements.txt first to leverage Docker cache
+COPY --chown=user requirements.txt $HOME/app/requirements.txt
+
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
 COPY --chown=user . $HOME/app
-
-
-
-RUN mkdir -p $HOME/app/data/vectorstore && chown -R user:user $HOME/app/data
-COPY . .
-CMD ["chainlit", "run", "app.py", "--port", "7860"]
+
+# Run the application
+CMD ["chainlit", "run", "app.py", "--port", "7860"]
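Note: copying requirements.txt and installing dependencies before copying the rest of the source means Docker's layer cache skips the pip install step unless requirements.txt itself changes, and the separate COPY ./data step bakes the PDF into the image at the path app.py expects. A minimal sanity check for that path (a hypothetical helper, not part of this Space) could be run inside the container:

# check_data.py -- hypothetical helper, not part of this Space.
# Confirms the file baked in by `COPY ./data /home/user/app/data`
# sits where app.py expects it (./data/Airbnb-10k.pdf, relative to
# WORKDIR $HOME/app).
import os

pdf_path = os.path.join("data", "Airbnb-10k.pdf")
if os.path.isfile(pdf_path):
    print(f"OK: {pdf_path} ({os.path.getsize(pdf_path)} bytes)")
else:
    print(f"MISSING: {pdf_path} -- check the COPY ./data step")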
app.py
CHANGED
@@ -1,144 +1,132 @@
+#-----Import Required Libraries-----#
 import os
-
+from dotenv import load_dotenv
+
 import openai
+import fitz  # PyMuPDF
+import pandas as pd
+from transformers import pipeline
+from qdrant_client import QdrantClient
+from qdrant_client.http import models as qdrant_models
+import chainlit as cl
 import tiktoken
-
+
+# Specific imports from the libraries
+from langchain.document_loaders import PyMuPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import OpenAIEmbeddings  # Note: old import was - from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Qdrant
+from langchain.prompts import ChatPromptTemplate
+from langchain.chat_models import ChatOpenAI  # Note: old import was - from langchain_openai import ChatOpenAI
 from operator import itemgetter
-from
-from
-
-
-from langchain_core.prompts import PromptTemplate
-from langchain_core.runnables import RunnableConfig, RunnablePassthrough
-from langchain_openai import ChatOpenAI
-
-# Load environment variables from .env file
+from langchain.schema.output_parser import StrOutputParser
+from langchain.schema.runnable import RunnablePassthrough
+
+#-----Set Environment Variables-----#
 load_dotenv()
 
-#
-
-
-
+# Load environment variables
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+# Initialize OpenAI client after loading the environment variables
+openai.api_key = OPENAI_API_KEY
 
-
-
+#-----Document Loading and Processing-----#
+loader = PyMuPDFLoader("./data/Airbnb-10k.pdf")
+documents = loader.load()
 
-#
-document_loader = PyMuPDFLoader("./data/Airbnb-10k.pdf")
-documents = document_loader.load()
+# Note: I changed the loader file path from one that worked locally only to one that works with Docker. The old file path was loader = PyMuPDFLoader("/Users/sampazar/AIE3-Midterm/data/airbnb_q1_2024.pdf")
 
 def tiktoken_len(text):
     tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
     return len(tokens)
 
-
-
-
-
-
-
-
-# Create or load vector store
-if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
-    print("Loading existing vectorstore from disk.")
-    vectorstore = FAISS.load_local(
-        VECTOR_STORE_PATH,
-        openai_embeddings,
-        allow_dangerous_deserialization=True
-    )
-    retriever = vectorstore.as_retriever()
-    print("Loaded Vectorstore")
-else:
-    print("Indexing Files")
-    os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
-    vectorstore = FAISS.from_documents(split_documents[:32], openai_embeddings)
-    for i in range(32, len(split_documents), 32):
-        vectorstore.add_documents(split_documents[i:i+32])
-    vectorstore.save_local(VECTOR_STORE_PATH)
-    print("Vectorstore created and documents indexed.")
-
-# Create retriever
-retriever = vectorstore.as_retriever()
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=500,
+    chunk_overlap=100,
+    length_function=tiktoken_len
+)
+
+split_chunks = text_splitter.split_documents(documents)
 
-
-RAG_PROMPT_TEMPLATE = """\
-system
-You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context, say you don't know.
+#-----Embedding and Vector Store Setup-----#
 
-
-
-{query}
+# Load OpenAI Embeddings Model
+embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
+# Creating a Qdrant Vector Store
+qdrant_vector_store = Qdrant.from_documents(
+    split_chunks,
+    embeddings,
+    location=":memory:",
+    collection_name="Airbnb_Q1_2024",
+)
+
+# Create a Retriever
+retriever = qdrant_vector_store.as_retriever()
+
+#-----Prompt Template and Language Model Setup-----#
+# Define the prompt template
+template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':
 Context:
 {context}
-
-
+Question:
+{question}
 """
 
-
+prompt = ChatPromptTemplate.from_template(template)
+
+# Define the primary LLM
+primary_llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
 
-
-
+#-----Creating a Retrieval Augmented Generation (RAG) Chain-----#
+# The RAG chain:
+# (1) Takes the user question and retrieves relevant context,
+# (2) Passes the context through unchanged,
+# (3) Formats the prompt with context and question, then sends it to the LLM to generate a response.
 
 retrieval_augmented_qa_chain = (
+    # INVOKE CHAIN WITH: {"question": "<>"}
+    # "question": populated by getting the value of the "question" key
+    # "context": populated by getting the value of the "question" key and chaining it into the base retriever
     {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
+    # "context": is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
+    # by getting the value of the "context" key from the previous step
     | RunnablePassthrough.assign(context=itemgetter("context"))
-
+    # "response": the "context" and "question" values are used to format our prompt object and then piped
+    # into the LLM and stored in a key called "response"
+    # "context": populated by getting the value of the "context" key from the previous step
+    | {"response": prompt | primary_llm, "context": itemgetter("context")}
 )
 
-
-
+#-----Chainlit Integration-----#
+# Sets initial chat settings at the start of a user session
+@cl.on_chat_start
 async def start_chat():
-    """
-    This function will be called at the start of every user session.
-    We will build our LCEL RAG chain here and store it in the user session.
-    The user session is a dictionary that is unique to each user session and is stored in the memory of the server.
-    """
     settings = {
         "model": "gpt-4o",
         "temperature": 0,
-        "max_tokens":
+        "max_tokens": 500,
         "top_p": 1,
         "frequency_penalty": 0,
         "presence_penalty": 0,
     }
-
-
-
-
-
-
-
-
-
-
-
-""
-
-
-
-""
-
-
-
-        print("Using LCEL RAG chain to generate response...")
-
-        msg = cl.Message(content="")
-
-        async for chunk in lcel_rag_chain.astream(
-            {"query": message.content},
-            config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
-        ):
-            chunk_text = chunk.content if hasattr(chunk, 'content') else str(chunk)
-            print(f"Streaming chunk: {chunk_text}")
-            await msg.stream_token(chunk_text)
-
-        print("Sending final message...")
-        await msg.send()
-        print("Message sent.")
-    except KeyError as e:
-        print(f"Session error: {e}")
-        await message.send("Session error occurred. Please try again.")
-    except Exception as e:
-        print(f"Error: {e}")
-        await message.send("An error occurred. Please try again.")
+    cl.user_session.set("settings", settings)
+
+# Processes incoming messages from the user and sends a response through a series of steps:
+# (1) Retrieves the user's settings
+# (2) Invokes the RAG chain with the user's message
+# (3) Extracts the content from the response and sends it back to the user
+
+@cl.on_message
+async def handle_message(message: cl.Message):
+    settings = cl.user_session.get("settings")
+
+    response = retrieval_augmented_qa_chain.invoke({"question": message.content})
+
+
+    # Extracting and sending just the content
+    content = response["response"].content
+    pretty_content = content.strip()  # Remove any leading/trailing whitespace
+
+    await cl.Message(content=pretty_content).send()
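Note: outside Chainlit, the rewritten chain can be exercised directly. A minimal sketch, assuming the chain and retriever defined above are in scope (the question string is only an example): invoke() returns a dict whose "response" holds the model's message and whose "context" holds the retrieved chunks.

# Sketch only -- assumes app.py's definitions are importable or in scope.
result = retrieval_augmented_qa_chain.invoke(
    {"question": "What was Airbnb's revenue in Q1 2024?"}  # example input
)

print(result["response"].content)   # the generated answer
for doc in result["context"]:       # the Documents the answer was grounded in
    print(doc.metadata.get("page"), doc.page_content[:80])

This mirrors what handle_message does: it reads response["response"].content and sends only that text back to the user.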
chainlit.md
CHANGED
@@ -1,4 +1,4 @@
-
+## Airbnb 10k 2024 RAG Application
 
 Welcome to the Airbnb 10k 2024 RAG application!
 
requirements.txt
CHANGED
@@ -1,14 +1,12 @@
-chainlit
+chainlit==0.7.700
 langchain==0.2.5
 langchain_community==0.2.5
 langchain_core==0.2.9
-langchain_huggingface==0.0.3
 langchain_text_splitters==0.2.1
-
-
-
-
-
-
-
-gunicorn==20.1.0
+python-dotenv==1.0.1
+openai==1.35.3  # Be sure to use the latest version: 'pip show openai'
+qdrant-client==1.9.2  # Be sure to use the latest version: 'pip show qdrant-client'
+PyMuPDF==1.24.5  # Be sure to use the latest version: 'pip show pymupdf'
+tiktoken==0.7.0
+transformers==4.37.0
+pandas==2.0.3
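Note: the inline 'pip show' reminders can be checked in one pass. A small sketch, under the assumption that the distribution names below match the pins above:

# versions.py -- hypothetical helper to compare installed versions
# against the pins in requirements.txt.
from importlib.metadata import PackageNotFoundError, version

pins = ["chainlit", "langchain", "langchain-community", "langchain-core",
        "langchain-text-splitters", "python-dotenv", "openai",
        "qdrant-client", "PyMuPDF", "tiktoken", "transformers", "pandas"]

for name in pins:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")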