norjala committed on
Commit be8a991 • 1 Parent(s): 79f1ae1

Updated app code

Files changed (4)
  1. Dockerfile +0 -2
  2. app.py +43 -36
  3. chainlit.md +0 -10
  4. requirements.txt +4 -4
Dockerfile CHANGED
@@ -2,7 +2,6 @@ FROM python:3.9
 
 RUN pip install --upgrade pip
 
-# Create a user and set up the environment
 RUN useradd -m -u 1000 user
 USER user
 
@@ -11,7 +10,6 @@ ENV HOME=/home/user \
 
 WORKDIR $HOME/app
 
-# Add this line to copy the data directory
 COPY ./data /home/user/app/data
 
 # Copy only requirements.txt first to leverage Docker cache
 
app.py CHANGED
@@ -1,42 +1,39 @@
 #-----Import Required Libraries-----#
 import os
-from dotenv import load_dotenv
-
+import chainlit as cl
+import tiktoken
 import openai
-import fitz # PyMuPDF
+import fitz
 import pandas as pd
+from dotenv import load_dotenv
 from transformers import pipeline
 from qdrant_client import QdrantClient
 from qdrant_client.http import models as qdrant_models
-import chainlit as cl
-import tiktoken
-
-# Specific imports from the libraries
 from langchain.document_loaders import PyMuPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings import OpenAIEmbeddings #Note: Old import was - from langchain_openai import OpenAIEmbeddings
+from langchain.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import Qdrant
 from langchain.prompts import ChatPromptTemplate
-from langchain.chat_models import ChatOpenAI #Note: Old import was - from langchain_openai import ChatOpenAI
+from langchain.chat_models import ChatOpenAI
 from operator import itemgetter
 from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
 
-#-----Set Environment Variables-----#
+# Set environment variables
 load_dotenv()
 
 # Load environment variables
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 
-# Initialize OpenAI client after loading the environment variables
+# Initialize OpenAI
 openai.api_key = OPENAI_API_KEY
 
-#-----Document Loading and Processing -----#
+# Load embedding model
+embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+
 loader = PyMuPDFLoader("./data/Airbnb-10k.pdf")
 documents = loader.load()
 
-#Note: I changed the loader file path from one that worked locally only to one that worked with Docker. The old file path is loader = PyMuPDFLoader("/Users/sampazar/AIE3-Midterm/data/airbnb_q1_2024.pdf")
-
 def tiktoken_len(text):
     tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
     return len(tokens)
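
The hunk above keeps `tiktoken_len` as the splitter's length function, but the `chunk_size` and `chunk_overlap` arguments sit outside the diff context. A minimal sketch of how the splitter is plausibly wired up; the chunk values here are hypothetical placeholders, not the committed ones:

```python
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text):
    # Measure chunk length in model tokens rather than characters
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,       # hypothetical: max tokens per chunk
    chunk_overlap=50,     # hypothetical: tokens shared between adjacent chunks
    length_function=tiktoken_len,
)
```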
@@ -47,37 +44,43 @@ text_splitter = RecursiveCharacterTextSplitter(
     length_function = tiktoken_len
 )
 
-split_chunks = text_splitter.split_documents(documents)
-
-#-----Embedding and Vector Store Setup-----#
-
-# Load OpenAI Embeddings Model
-embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
+split_documents = text_splitter.split_documents(documents)
 
 # Creating a Qdrant Vector Store
 qdrant_vector_store = Qdrant.from_documents(
-    split_chunks,
+    split_documents,
     embeddings,
     location=":memory:",
-    collection_name="Airbnb_Q1_2024",
+    collection_name="Airbnb-10k",
 )
 
 # Create a Retriever
 retriever = qdrant_vector_store.as_retriever()
 
-#-----Prompt Template and Language Model Setup-----#
-# Define the prompt template
-template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':
+# -- AUGMENTED -- #
+"""
+1. Define a String Template
+2. Create a Prompt Template from the String Template
+"""
+### 1. DEFINE STRING TEMPLATE
+RAG_PROMPT_TEMPLATE = """\
+<|start_header_id|>system<|end_header_id|>
+You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context,\
+say you don't know.<|eot_id|>
+<|start_header_id|>user<|end_header_id|>
+User Query:
+{query}
 Context:
-{context}
-Question:
-{question}
+{context}<|eot_id|>
+<|start_header_id|>assistant<|end_header_id|>
 """
+# Note: there is no response in the template yet. The assistant header is opened but deliberately not followed by <|eot_id|>, because the model has not produced a response.
+
+rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
 
-prompt = ChatPromptTemplate.from_template(template)
 
-# Define the primary LLM
-primary_llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
+# Define the LLM
+llm = ChatOpenAI(model_name="gpt-4o")
 
 #-----Creating a Retrieval Augmented Generation (RAG) Chain-----#
 # The RAG chain:
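
The new `RAG_PROMPT_TEMPLATE` exposes two input variables, `{query}` and `{context}`. A quick sketch of rendering `rag_prompt` outside the chain, using one of the evaluation questions from chainlit.md as the query; the stub context stands in for retrieved chunks:

```python
# Render the template directly to inspect the final prompt string
messages = rag_prompt.format_messages(
    query="What is Airbnb's 'Description of Business'?",
    context="(chunks returned by the retriever go here)",
)
print(messages[0].content)  # full prompt, header tokens included
```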
@@ -96,20 +99,24 @@ retrieval_augmented_qa_chain = (
     # "response" : the "context" and "question" values are used to format our prompt object and then piped
     # into the LLM and stored in a key called "response"
     # "context" : populated by getting the value of the "context" key from the previous step
-    | {"response": prompt | primary_llm, "context": itemgetter("context")}
+    | {"response": rag_prompt | llm, "context": itemgetter("context")}
 )
 
-#-----Chainlit Integration-----#
 # Sets initial chat settings at the start of a user session
 @cl.on_chat_start
 async def start_chat():
+    """
+    Called at the start of every user session.
+    The chat settings are stored in the user session, a dictionary unique to
+    each session and held in server memory.
+    """
     settings = {
         "model": "gpt-4o",
         "temperature": 0,
         "max_tokens": 500,
-        "top_p": 1,
         "frequency_penalty": 0,
-        "presence_penalty": 0,
+        "top_p": 1,
+
     }
     cl.user_session.set("settings", settings)
 
@@ -127,6 +134,6 @@ async def handle_message(message: cl.Message):
 
     # Extracting and sending just the content
     content = response["response"].content
-    pretty_content = content.strip() # Remove any leading/trailing whitespace
+    pretty_content = content.strip()
 
     await cl.Message(content=pretty_content).send()
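
Only the final element of `retrieval_augmented_qa_chain` sits inside the diff context. Based on the imports and the comments, a plausible reconstruction of the whole LCEL chain is sketched below; note the comments still refer to a "question" key, while the new template's variable is `{query}`, so this sketch assumes the input key was renamed to match. It is a sketch, not the committed code:

```python
from operator import itemgetter
from langchain.schema.runnable import RunnablePassthrough

# Assumes retriever, rag_prompt, and llm are the objects defined earlier in app.py.
retrieval_augmented_qa_chain = (
    # "query" is routed to the retriever; the retrieved documents become "context"
    {"context": itemgetter("query") | retriever, "query": itemgetter("query")}
    # keep the retrieved "context" available to the steps that follow
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # format the prompt, call the LLM, and return the context alongside the response
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

# Hypothetical invocation:
# result = retrieval_augmented_qa_chain.invoke(
#     {"query": "What is Airbnb's 'Description of Business'?"})
# print(result["response"].content)
```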
 
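The final hunk shows only the tail of `handle_message`; the lines that invoke the chain fall outside the diff context. A hedged sketch of the full handler, assuming the chain is the module-level object above and is called with a `query` key (only the last few lines below actually appear in the commit):

```python
import chainlit as cl

@cl.on_message
async def handle_message(message: cl.Message):
    # Run the RAG chain on the incoming user message (input key is an assumption)
    response = await retrieval_augmented_qa_chain.ainvoke({"query": message.content})

    # Extracting and sending just the content
    content = response["response"].content
    pretty_content = content.strip()

    await cl.Message(content=pretty_content).send()
```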
chainlit.md CHANGED
@@ -4,16 +4,6 @@ Welcome to the Airbnb 10k 2024 RAG application!
 
 This RAG (retrieval-augmented generation) application allows you to query the Airbnb 10k 2024 filing dataset. It pairs a general-purpose LLM with RAG techniques to retrieve relevant passages and answer user queries using knowledge specific to the Airbnb 10k 2024 filing.
 
-Build 🏗️
-
-Data: Airbnb 10-k Filings from Q1, 2024
-LLM: OpenAI
-Embedding Model: OpenAI Embeddings (model="text-embedding-3-small")
-Infrastructure: LangChain
-Vector Store: QDrant
-Deployment: Chainlit, Hugging Face
-
-Ship 🚢
 
 Evaluate your answers to the following questions
 Q1 "What is Airbnb's 'Description of Business'?"
 
requirements.txt CHANGED
@@ -3,10 +3,10 @@ langchain==0.2.5
 langchain_community==0.2.5
 langchain_core==0.2.9
 langchain_text_splitters==0.2.1
+PyMuPDF==1.24.5
 python-dotenv==1.0.1
-openai==1.35.3 #Be sure to use the latest version 'pip show openai'
-qdrant-client==1.9.2 #Be sure to use the latest version 'pip show qdrant-client'
-PyMuPDF==1.24.5 #Be sure to use the latest version 'pip show pymupdf'
-tiktoken==0.7.0
+openai==1.35.3
+qdrant-client==1.9.2
 transformers==4.37.0
 pandas==2.0.3
+tiktoken==0.7.0
 
 