norjala committed on
Commit
9f3be8d
•
1 Parent(s): f9c7571

Adding application file, chainlit.md, Dockerfile, README, requirements.txt, and data

Browse files
Files changed (7) hide show
  1. .gitignore +6 -0
  2. Dockerfile +20 -0
  3. README.md +5 -4
  4. app.py +144 -0
  5. chainlit.md +21 -0
  6. data/Airbnb-10k.pdf +0 -0
  7. requirements.txt +11 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ .chainlit
4
+ *.faiss
5
+ *.pkl
6
+ .files
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: official CPython 3.12.
FROM python:3.12

# Create and switch to a non-root user (UID 1000).
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Copy the application into the image, owned by the non-root user.
COPY --chown=user . $HOME/app
# NOTE: COPY does not expand "~"; a "~/app/..." destination would be created
# as a literal "~" directory relative to WORKDIR. Use $HOME instead.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
RUN pip list

# Create the directory and set permissions
USER root
RUN mkdir -p /home/user/app/data/vectorstore && \
    chown -R user:user /home/user/app/data && \
    chmod -R 777 /home/user/app/data
USER user

CMD ["chainlit", "run", "app.py", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,11 @@
1
  ---
2
- title: Airbnb 10k
3
- emoji: 😻
4
- colorFrom: red
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Airbnb 10k 2024
3
+ emoji: πŸ†
4
+ colorFrom: purple
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ license: openrail
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chainlit as cl
3
+ import openai
4
+ import tiktoken
5
+ from dotenv import load_dotenv
6
+ from operator import itemgetter
7
+ from langchain_community.document_loaders import PyMuPDFLoader
8
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from langchain_community.vectorstores import FAISS
10
+ from langchain_openai.embeddings import OpenAIEmbeddings
11
+ from langchain_core.prompts import PromptTemplate
12
+ from langchain_core.runnables import RunnableConfig, RunnablePassthrough
13
+ from langchain_openai import ChatOpenAI
14
+
15
+ # Load environment variables from .env file
16
+ load_dotenv()
17
+
18
+ # Environment variables
19
+ openai.api_key = os.environ.get("OPENAI_API_KEY")
20
+ if not openai.api_key:
21
+ raise ValueError("OPENAI_API_KEY environment variable not set")
22
+
23
+ # Set vector store path
24
+ VECTOR_STORE_PATH = "./data/vectorstore"
25
+
26
+ # Document loader
27
+ document_loader = PyMuPDFLoader("./data/Airbnb-10k.pdf")
28
+ documents = document_loader.load()
29
+
30
def tiktoken_len(text):
    """Return the number of tokens in ``text`` using the gpt-4o encoding."""
    encoder = tiktoken.encoding_for_model("gpt-4o")
    return len(encoder.encode(text))
33
+
34
+ # Load embeddings
35
+ openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
36
+
37
+ # Text splitter
38
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
39
+ split_documents = text_splitter.split_documents(documents)
40
+
41
+ # Create or load vector store
42
+ if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
43
+ print("Loading existing vectorstore from disk.")
44
+ vectorstore = FAISS.load_local(
45
+ VECTOR_STORE_PATH,
46
+ openai_embeddings,
47
+ allow_dangerous_deserialization=True
48
+ )
49
+ retriever = vectorstore.as_retriever()
50
+ print("Loaded Vectorstore")
51
+ else:
52
+ print("Indexing Files")
53
+ os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
54
+ vectorstore = FAISS.from_documents(split_documents[:32], openai_embeddings)
55
+ for i in range(32, len(split_documents), 32):
56
+ vectorstore.add_documents(split_documents[i:i+32])
57
+ vectorstore.save_local(VECTOR_STORE_PATH)
58
+ print("Vectorstore created and documents indexed.")
59
+
60
+ # Create retriever
61
+ retriever = vectorstore.as_retriever()
62
+
63
+ # Define the prompt template
64
+ RAG_PROMPT_TEMPLATE = """\
65
+ system
66
+ You are a helpful assistant. You answer user questions based on provided context. If you can't answer the question with the provided context, say you don't know.
67
+
68
+ user
69
+ User Query:
70
+ {query}
71
+
72
+ Context:
73
+ {context}
74
+
75
+ assistant
76
+ """
77
+
78
+ rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
79
+
80
+ # Create ChatOpenAI instance
81
+ llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
82
+
83
# RAG chain that returns both the model response and the retrieved context.
# Input: a dict with a "question" key.
# Output: {"response": <LLM output>, "context": <retrieved documents>}.
retrieval_augmented_qa_chain = (
    # The prompt template's variables are "query" and "context", so the
    # incoming "question" is exposed to the prompt under the name "query".
    {"context": itemgetter("question") | retriever, "query": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)
88
+
89
+ # Chainlit
90
@cl.on_chat_start
async def start_chat():
    """
    Build the LCEL RAG chain for a new chat session and store it in the user session.

    The chain takes a dict with a "query" key, passes the query to the
    retriever to obtain the context, fills the RAG prompt with both, and
    sends the prompt to the module-level ``llm``. The chain is stored in the
    user session under the key "lcel_rag_chain".

    Model parameters are configured on the module-level ``llm`` object, so no
    per-session settings are defined here. Any exception raised while
    building or storing the chain is printed and not re-raised.
    """
    try:
        lcel_rag_chain = (
            {"context": itemgetter("query") | retriever, "query": itemgetter("query")}
            | rag_prompt
            | llm
        )

        cl.user_session.set("lcel_rag_chain", lcel_rag_chain)
        print("Chat session started and LCEL RAG chain set.")
    except Exception as e:
        print(f"Error in start_chat: {e}")
113
+
114
@cl.on_message
async def main(message: cl.Message):
    """
    Answer one incoming user message with the session's LCEL RAG chain.

    The chain is read from the user session under the key "lcel_rag_chain".
    Its output for ``message.content`` is streamed chunk by chunk into a new
    reply message, which is then sent. If the chain is missing or an
    exception is raised, the error is printed and a short error reply is sent
    to the user instead.
    """
    lcel_rag_chain = cl.user_session.get("lcel_rag_chain")
    if lcel_rag_chain is None:
        # A missing key is expected to come back as None rather than raise
        # KeyError (verify against the installed Chainlit version), so the
        # "no chain in this session" case is handled explicitly.
        print("Session error: 'lcel_rag_chain' not found in user session")
        await cl.Message(content="Session error occurred. Please try again.").send()
        return

    try:
        print(f"Received message: {message.content}")
        print("Using LCEL RAG chain to generate response...")

        msg = cl.Message(content="")

        async for chunk in lcel_rag_chain.astream(
            {"query": message.content},
            config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
        ):
            # Chat model chunks expose their text as ``.content``; fall back
            # to ``str`` for any other chunk type.
            chunk_text = chunk.content if hasattr(chunk, 'content') else str(chunk)
            print(f"Streaming chunk: {chunk_text}")
            await msg.stream_token(chunk_text)

        print("Sending final message...")
        await msg.send()
        print("Message sent.")
    except KeyError as e:
        print(f"Session error: {e}")
        # Error replies must be new messages: ``message`` is the user's
        # incoming message and ``send()`` takes no content argument.
        await cl.Message(content="Session error occurred. Please try again.").send()
    except Exception as e:
        print(f"Error: {e}")
        await cl.Message(content="An error occurred. Please try again.").send()
chainlit.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Airbnb 10k 2024 RAG Application
2
+
3
+ Welcome to the Airbnb 10k 2024 RAG application!
4
+
5
+ This RAG (retrieval-augmented generation) application allows you to query the Airbnb 10k 2024 filing dataset. It uses a general-purpose LLM together with RAG techniques to retrieve relevant passages and answer user queries about the Airbnb 10k 2024 filing dataset.
6
+
7
+ Build 🏗️
8
+
9
+ Data: Airbnb 10-k Filings from Q1, 2024
10
+ LLM: OpenAI
11
+ Embedding Model: OpenAI Embeddings (model="text-embedding-ada-002")
12
+ Infrastructure: LangChain
13
+ Vector Store: FAISS
14
+ Deployment: Chainlit, Hugging Face
15
+
16
+ Ship 🚢
17
+
18
+ Evaluate your answers to the following questions
19
+ Q1 "What is Airbnb's 'Description of Business'?"
20
+ Q2 "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"
21
+ Q3 "What is the 'maximum number of shares to be sold under the 10b5-1 Trading plan' by Brian Chesky?"
data/Airbnb-10k.pdf ADDED
Binary file (596 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ chainlit>=0.7.700
2
+ langchain==0.2.5
3
+ langchain_community==0.2.5
4
+ langchain_core==0.2.9
5
+ langchain_huggingface==0.0.3
6
+ langchain_text_splitters==0.2.1
7
+ langchain_openai==0.1.9
8
+ python-dotenv==1.0.0
9
+ pymupdf==1.24.5
10
+ faiss-cpu==1.8.0.post1
11
+ openai==1.35.3