VenkyPas committed on
Commit
9f3bc1a
1 Parent(s): f728247

Fix Question #2

Browse files
Files changed (3) hide show
  1. app.py +46 -52
  2. chainlit.md +0 -6
  3. requirements.txt +2 -0
app.py CHANGED
@@ -7,10 +7,14 @@ from langchain_community.document_loaders import PyMuPDFLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_community.vectorstores import Qdrant
 
 
10
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
11
  from langchain_core.prompts import PromptTemplate
 
12
  from langchain.schema.runnable.config import RunnableConfig
13
  from langchain.globals import set_debug
 
14
 
15
  set_debug(False)
16
 
@@ -30,54 +34,38 @@ HF_LLM_ENDPOINT = os.environ["HF_LLM_ENDPOINT"]
30
  HF_EMBED_ENDPOINT = os.environ["HF_EMBED_ENDPOINT"]
31
  HF_TOKEN = os.environ["HF_TOKEN"]
32
 
33
- # ---- GLOBAL DECLARATIONS ---- #
34
-
35
- # -- RETRIEVAL -- #
36
- """
37
- 1. Load Documents from Text File
38
- 2. Split Documents into Chunks
39
- 3. Load HuggingFace Embeddings (remember to use the URL we set above)
40
- 4. Index Files if they do not exist, otherwise load the vectorstore
41
- """
42
  ### 1. CREATE TEXT LOADER AND LOAD DOCUMENTS
43
  ### NOTE: PAY ATTENTION TO THE PATH THEY ARE IN.
44
- pdf_loader = PyMuPDFLoader("./data/10Q-AirBnB.pdf")
45
- documents = pdf_loader.load()
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
48
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=25)
49
  split_documents = text_splitter.split_documents(documents)
50
 
51
  ### 3. LOAD HUGGINGFACE EMBEDDINGS
52
- hf_embeddings = HuggingFaceEndpointEmbeddings(
53
- model=HF_EMBED_ENDPOINT,
54
- task="feature-extraction",
55
- huggingfacehub_api_token=HF_TOKEN,
56
- )
57
-
58
- # Step 6: Create a custom retriever
59
- # class CustomQdrantRetriever:
60
- # def __init__(self, vectorstore, top_k=5):
61
- # self.vectorstore = vectorstore
62
- # self.top_k = top_k
63
-
64
- # def __call__(self, query):
65
- # embedded_query = self.vectorstore.embedding_function(query)
66
- # search_result = vectorstore.search(
67
- # # collection_name=collection_name,
68
- # query_vector=embedded_query,
69
- # limit=self.top_k
70
- # )
71
- # documents = [
72
- # {"page_content": hit.payload["text"], "metadata": hit.payload}
73
- # for hit in search_result
74
- # ]
75
- # return documents
76
 
77
  FAISS_VECTOR_STORE = "FAISS"
78
  QDRANT_VECTOR_STORE = "QDRANT"
79
 
80
- VECTOR_STORE = FAISS_VECTOR_STORE
81
 
82
  hf_retriever = ""
83
 
@@ -86,7 +74,7 @@ if VECTOR_STORE == FAISS_VECTOR_STORE:
86
  VECTOR_STORE_DIR = os.path.join(DATA_DIR, "vectorstore")
87
  VECTOR_STORE_PATH = os.path.join(VECTOR_STORE_DIR, "index.faiss")
88
 
89
- FAISS_MAX_FETCH_SIZE = 2
90
  FAISS_MAX_BATCH_SIZE = 32
91
  if os.path.exists(VECTOR_STORE_PATH):
92
  vectorstore = FAISS.load_local(
@@ -94,7 +82,6 @@ if VECTOR_STORE == FAISS_VECTOR_STORE:
94
  hf_embeddings,
95
  allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
96
  )
97
- hf_retriever = vectorstore.as_retriever(search_kwargs={"k": FAISS_MAX_FETCH_SIZE, "fetch_k": FAISS_MAX_FETCH_SIZE})
98
  print("Loaded Vectorstore at " + VECTOR_STORE_DIR)
99
  else:
100
  print("Indexing Files")
@@ -108,7 +95,8 @@ if VECTOR_STORE == FAISS_VECTOR_STORE:
108
  vectorstore.add_documents(split_documents[i:i+FAISS_MAX_BATCH_SIZE])
109
  vectorstore.save_local(VECTOR_STORE_DIR)
110
 
111
- hf_retriever = vectorstore.as_retriever(search_kwargs={"k": FAISS_MAX_FETCH_SIZE, "fetch_k": FAISS_MAX_FETCH_SIZE})
 
112
  else:
113
  QDRANT_MAX_FETCH_SIZE = 2
114
  QDRANT_MAX_BATCH_SIZE = 32
@@ -127,7 +115,8 @@ else:
127
 
128
  # hf_retriever = CustomQdrantRetriever(vectorstore=vectorstore, top_k=QDRANT_MAX_FETCH_SIZE)
129
 
130
- hf_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
 
131
 
132
  # -- AUGMENTED -- #
133
  """
@@ -158,15 +147,17 @@ rag_prompt = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
158
  """
159
 
160
  ### 1. CREATE HUGGINGFACE ENDPOINT FOR LLM
161
- hf_llm = HuggingFaceEndpoint(
162
- endpoint_url=HF_LLM_ENDPOINT,
163
- max_new_tokens=64,
164
- top_k=10,
165
- top_p=0.95,
166
- temperature=0.3,
167
- repetition_penalty=1.15,
168
- huggingfacehub_api_token=HF_TOKEN,
169
- )
 
 
170
 
171
  @cl.author_rename
172
  def rename(original_author: str):
@@ -176,7 +167,7 @@ def rename(original_author: str):
176
  In this case, we're overriding the 'Assistant' author to be 'Paul Graham Essay Bot'.
177
  """
178
  rename_dict = {
179
- "Assistant" : "Paul Graham Essays Bot"
180
  }
181
  return rename_dict.get(original_author, original_author)
182
 
@@ -215,6 +206,9 @@ async def main(message: cl.Message):
215
  {"query": message.content},
216
  config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
217
  ):
218
- await msg.stream_token(chunk)
 
 
 
219
 
220
  await msg.send()
 
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_community.vectorstores import Qdrant
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_openai.embeddings import OpenAIEmbeddings
12
  from langchain_huggingface import HuggingFaceEndpointEmbeddings
13
  from langchain_core.prompts import PromptTemplate
14
+ from langchain_core.messages.ai import AIMessageChunk
15
  from langchain.schema.runnable.config import RunnableConfig
16
  from langchain.globals import set_debug
17
+ from llama_parse import LlamaParse
18
 
19
  set_debug(False)
20
 
 
34
  HF_EMBED_ENDPOINT = os.environ["HF_EMBED_ENDPOINT"]
35
  HF_TOKEN = os.environ["HF_TOKEN"]
36
 
 
 
 
 
 
 
 
 
 
37
  ### 1. CREATE TEXT LOADER AND LOAD DOCUMENTS
38
  ### NOTE: PAY ATTENTION TO THE PATH THEY ARE IN.
39
+ parser = LlamaParse(result_type='markdown', verbose=True, language='en')
40
+
41
+ pdf_documents = parser.load_data('./data/10Q-AirBnB.pdf')
42
+
43
+ class DataObj:
44
+ def __init__(self, data):
45
+ for key, value in data.items():
46
+ setattr(self, key, value)
47
+
48
+ # LlamaParse produces documents that lack the `page_content` attribute expected by RecursiveCharacterTextSplitter.
49
+ document_dicts = [{"page_content": d.text, "metadata": {}} for d in pdf_documents]
50
+ documents = [DataObj(d) for d in document_dicts]
51
+ # print(documents[0].page_content)
52
 
53
  ### 2. CREATE TEXT SPLITTER AND SPLIT DOCUMENTS
54
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
55
  split_documents = text_splitter.split_documents(documents)
56
 
57
  ### 3. LOAD HUGGINGFACE EMBEDDINGS
58
+ # hf_embeddings = HuggingFaceEndpointEmbeddings(
59
+ # model=HF_EMBED_ENDPOINT,
60
+ # task="feature-extraction",
61
+ # huggingfacehub_api_token=HF_TOKEN,
62
+ # )
63
+ hf_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  FAISS_VECTOR_STORE = "FAISS"
66
  QDRANT_VECTOR_STORE = "QDRANT"
67
 
68
+ VECTOR_STORE = QDRANT_VECTOR_STORE
69
 
70
  hf_retriever = ""
71
 
 
74
  VECTOR_STORE_DIR = os.path.join(DATA_DIR, "vectorstore")
75
  VECTOR_STORE_PATH = os.path.join(VECTOR_STORE_DIR, "index.faiss")
76
 
77
+ FAISS_MAX_FETCH_SIZE = 5
78
  FAISS_MAX_BATCH_SIZE = 32
79
  if os.path.exists(VECTOR_STORE_PATH):
80
  vectorstore = FAISS.load_local(
 
82
  hf_embeddings,
83
  allow_dangerous_deserialization=True # this is necessary to load the vectorstore from disk as it's stored as a `.pkl` file.
84
  )
 
85
  print("Loaded Vectorstore at " + VECTOR_STORE_DIR)
86
  else:
87
  print("Indexing Files")
 
95
  vectorstore.add_documents(split_documents[i:i+FAISS_MAX_BATCH_SIZE])
96
  vectorstore.save_local(VECTOR_STORE_DIR)
97
 
98
+ # hf_retriever = vectorstore.as_retriever(search_kwargs={"k": FAISS_MAX_FETCH_SIZE, "fetch_k": FAISS_MAX_FETCH_SIZE})
99
+ hf_retriever = vectorstore.as_retriever()
100
  else:
101
  QDRANT_MAX_FETCH_SIZE = 2
102
  QDRANT_MAX_BATCH_SIZE = 32
 
115
 
116
  # hf_retriever = CustomQdrantRetriever(vectorstore=vectorstore, top_k=QDRANT_MAX_FETCH_SIZE)
117
 
118
+ # hf_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
119
+ hf_retriever = vectorstore.as_retriever()
120
 
121
  # -- AUGMENTED -- #
122
  """
 
147
  """
148
 
149
  ### 1. CREATE HUGGINGFACE ENDPOINT FOR LLM
150
+ # hf_llm = HuggingFaceEndpoint(
151
+ # endpoint_url=HF_LLM_ENDPOINT,
152
+ # max_new_tokens=64,
153
+ # top_k=10,
154
+ # top_p=0.95,
155
+ # temperature=0.3,
156
+ # repetition_penalty=1.15,
157
+ # huggingfacehub_api_token=HF_TOKEN,
158
+ # )
159
+
160
+ hf_llm = ChatOpenAI(model="gpt-4o")
161
 
162
  @cl.author_rename
163
  def rename(original_author: str):
 
167
  In this case, we're overriding the 'Assistant' author to be 'Paul Graham Essay Bot'.
168
  """
169
  rename_dict = {
170
+ "Assistant" : "AirBnB 10Q agent"
171
  }
172
  return rename_dict.get(original_author, original_author)
173
 
 
206
  {"query": message.content},
207
  config=RunnableConfig(callbacks=[cl.LangchainCallbackHandler()]),
208
  ):
209
+ if (isinstance(chunk, AIMessageChunk)):
210
+ await msg.stream_token(chunk.content)
211
+ else:
212
+ await msg.stream_token(chunk)
213
 
214
  await msg.send()
chainlit.md CHANGED
@@ -1,9 +1,3 @@
1
  # AirBnB 10K Chat
2
 
3
  ### I am your personal assistant that can help answer questions about AirBnB 10K filing
4
-
5
- **Lessons not learned**
6
- - Chainlit.md is not rendering on the app
7
- - Not able to answer structured query (Q2) correctly
8
- - HuggingFace space setup takes way too long. Solved it through CPU upgrade
9
- - Work around Huggingface library restrictions on Context window
 
1
  # AirBnB 10K Chat
2
 
3
  ### I am your personal assistant that can help answer questions about AirBnB 10K filing
 
 
 
 
 
 
requirements.txt CHANGED
@@ -4,6 +4,8 @@ langchain_community==0.2.5
4
  langchain_core==0.2.9
5
  langchain_huggingface==0.0.3
6
  langchain_text_splitters==0.2.1
 
 
7
  python-dotenv==1.0.1
8
  faiss-cpu==1.8.0
9
  pymupdf==1.24.6
 
4
  langchain_core==0.2.9
5
  langchain_huggingface==0.0.3
6
  langchain_text_splitters==0.2.1
7
+ langchain-openai==0.1.14
8
+ llama-parse==0.4.5
9
  python-dotenv==1.0.1
10
  faiss-cpu==1.8.0
11
  pymupdf==1.24.6