isayahc committed on
Commit 47feab3
1 Parent(s): 446ec20

done more refactoring

config.py CHANGED
@@ -2,7 +2,8 @@ import os
 from dotenv import load_dotenv
 from rag_app.database.db_handler import DataBaseHandler
 from langchain_huggingface import HuggingFaceEndpoint
-from langchain_huggingface import HuggingFaceHubEmbeddings
+# from langchain_huggingface import HuggingFaceHubEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
 
 load_dotenv()
 
@@ -16,7 +17,15 @@ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 
 
 
-embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
+# embeddings = HuggingFaceHubEmbeddings(repo_id=EMBEDDING_MODEL)
+
+model_kwargs = {'device': 'cpu'}
+encode_kwargs = {'normalize_embeddings': False}
+embeddings = HuggingFaceEmbeddings(
+    model_name=EMBEDDING_MODEL,
+    model_kwargs=model_kwargs,
+    encode_kwargs=encode_kwargs
+)
 
 db = DataBaseHandler()
 
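This swaps the hosted HuggingFaceHubEmbeddings client for a local HuggingFaceEmbeddings instance, so vectors are now computed in-process via sentence-transformers rather than through the Hub API. A minimal sketch of how the new object behaves, assuming EMBEDDING_MODEL resolves to a sentence-transformers checkpoint (the model name below is an illustrative stand-in; the real value is read from the environment):

import_example.py
from langchain_huggingface import HuggingFaceEmbeddings

# Illustrative model name; in config.py the value comes from EMBEDDING_MODEL
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},                 # run on CPU, matching the commit
    encode_kwargs={"normalize_embeddings": False},  # keep raw (unnormalized) vectors
)

vector = embeddings.embed_query("What is retrieval-augmented generation?")
print(len(vector))  # embedding dimensionality, e.g. 768 for all-mpnet-base-v2

Note that with normalize_embeddings left False, cosine-style comparisons depend on the vector store's own distance metric rather than on pre-normalized vectors.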
 
rag_app/knowledge_base/utils.py CHANGED
@@ -1,38 +1,30 @@
 from langchain_core.documents import Document
 from chains import generate_document_summary_prompt
-from config import SEVEN_B_LLM_MODEL
 # embeddings functions
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.embeddings.sentence_transformer import (
     SentenceTransformerEmbeddings,
 )
 import time
-from langchain_core.documents import Document
-from config import EMBEDDING_MODEL
+from langchain_core.language_models import BaseChatModel
 from langchain.retrievers import VectorStoreRetriever
 from langchain_core.vectorstores import VectorStoreRetriever
 # vectorization functions
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import Chroma
 from langchain_community.retrievers import BM25Retriever
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 
-from rag_app.knowledge_base.utils import create_embeddings
-from rag_app.utils.generate_summary import generate_description, generate_keywords
-
-import time
-import os
-
-from config import FAISS_INDEX_PATH
 
 from pathlib import Path
 from langchain_community.vectorstores import FAISS
 from dotenv import load_dotenv
 import os
-from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import requests
-from langchain_community.vectorstores import Chroma
-
 
+from rag_app.utils.generate_summary import generate_description, generate_keywords
+from config import EMBEDDING_MODEL, FAISS_INDEX_PATH, SEVEN_B_LLM_MODEL
 
 def create_embeddings(
     docs: list[Document],
@@ -76,7 +68,8 @@ def create_embeddings(
 
 
 def generate_document_summaries(
-    docs: list[Document]
+    docs: list[Document],
+    llm: BaseChatModel = SEVEN_B_LLM_MODEL,
 ) -> list[Document]:
     """
     Generates summaries for a list of Document objects and updates their metadata with the summaries.
@@ -99,7 +92,7 @@ def generate_document_summaries(
 
     for doc in new_docs:
 
-        genrate_summary_chain = generate_document_summary_prompt | SEVEN_B_LLM_MODEL
-        summary = genrate_summary_chain.invoke(
+        generate_summary_chain = generate_document_summary_prompt | llm
+        summary = generate_summary_chain.invoke(
             {"document": str(doc.metadata)}
         )
@@ -155,105 +148,4 @@ def build_vector_store(
     result = f"built vector store at {FAISS_INDEX_PATH}"
     return result
 
-def get_reranked_docs_faiss(
-    query: str,
-    path_to_db: str,
-    embedding_model: str,
-    hf_api_key: str,
-    num_docs: int = 5
-) -> list:
-    """Re-ranks the similarity search results and returns the top-k highest-ranked docs.
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    assert num_docs <= 10, "num_docs should be less than similarity search results"
-
-    embeddings = HuggingFaceInferenceAPIEmbeddings(
-        api_key=hf_api_key,
-        model_name=embedding_model
-    )
-
-    # Load the vectorstore database
-    db = FAISS.load_local(
-        folder_path=path_to_db,
-        embeddings=embeddings,
-        allow_dangerous_deserialization=True
-    )
-
-    # Get 10 documents based on similarity search
-    docs = db.similarity_search(query=query, k=10)
-
-    # Add the page_content, description and title together
-    passages = [doc.page_content + "\n" + doc.metadata.get('title', "") + "\n" + doc.metadata.get('description', "")
-                for doc in docs]
-
-    # Prepare the payload
-    inputs = [{"text": query, "text_pair": passage} for passage in passages]
-
-    API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(API_URL, headers=headers, json=inputs)
-    scores = response.json()
-
-    try:
-        relevance_scores = [item[1]['score'] for item in scores]
-    except ValueError as e:
-        print('Could not get the relevance_scores -> something might be wrong with the json output')
-        return
-
-    if relevance_scores:
-        ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
-        top_k_results = ranked_results[:num_docs]
-        return [doc for doc, _, _ in top_k_results]
-
-
-def get_reranked_docs_chroma(query: str,
-                             path_to_db: str,
-                             embedding_model: str,
-                             hf_api_key: str,
-                             reranking_hf_url: str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
-                             num_docs: int = 5) -> list:
-    """Re-ranks the similarity search results and returns the top-k highest-ranked docs.
-
-    Args:
-        query (str): The search query
-        path_to_db (str): Path to the vectorstore database
-        embedding_model (str): Embedding model used in the vector store
-        num_docs (int): Number of documents to return
-
-    Returns: A list of documents with the highest rank
-    """
-    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
-                                                   model_name=embedding_model)
-    # Load the vectorstore database
-    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
-
-    # Get k documents based on similarity search
-    sim_docs = db.similarity_search(query=query, k=10)
-
-    passages = [doc.page_content for doc in sim_docs]
-
-    # Prepare the payload
-    payload = {"inputs":
-               {"source_sentence": query,
-                "sentences": passages}}
-
-    headers = {"Authorization": f"Bearer {hf_api_key}"}
-
-    response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
-    print(f'{response = }')
-    if response.status_code != 200:
-        print('Something went wrong with the response')
-        return
-
-    similarity_scores = response.json()
-    ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
-    top_k_results = ranked_results[:num_docs]
-    return [doc for doc, _, _ in top_k_results]
+
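generate_document_summaries now receives its LLM as a parameter (defaulting to SEVEN_B_LLM_MODEL) instead of importing it at module scope, so the summary chain can be exercised with any BaseChatModel. A sketch of that injection pattern under stated assumptions: the prompt below is a stand-in for chains.generate_document_summary_prompt, and the fake chat model with its canned reply is illustration only.

summary_injection_example.py
from langchain_core.documents import Document
from langchain_core.language_models import FakeListChatModel
from langchain_core.prompts import ChatPromptTemplate

# Stand-in for chains.generate_document_summary_prompt
generate_document_summary_prompt = ChatPromptTemplate.from_template(
    "Summarize this document metadata: {document}"
)

# Any chat model can be injected; a fake model keeps the example offline
llm = FakeListChatModel(responses=["A short summary of the document."])

doc = Document(page_content="...", metadata={"title": "RAG notes"})
generate_summary_chain = generate_document_summary_prompt | llm
summary = generate_summary_chain.invoke({"document": str(doc.metadata)})
print(summary.content)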
rag_app/vector_store_handler/vectorstores.py CHANGED
@@ -146,32 +146,43 @@ class ChromaVectorStore(BaseVectorStore):
         query: str,
         num_docs: int = 5
     ):
+        """Re-ranks the similarity search results and returns the top-k highest-ranked docs.
+
+        Args:
+            query (str): The search query
+            num_docs (int): Number of documents to return
+
+        Returns: A list of documents with the highest rank
+        """
 
-        # Get 10 documents based on similarity search
-        docs = self.vectorstore.similarity_search(query=query, k=10)
+        # Get k documents based on similarity search
+        sim_docs = self.vectorstore.similarity_search(query=query, k=10)
 
-        # Add the page_content, description and title together
-        passages = [doc.page_content + "\n" + doc.metadata.get('title', "") + "\n" + doc.metadata.get('description', "")
-                    for doc in docs]
+        # Use the page_content of each hit as the passage to re-rank
+        passages = [doc.page_content for doc in sim_docs]
+
         # Prepare the payload
-        inputs = [{"text": query, "text_pair": passage} for passage in passages]
-
-        API_URL = "https://api-inference.huggingface.co/models/deepset/gbert-base-germandpr-reranking"
+        payload = {"inputs":
+                   {"source_sentence": query,
+                    "sentences": passages}}
+
         headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
+        reranking_hf_url = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2"
 
-        response = requests.post(API_URL, headers=headers, json=inputs)
-        scores = response.json()
+        response = requests.post(url=reranking_hf_url, headers=headers, json=payload)
+        print(f'{response = }')
+        if response.status_code != 200:
+            print('Something went wrong with the response')
+            return
 
-        try:
-            relevance_scores = [item[1]['score'] for item in scores]
-        except ValueError as e:
-            print('Could not get the relevance_scores -> something might be wrong with the json output')
-            return
-
-        if relevance_scores:
-            ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
-            top_k_results = ranked_results[:num_docs]
-            return [doc for doc, _, _ in top_k_results]
+        similarity_scores = response.json()
+        ranked_results = sorted(zip(sim_docs, passages, similarity_scores), key=lambda x: x[2], reverse=True)
+        top_k_results = ranked_results[:num_docs]
+        return [doc for doc, _, _ in top_k_results]
 
 
 class FAISSVectorStore(BaseVectorStore):
     """
@@ -187,7 +198,7 @@ class FAISSVectorStore(BaseVectorStore):
         """
         self.vectorstore = FAISS.from_documents(texts, self.embeddings)
 
-    def load_existing_vectorstore(self):
+    def load_existing_vectorstore(self, allow_dangerous_deserialization: bool = False):
         """
         Load an existing FAISS vector store from the persist directory.
 
@@ -195,7 +206,7 @@ class FAISSVectorStore(BaseVectorStore):
             ValueError: If persist_directory is not set.
         """
         if self.persist_directory:
-            self.vectorstore = FAISS.load_local(self.persist_directory, self.embeddings, allow_dangerous_deserialization=True)
+            self.vectorstore = FAISS.load_local(self.persist_directory, self.embeddings, allow_dangerous_deserialization=allow_dangerous_deserialization)
         else:
             raise ValueError("Persist directory is required for loading FAISS.")
 
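The re-ranking that previously lived in get_reranked_docs_chroma is now a method on ChromaVectorStore, and it leans on the Hugging Face Inference API's sentence-similarity pipeline, which returns one score per candidate sentence. A hedged sketch of that call in isolation, to show the response shape the method's zip/sort relies on; the query and candidate sentences are illustration only, and the token is read from the same HUGGINGFACEHUB_API_TOKEN environment variable the class uses.

rerank_api_example.py
import os
import requests

API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2"
headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}

payload = {
    "inputs": {
        "source_sentence": "How do I load a FAISS index?",
        "sentences": [
            "FAISS indexes are loaded with load_local.",
            "Chroma persists collections to a directory.",
        ],
    }
}

response = requests.post(API_URL, headers=headers, json=payload)
# Expected shape: one float per candidate sentence, e.g. [0.82, 0.31],
# which is what lets the method zip the scores directly against sim_docs.
print(response.json())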
 
tests/vector_store_handler/test_vectorstores.py CHANGED
@@ -1,14 +1,16 @@
 import unittest
 from unittest.mock import MagicMock, patch
-from langchain.embeddings import OpenAIEmbeddings
-from langchain.schema import Document
+# from langchain.embeddings import OpenAIEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
+# from langchain.schema import Document
+from langchain_core.documents import Document
 
 # Update the import to reflect your project structure
 from rag_app.vector_store_handler.vectorstores import BaseVectorStore, ChromaVectorStore, FAISSVectorStore
 
 class TestBaseVectorStore(unittest.TestCase):
     def setUp(self):
-        self.embedding_model = MagicMock(spec=OpenAIEmbeddings)
+        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
         self.base_store = BaseVectorStore(self.embedding_model, "test_dir")
 
     def test_init(self):
@@ -34,7 +36,7 @@ class TestBaseVectorStore(unittest.TestCase):
 
 class TestChromaVectorStore(unittest.TestCase):
     def setUp(self):
-        self.embedding_model = MagicMock(spec=OpenAIEmbeddings)
+        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
         self.chroma_store = ChromaVectorStore(self.embedding_model, "test_dir")
 
     @patch('rag_app.vector_store_handler.vectorstores.Chroma')
@@ -62,7 +64,7 @@ class TestChromaVectorStore(unittest.TestCase):
 
 class TestFAISSVectorStore(unittest.TestCase):
     def setUp(self):
-        self.embedding_model = MagicMock(spec=OpenAIEmbeddings)
+        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
         self.faiss_store = FAISSVectorStore(self.embedding_model, "test_dir")
 
     @patch('rag_app.vector_store_handler.vectorstores.FAISS')
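The tests now mock HuggingFaceEmbeddings instead of OpenAIEmbeddings, matching the config change. A sketch of one further test that could cover the new load_existing_vectorstore flag, following the same MagicMock/patch conventions; the test class, test name, and assertion are illustrative, assuming the flag is forwarded to FAISS.load_local as a keyword argument.

test_faiss_flag_example.py
import unittest
from unittest.mock import MagicMock, patch
from langchain_huggingface import HuggingFaceEmbeddings

from rag_app.vector_store_handler.vectorstores import FAISSVectorStore

class TestFAISSDeserializationFlag(unittest.TestCase):
    def setUp(self):
        self.embedding_model = MagicMock(spec=HuggingFaceEmbeddings)
        self.faiss_store = FAISSVectorStore(self.embedding_model, "test_dir")

    @patch('rag_app.vector_store_handler.vectorstores.FAISS')
    def test_flag_is_forwarded(self, mock_faiss):
        # The opt-in flag should reach FAISS.load_local as a keyword argument
        self.faiss_store.load_existing_vectorstore(allow_dangerous_deserialization=True)
        mock_faiss.load_local.assert_called_once_with(
            "test_dir", self.embedding_model, allow_dangerous_deserialization=True
        )

if __name__ == '__main__':
    unittest.main()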