SwatGarg committed on
Commit
63b46ac
1 Parent(s): eb5e96e

Create retriever.py

Browse files
Files changed (1) hide show
  1. retriever.py +120 -0
retriever.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.embeddings import HuggingFaceEmbeddings
2
+ from langchain.retrievers import ParentDocumentRetriever
3
+ from langchain.storage import InMemoryStore
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
6
+ from langchain_community.document_loaders import PyMuPDFLoader
7
+ import os
8
+
9
+ # Function to create embeddings
10
+ # def create_embeddings(text_chunks):
11
+ # embeddings = embeddings_model.encode(text_chunks, show_progress_bar=True)
12
+ # return embeddings
13
+
# Resolve the on-disk location of the persisted Chroma database:
# two directory levels above the current working directory, then
# src/vector_db/chroma_db_v2.
# NOTE(review): this depends on os.getcwd() — i.e. where the process is
# launched from — not on this file's location. Confirm callers always run
# from the expected directory; os.path.dirname(__file__) may be safer.
curr_dir = os.getcwd()
db_path = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), 'src','vector_db','chroma_db_v2')
16
+
def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
    '''
    Load a PDF document and build the text splitters for parent/child chunking.

    Args:
        file_path (str): The path to the PDF document
        parent_chunk_size (int): The size of the parent chunks
        child_chunk_size (int): The size of the child chunks

    Returns:
        documents (list): The list of documents loaded from the PDF
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents
    '''
    # Read every page of the PDF into Document objects.
    documents = PyMuPDFLoader(file_path).load()

    # One splitter per granularity level: large parent chunks and the
    # smaller child chunks that get embedded/indexed.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)

    return documents, parent_splitter, child_splitter
41
+
42
+
43
+ # Function to create the vectorstore
44
+ def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
45
+ '''
46
+ Create the vectorstore and store for the documents
47
+
48
+ Args:
49
+ embeddings_model (HuggingFaceEmbeddings): The embeddings model
50
+ documents (list): The list of documents
51
+
52
+ Returns:
53
+ vectorstore (Chroma): The vectorstore
54
+ store (InMemoryStore): The store
55
+
56
+ '''
57
+
58
+ # Initialize the embedding model
59
+ embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
60
+
61
+ # This text splitter is used to create the parent documents
62
+ parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
63
+
64
+ # This text splitter is used to create the child documents
65
+ # It should create documents smaller than the parent
66
+ child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
67
+
68
+ # The vectorstore to use to index the child chunks
69
+ vectorstore = Chroma(
70
+ collection_name="split_parents", embedding_function=embeddings_model
71
+ )
72
+ vectordb = Chroma(persist_directory=db_path,
73
+ embedding_function=embeddings_model)
74
+ # The storage layer for the parent documents
75
+ store = InMemoryStore()
76
+
77
+ return vectordb, store
78
+
79
+
80
+
81
+ def rag_retriever(vectorstore):
82
+ '''
83
+ Create the retriever for the RAG model
84
+
85
+ Args:
86
+ vectorstore (Chroma): The vectorstore
87
+ store (InMemoryStore): The store
88
+ parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
89
+ child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents
90
+
91
+ Returns:
92
+ retriever (ParentDocumentRetriever): The retriever
93
+
94
+ '''
95
+
96
+ # retriever = ParentDocumentRetriever(
97
+ # vectorstore=vectorstore,
98
+ # docstore=store,
99
+ # child_splitter=None,
100
+ # parent_splitter=None,
101
+ # docs=documents
102
+ # )
103
+
104
+ # retriever.add_documents(documents)
105
+ retriever = vectorstore.as_retriever()
106
+
107
+ return retriever
108
+
109
+
110
+
111
+
112
+ # def retrieve_context(query, top_k):
113
+
114
+ # # Retrieve the top k similar documents
115
+ # sub_docs = vectorstore.similarity_search(query, k=top_k, return_documents=True)
116
+
117
+ # # Get the context of the first document
118
+ # context = sub_docs[0].page_content
119
+
120
+ # return context