File size: 3,863 Bytes
63b46ac
 
 
 
 
 
 
 
 
 
 
 
 
 
306ad8e
63b46ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
import os

# Function to create embeddings
# def create_embeddings(text_chunks):
#     embeddings = embeddings_model.encode(text_chunks, show_progress_bar=True)
#     return embeddings

curr_dir = os.getcwd()
db_path = 'chroma_db'

def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
    '''
    Process a PDF document and return the documents and text splitters

    Args:
        file_path (str): The path to the PDF document
        parent_chunk_size (int): The size of the parent chunks
        child_chunk_size (int): The size of the child chunks

    Returns:
        documents (list): The list of documents
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents

    '''
    # Load the PDF document
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()

    # Initialize text splitters for parent and child documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)

    return documents, parent_splitter, child_splitter


# Function to create the vectorstore
def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
    '''
    Create the vectorstore and store for the documents

    Args:
        embeddings_model (HuggingFaceEmbeddings): The embeddings model
        documents (list): The list of documents

    Returns:
        vectorstore (Chroma): The vectorstore
        store (InMemoryStore): The store

    '''

    # Initialize the embedding model
    embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # This text splitter is used to create the parent documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

    # This text splitter is used to create the child documents
    # It should create documents smaller than the parent
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

    # The vectorstore to use to index the child chunks
    vectorstore = Chroma(
        collection_name="split_parents", embedding_function=embeddings_model
    )
    vectordb = Chroma(persist_directory=db_path,
                  embedding_function=embeddings_model)
    # The storage layer for the parent documents
    store = InMemoryStore()

    return vectordb, store



def rag_retriever(vectorstore):
    '''
    Create the retriever for the RAG model

    Args:
        vectorstore (Chroma): The vectorstore
        store (InMemoryStore): The store
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents

    Returns:
        retriever (ParentDocumentRetriever): The retriever
        
    '''

    # retriever = ParentDocumentRetriever(
    #     vectorstore=vectorstore,
    #     docstore=store,
    #     child_splitter=None,
    #     parent_splitter=None,
    #     docs=documents
    # )

    # retriever.add_documents(documents)
    retriever = vectorstore.as_retriever()

    return retriever




# def retrieve_context(query, top_k):

#     # Retrieve the top k similar documents
#     sub_docs = vectorstore.similarity_search(query, k=top_k, return_documents=True)

#     # Get the context of the first document
#     context = sub_docs[0].page_content

#     return context