from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
import os

# Function to create embeddings
# def create_embeddings(text_chunks):
#     embeddings = embeddings_model.encode(text_chunks, show_progress_bar=True)
#     return embeddings

curr_dir = os.getcwd()
# Directory name of the persisted Chroma index used by create_vectorstore()
db_path = 'chroma_db_v2'

class QuestionRetriever:
    '''
    Retrieves the stored question most similar to a user query
    from an in-memory Chroma vectorstore.
    '''

    def load_documents(self, file_name):
        # Load a text file from the ./data directory
        current_directory = os.getcwd()
        data_directory = os.path.join(current_directory, "data")
        file_path = os.path.join(data_directory, file_name)
        loader = TextLoader(file_path)
        documents = loader.load()
        return documents

    def store_data_in_vector_db(self, documents):
        # Split on newlines so each chunk starts with a question
        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator="\n")
        docs = text_splitter.split_documents(documents)
        # Create the open-source embedding function
        embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # Load the chunks into Chroma and keep a handle for later queries
        self.db = Chroma.from_documents(docs, embedding_function)
        return self.db

    def get_response(self, user_query):
        # store_data_in_vector_db() must have been called first
        docs = self.db.similarity_search(user_query)
        most_similar_question = docs[0].page_content.split("\n")[0]  # Extract the first question
        # Skip the top hit if it is the query itself
        if user_query == most_similar_question and len(docs) > 1:
            most_similar_question = docs[1].page_content.split("\n")[0]

        print(most_similar_question)
        return most_similar_question
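
# A minimal usage sketch (assumes a questions file at ./data/questions.txt,
# one question per newline-separated block; the file name and query are hypothetical):
#
#     retriever = QuestionRetriever()
#     documents = retriever.load_documents("questions.txt")
#     retriever.store_data_in_vector_db(documents)
#     suggestion = retriever.get_response("How do I reset my password?")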
        
def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
    '''
    Process a PDF document and return the documents and text splitters

    Args:
        file_path (str): The path to the PDF document
        parent_chunk_size (int): The size of the parent chunks
        child_chunk_size (int): The size of the child chunks

    Returns:
        documents (list): The list of documents
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents

    '''
    # Load the PDF document
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()

    # Initialize text splitters for parent and child documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)

    return documents, parent_splitter, child_splitter
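
# A minimal usage sketch (the PDF path is hypothetical); the splitters are
# returned unapplied, so chunking is left to the caller:
#
#     documents, parent_splitter, child_splitter = process_pdf_document("data/handbook.pdf")
#     parent_chunks = parent_splitter.split_documents(documents)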


# Function to create the vectorstore
def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
    '''
    Create the vectorstore and the document store

    Args:
        embeddings_model (str): Name of the HuggingFace sentence-transformers model

    Returns:
        vectordb (Chroma): The persisted vectorstore
        store (InMemoryStore): The storage layer for the parent documents

    '''

    # Initialize the embedding model
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)

    # The vectorstore, backed by the persisted Chroma index at db_path
    vectordb = Chroma(persist_directory=db_path,
                      embedding_function=embeddings)

    # The storage layer for the parent documents
    store = InMemoryStore()

    return vectordb, store
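
# A minimal usage sketch (assumes a Chroma index was previously persisted
# to ./chroma_db_v2; otherwise the collection starts empty; the query is
# hypothetical):
#
#     vectordb, store = create_vectorstore()
#     hits = vectordb.similarity_search("What is the refund policy?", k=3)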



def rag_retriever(vectorstore):
    '''
    Create the retriever for the RAG model

    Args:
        vectorstore (Chroma): The vectorstore

    Returns:
        retriever (VectorStoreRetriever): The retriever

    '''

    # A ParentDocumentRetriever could be used here instead, given a docstore
    # and the parent/child splitters:
    #
    # retriever = ParentDocumentRetriever(
    #     vectorstore=vectorstore,
    #     docstore=store,
    #     child_splitter=child_splitter,
    #     parent_splitter=parent_splitter,
    # )
    # retriever.add_documents(documents)

    retriever = vectorstore.as_retriever()

    return retriever
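
# End-to-end sketch of the retrieval pipeline (the PDF path and query are
# hypothetical):
#
#     documents, parent_splitter, child_splitter = process_pdf_document("data/handbook.pdf")
#     vectordb, store = create_vectorstore()
#     retriever = rag_retriever(vectordb)
#     context_docs = retriever.invoke("What does the handbook say about leave?")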




# def retrieve_context(vectorstore, query, top_k):

#     # Retrieve the top k similar documents
#     sub_docs = vectorstore.similarity_search(query, k=top_k)

#     # Get the context of the first document
#     context = sub_docs[0].page_content

#     return context