from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
import os

# Function to create embeddings
# def create_embeddings(text_chunks):
#     embeddings = embeddings_model.encode(text_chunks, show_progress_bar=True)
#     return embeddings

curr_dir = os.getcwd()
# Directory name of the persisted Chroma index used by create_vectorstore()
db_path = 'chroma_db_v2'

class QuestionRetriever:
    '''
    Retrieves the stored question most similar to a user query
    from an in-memory Chroma vectorstore.
    '''

    def load_documents(self, file_name):
        # Load a text file from the ./data directory
        current_directory = os.getcwd()
        data_directory = os.path.join(current_directory, "data")
        file_path = os.path.join(data_directory, file_name)
        loader = TextLoader(file_path)
        documents = loader.load()
        return documents

    def store_data_in_vector_db(self, documents):
        # Split on newlines so each chunk starts with a question
        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator="\n")
        docs = text_splitter.split_documents(documents)
        # Create the open-source embedding function
        embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # Load the chunks into Chroma and keep a handle for later queries
        self.db = Chroma.from_documents(docs, embedding_function)
        return self.db

    def get_response(self, user_query):
        # store_data_in_vector_db() must have been called first
        docs = self.db.similarity_search(user_query)
        most_similar_question = docs[0].page_content.split("\n")[0]  # Extract the first question
        # Skip the top hit if it is the query itself
        if user_query == most_similar_question and len(docs) > 1:
            most_similar_question = docs[1].page_content.split("\n")[0]

        print(most_similar_question)
        return most_similar_question
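
# A minimal usage sketch (assumes a questions file at ./data/questions.txt,
# one question per newline-separated block; the file name and query are hypothetical):
#
#     retriever = QuestionRetriever()
#     documents = retriever.load_documents("questions.txt")
#     retriever.store_data_in_vector_db(documents)
#     suggestion = retriever.get_response("How do I reset my password?")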
        
def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
    '''
    Process a PDF document and return the documents and text splitters

    Args:
        file_path (str): The path to the PDF document
        parent_chunk_size (int): The size of the parent chunks
        child_chunk_size (int): The size of the child chunks

    Returns:
        documents (list): The list of documents
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents

    '''
    # Load the PDF document
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()

    # Initialize text splitters for parent and child documents
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)

    return documents, parent_splitter, child_splitter
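
# A minimal usage sketch (the PDF path is hypothetical); the splitters are
# returned unapplied, so chunking is left to the caller:
#
#     documents, parent_splitter, child_splitter = process_pdf_document("data/handbook.pdf")
#     parent_chunks = parent_splitter.split_documents(documents)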


# Function to create the vectorstore
def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
    '''
    Create the vectorstore and the document store

    Args:
        embeddings_model (str): Name of the HuggingFace sentence-transformers model

    Returns:
        vectordb (Chroma): The persisted vectorstore
        store (InMemoryStore): The storage layer for the parent documents

    '''

    # Initialize the embedding model
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)

    # The vectorstore, backed by the persisted Chroma index at db_path
    vectordb = Chroma(persist_directory=db_path,
                      embedding_function=embeddings)

    # The storage layer for the parent documents
    store = InMemoryStore()

    return vectordb, store
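
# A minimal usage sketch (assumes a Chroma index was previously persisted
# to ./chroma_db_v2; otherwise the collection starts empty; the query is
# hypothetical):
#
#     vectordb, store = create_vectorstore()
#     hits = vectordb.similarity_search("What is the refund policy?", k=3)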



def rag_retriever(vectorstore):
    '''
    Create the retriever for the RAG model

    Args:
        vectorstore (Chroma): The vectorstore

    Returns:
        retriever (VectorStoreRetriever): The retriever

    '''

    # A ParentDocumentRetriever could be used here instead, given a docstore
    # and the parent/child splitters:
    #
    # retriever = ParentDocumentRetriever(
    #     vectorstore=vectorstore,
    #     docstore=store,
    #     child_splitter=child_splitter,
    #     parent_splitter=parent_splitter,
    # )
    # retriever.add_documents(documents)

    retriever = vectorstore.as_retriever()

    return retriever
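
# End-to-end sketch of the retrieval pipeline (the PDF path and query are
# hypothetical):
#
#     documents, parent_splitter, child_splitter = process_pdf_document("data/handbook.pdf")
#     vectordb, store = create_vectorstore()
#     retriever = rag_retriever(vectordb)
#     context_docs = retriever.invoke("What does the handbook say about leave?")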




# def retrieve_context(vectorstore, query, top_k):

#     # Retrieve the top k similar documents
#     sub_docs = vectorstore.similarity_search(query, k=top_k)

#     # Get the context of the first document
#     context = sub_docs[0].page_content

#     return context