|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain.retrievers import ParentDocumentRetriever |
|
from langchain.storage import InMemoryStore |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import PyMuPDFLoader |
|
import os |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Working directory of the process at import time.
curr_dir: str = os.getcwd()

# Directory (relative to the working directory) where Chroma persists its data.
db_path: str = 'chroma_db'
|
|
|
def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
    """Load a PDF and build the parent/child text splitters for it.

    Args:
        file_path (str): Path to the PDF document to load.
        parent_chunk_size (int): ``chunk_size`` for the parent splitter.
        child_chunk_size (int): ``chunk_size`` for the child splitter.

    Returns:
        tuple: ``(documents, parent_splitter, child_splitter)`` where
        ``documents`` is whatever ``PyMuPDFLoader(file_path).load()``
        returns and both splitters are ``RecursiveCharacterTextSplitter``
        instances configured only with their respective ``chunk_size``
        (every other splitter setting keeps the library default).
    """
    # Load first so that a bad path fails before any splitter is built.
    pages = PyMuPDFLoader(file_path).load()

    parent_splitter, child_splitter = (
        RecursiveCharacterTextSplitter(chunk_size=size)
        for size in (parent_chunk_size, child_chunk_size)
    )

    return pages, parent_splitter, child_splitter
|
|
|
|
|
|
|
def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
    """Create the persistent Chroma vector store and an in-memory doc store.

    Args:
        embeddings_model: Either the name of a HuggingFace embedding model
            (str) to load via ``HuggingFaceEmbeddings``, or an
            already-constructed embeddings object, which is used as-is.
            Defaults to ``"all-MiniLM-L6-v2"``.

    Returns:
        tuple: ``(vectordb, store)`` where ``vectordb`` is a ``Chroma``
        instance whose ``persist_directory`` is the module-level
        ``db_path`` and ``store`` is a new, empty ``InMemoryStore``.
    """
    # Honour the caller's choice of model. Previously the argument was
    # overwritten with a hard-coded model name, so it had no effect.
    if isinstance(embeddings_model, str):
        embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    else:
        # Accept a ready-made embeddings object for callers that build
        # their own.
        embeddings = embeddings_model

    vectordb = Chroma(
        persist_directory=db_path,
        embedding_function=embeddings,
    )
    store = InMemoryStore()

    return vectordb, store
|
|
|
|
|
|
|
def rag_retriever(vectorstore):
    """Return a retriever backed by the given vector store.

    Args:
        vectorstore: Vector store object exposing ``as_retriever()``
            (for example a ``Chroma`` instance).

    Returns:
        The object returned by ``vectorstore.as_retriever()``, called with
        no arguments. No document store or parent/child splitters are
        involved; this is the vector store's own retriever, not a
        ``ParentDocumentRetriever``.
    """
    return vectorstore.as_retriever()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|