|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain.retrievers import ParentDocumentRetriever |
|
from langchain.storage import InMemoryStore |
|
from langchain_community.vectorstores import Chroma |
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
from langchain_community.document_loaders import PyMuPDFLoader |
|
import os |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Directory in which Chroma persists its collection (relative path).
db_path = "chroma_db"

# Working directory of the process at the moment this module is imported.
curr_dir = os.getcwd()
|
|
|
def process_pdf_document(file_path_list):
    '''
    Load one or more PDF files and return everything they yield as one list.

    Args:
        file_path_list (list | str | os.PathLike): The paths of the PDF
            documents to load. A single path (string or os.PathLike) is
            accepted as well and is treated as a one-element list.

    Returns:
        documents (list): The concatenated results of PyMuPDFLoader.load()
            for every path, in input order. Empty when no paths are given.
    '''
    # A bare string would otherwise be iterated character by character, and
    # a path object is not iterable at all, so wrap a single path in a list.
    if isinstance(file_path_list, (str, os.PathLike)):
        file_path_list = [file_path_list]

    # Build every loader before loading anything, so a problem raised while
    # constructing a loader surfaces before any PDF has been parsed.
    loaders = [PyMuPDFLoader(file_path) for file_path in file_path_list]

    documents = []
    for loader in loaders:
        documents.extend(loader.load())

    return documents
|
|
|
|
|
|
|
def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
    '''
    Create the Chroma vectorstore and the in-memory store for the documents.

    Args:
        embeddings_model (str | embeddings object): Either the name of the
            HuggingFace model to load (default "all-MiniLM-L6-v2"), or an
            already constructed embeddings object, which is used as-is.

    Returns:
        vectordb (Chroma): The vectorstore, created with the module-level
            db_path as its persist directory.
        store (InMemoryStore): A newly created in-memory store.
    '''
    # Use the model the caller asked for rather than a hard-coded name, so
    # the parameter actually has an effect.
    if isinstance(embeddings_model, str):
        embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    else:
        embeddings = embeddings_model

    vectordb = Chroma(
        persist_directory=db_path,
        embedding_function=embeddings,
    )

    store = InMemoryStore()

    return vectordb, store
|
|
|
|
|
|
|
def rag_retriever(vectorstore, store, documents, parent_splitter, child_splitter):
    '''
    Create the retriever for the RAG model and index the given documents.

    Args:
        vectorstore (Chroma): The vectorstore
        store (InMemoryStore): The store
        documents (list): The documents to add to the retriever. May be
            empty or None, in which case nothing is indexed.
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents

    Returns:
        retriever (ParentDocumentRetriever): The retriever, with
            `documents` already added to it.
    '''
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )

    # The retriever has no `docs` constructor field; documents must be
    # indexed through add_documents(). Skip the call for empty input so the
    # vectorstore is never asked to add an empty batch.
    # NOTE(review): if a caller already calls add_documents() on the
    # returned retriever, the documents would now be indexed twice; confirm.
    if documents:
        retriever.add_documents(documents)

    return retriever