File size: 909 Bytes
5ac9b29
 
 
 
 
 
 
 
 
 
 
e3cf4fe
 
5ac9b29
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load and split the PDF document and return the documents and text chunks
def load_split_pdf(file_path):
    # Load the PDF document and split it into chunks
    loader = PyPDFLoader(file_path)  # Initialize the PDF loader with the file path
    documents = loader.load()  # Load the PDF document 

    # Initialize the recursive character text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,  # Set the maximum chunk size
        chunk_overlap=20,  # Set the number of overlapping characters between chunks
        separators=["\n\n", "\n", " ", ""],  # Define resume-specific separators for splitting
    )   

    # Split the loaded documents into chunks
    chunks = text_splitter.split_documents(documents)
    return documents, chunks