Spaces:
Running
Running
File size: 909 Bytes
5ac9b29 e3cf4fe 5ac9b29 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Load and split the PDF document and return the documents and text chunks
def load_split_pdf(file_path, *, chunk_size=100, chunk_overlap=20):
    """Load a PDF and split its contents into overlapping text chunks.

    Args:
        file_path: Location of the PDF; handed unchanged to ``PyPDFLoader``.
        chunk_size: Upper bound on the size of each chunk, forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 100, the value
            this function previously hard-coded.
        chunk_overlap: Amount of text shared between neighbouring chunks,
            forwarded to ``RecursiveCharacterTextSplitter``. Defaults to 20,
            the value this function previously hard-coded.

    Returns:
        A ``(documents, chunks)`` tuple, where ``documents`` is whatever
        ``PyPDFLoader.load()`` produced and ``chunks`` is the result of
        running those documents through the text splitter.

    Validation of ``chunk_size`` / ``chunk_overlap`` is left to the splitter.
    """
    documents = PyPDFLoader(file_path).load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Generic fallback order, coarsest first: paragraph breaks, line
        # breaks, spaces, and finally "" (split between any two characters).
        separators=["\n\n", "\n", " ", ""],
    )

    chunks = text_splitter.split_documents(documents)
    return documents, chunks
|