from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredAPIFileLoader

# Endpoint passed to UnstructuredAPIFileLoader when the caller supplies none.
UNSTRUCTURED_API_URL = 'https://paf-stkjy1b5.api.unstructuredapp.io/'

# Chunking defaults handed to RecursiveCharacterTextSplitter.
DEFAULT_CHUNK_SIZE = 2000
DEFAULT_CHUNK_OVERLAP = 500


def load_documents_OCR(file_path, unstructured_api, url=UNSTRUCTURED_API_URL):
    """Load documents that require OCR via the Unstructured API.

    Args:
        file_path: Path of the file handed to ``UnstructuredAPIFileLoader``.
        unstructured_api: API key used to authenticate against the
            Unstructured service.
        url: Endpoint of the Unstructured API. Defaults to
            ``UNSTRUCTURED_API_URL``.

    Returns:
        Whatever ``UnstructuredAPIFileLoader.load()`` returns for the file
        (a list of LangChain documents).
    """
    loader = UnstructuredAPIFileLoader(
        file_path=file_path,
        api_key=unstructured_api,
        url=url,
        mode='paged',
    )
    return loader.load()


def load_documents(file_path):
    """Load a PDF file with LangChain's ``PyPDFLoader``.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The documents returned by ``PyPDFLoader.load()``.
    """
    return PyPDFLoader(file_path).load()


def split_documents(
    documents,
    chunk_size=DEFAULT_CHUNK_SIZE,
    chunk_overlap=DEFAULT_CHUNK_OVERLAP,
):
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split, as produced by one of the loaders.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunked documents returned by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)


def load_and_split_documents(file_path):
    """Load a PDF from ``file_path`` and split it into chunks.

    A diagnostic message is printed when either stage yields nothing.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The split documents, or an empty list when nothing could be loaded.
    """
    # Reuse the sibling helper instead of duplicating the PyPDFLoader calls.
    documents = load_documents(file_path)
    if not documents:
        print("No documents loaded from file:", file_path)
        return []

    split_docs = split_documents(documents)
    if not split_docs:
        # Still return the (empty) splitter result so callers get a list-like.
        print("Document splitting resulted in no output for file:", file_path)
    return split_docs


def update_metadata(documents, original_name):
    """Set the ``source`` metadata entry of every document.

    The documents are modified in place; the returned list is a new list
    holding the same document objects.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping.
        original_name: Value stored under ``metadata['source']``.

    Returns:
        A new list containing the updated documents in their original order.
    """
    updated_documents = list(documents)
    for doc in updated_documents:
        doc.metadata['source'] = original_name
    return updated_documents