| | """Document processing module for loading and splitting documents""" |
| |
|
| | from typing import List, Union |
| | from langchain_community.document_loaders import WebBaseLoader |
| | from langchain_text_splitters import RecursiveCharacterTextSplitter |
| | |
| | from langchain_core.documents import Document |
| | from pathlib import Path |
| | from langchain_community.document_loaders import ( |
| | WebBaseLoader, |
| | PyPDFLoader, |
| | TextLoader, |
| | PyPDFDirectoryLoader |
| | ) |
| |
|
class DocumentProcessor:
    """Handles document loading and processing"""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize document processor

        Args:
            chunk_size: Size of text chunks, forwarded to
                RecursiveCharacterTextSplitter
            chunk_overlap: Overlap between chunks, forwarded to
                RecursiveCharacterTextSplitter
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # One splitter instance is shared by every method that chunks text.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def process_pdf(self, file_paths: List[str]) -> List[Document]:
        """
        Load multiple PDFs and return a combined list of chunks

        A PDF that cannot be loaded or split is reported with ``print`` and
        skipped, so one bad file does not abort the whole batch.

        Args:
            file_paths: Paths of the PDF files to load

        Returns:
            Chunks of every PDF that loaded successfully, in input order
        """
        all_documents: List[Document] = []

        for path in file_paths:
            try:
                # Split with the processor's own splitter (``self.splitter``);
                # the class has no ``text_splitter`` attribute.
                chunks = self.split_documents(PyPDFLoader(path).load())
            except Exception as e:
                # Deliberate best-effort: report the failure and move on.
                print(f"Error loading PDF {path}: {e}")
            else:
                all_documents.extend(chunks)

        return all_documents

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL"""
        return WebBaseLoader(url).load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory"""
        return PyPDFDirectoryLoader(str(directory)).load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a TXT file (read as UTF-8)"""
        return TextLoader(str(file_path), encoding="utf-8").load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a single PDF file"""
        # Load the file that was actually requested rather than a fixed
        # "data" directory.
        return PyPDFLoader(str(file_path)).load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF directories, or TXT files

        Args:
            sources: List of URLs, PDF folder paths, or TXT file paths

        Returns:
            List of loaded documents, in the order the sources were given

        Raises:
            ValueError: If a source is not a URL, an existing directory,
                or a path ending in ``.txt``
        """
        docs: List[Document] = []
        for src in sources:
            if src.startswith(("http://", "https://")):
                docs.extend(self.load_from_url(src))
                # A URL is fully handled here; it must not fall through to
                # the filesystem checks below.
                continue

            # Inspect the source itself, not a hard-coded location.
            path = Path(src)
            if path.is_dir():
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use URL, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents

        Args:
            urls: List of sources to process; each entry is handled by
                ``load_documents``

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)