Spaces:
Runtime error
Runtime error
| """Loader that loads PDF files.""" | |
| import os | |
| import tempfile | |
| from abc import ABC | |
| from typing import Any, List, Optional | |
| from urllib.parse import urlparse | |
| import requests | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
| from langchain.document_loaders.unstructured import UnstructuredFileLoader | |
class UnstructuredPDFLoader(UnstructuredFileLoader):
    """PDF loader that partitions the file with the ``unstructured`` package."""

    def _get_elements(self) -> List:
        """Return the elements ``partition_pdf`` produces for ``self.file_path``.

        ``self.unstructured_kwargs`` is forwarded unchanged as keyword arguments.
        """
        # Imported inside the method, so ``unstructured`` is only imported when
        # elements are actually requested.
        from unstructured.partition.pdf import partition_pdf

        options = self.unstructured_kwargs
        elements = partition_pdf(filename=self.file_path, **options)
        return elements
class BasePDFLoader(BaseLoader, ABC):
    """Base loader class for PDF files.

    ``file_path`` is first treated as a local file. If it is not an existing
    file but parses as a URL, the content is downloaded into a temporary file
    and that file is used instead. The temporary file is closed (and thereby
    deleted, as it is created with the default ``delete=True``) when the
    loader is finalized.
    """

    file_path: str
    # Original URL when the PDF was downloaded; stays ``None`` for local files.
    web_path: Optional[str] = None

    def __init__(self, file_path: str):
        """Initialize with a local file path or a URL.

        Args:
            file_path: Path to a local PDF (``~`` is expanded) or a URL whose
                content is downloaded to a temporary file.

        Raises:
            ValueError: If the download returns a status code other than 200,
                or if ``file_path`` is neither an existing file nor a URL.
        """
        self.file_path = file_path
        if "~" in self.file_path:
            self.file_path = os.path.expanduser(self.file_path)

        # Local file: nothing else to do.
        if os.path.isfile(self.file_path):
            return

        if not self._is_valid_url(self.file_path):
            raise ValueError("File path %s is not a valid file or url" % self.file_path)

        # The path is a URL: download it to a temporary file and use that.
        r = requests.get(self.file_path)
        if r.status_code != 200:
            raise ValueError(
                "Check the url of your file; returned status code %s"
                % r.status_code
            )

        self.web_path = self.file_path
        self.temp_file = tempfile.NamedTemporaryFile()
        self.temp_file.write(r.content)
        # The file is re-opened by name by the concrete loaders, so the buffered
        # bytes must reach the disk first; without the flush a reader can see an
        # empty or truncated file.
        self.temp_file.flush()
        self.file_path = self.temp_file.name

    def __del__(self) -> None:
        """Close the temporary download file, if one was created."""
        # ``temp_file`` only exists when the constructor took the URL branch.
        if hasattr(self, "temp_file"):
            self.temp_file.close()

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True if ``url`` has both a scheme and a network location."""
        # Declared static because it is invoked as ``self._is_valid_url(path)``;
        # as a plain function it would receive ``self`` as ``url`` and fail.
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
class OnlinePDFLoader(BasePDFLoader):
    """Loader for online PDFs that delegates parsing to ``UnstructuredPDFLoader``."""

    def load(self) -> List[Document]:
        """Return the documents parsed from ``self.file_path``."""
        resolved_path = str(self.file_path)
        delegate = UnstructuredPDFLoader(resolved_path)
        documents = delegate.load()
        return documents
class PyPDFLoader(BasePDFLoader):
    """Load a PDF with ``pypdf``, producing one ``Document`` per page.

    Each document's metadata holds the source path under ``"source"`` and the
    zero-based page index under ``"page"``.
    """

    def __init__(self, file_path: str):
        """Verify that ``pypdf`` is importable, then initialize with the path.

        Raises:
            ValueError: If the ``pypdf`` package cannot be imported.
        """
        try:
            import pypdf  # noqa:F401
        except ImportError:
            message = (
                "pypdf package not found, please install it with `pip install pypdf`"
            )
            raise ValueError(message)
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Read the PDF and return its pages as documents, in page order."""
        import pypdf

        source = self.file_path
        documents = []
        # Text is extracted while the file is still open, since the reader
        # works on the open file object.
        with open(source, "rb") as stream:
            reader = pypdf.PdfReader(stream)
            for index, page in enumerate(reader.pages):
                page_metadata = {"source": source, "page": index}
                documents.append(
                    Document(page_content=page.extract_text(), metadata=page_metadata)
                )
        return documents
class PDFMinerLoader(BasePDFLoader):
    """Load a PDF as a single ``Document`` using PDFMiner's ``extract_text``."""

    def __init__(self, file_path: str):
        """Verify that PDFMiner is importable, then initialize with the path.

        Raises:
            ValueError: If ``pdfminer.high_level`` cannot be imported.
        """
        try:
            from pdfminer.high_level import extract_text  # noqa:F401
        except ImportError:
            message = (
                "pdfminer package not found, please install it with "
                "`pip install pdfminer.six`"
            )
            raise ValueError(message)
        super().__init__(file_path)

    def load(self) -> List[Document]:
        """Extract the text of the whole file and wrap it in one document."""
        from pdfminer.high_level import extract_text

        source = self.file_path
        content = extract_text(source)
        document = Document(page_content=content, metadata={"source": source})
        return [document]
class PyMuPDFLoader(BasePDFLoader):
    """Loader that uses PyMuPDF (``fitz``) to load PDF files, one document per page."""

    def __init__(self, file_path: str):
        """Verify that PyMuPDF is importable, then initialize with the path.

        Raises:
            ValueError: If the ``fitz`` module cannot be imported.
        """
        try:
            import fitz  # noqa:F401
        except ImportError:
            raise ValueError(
                "PyMuPDF package not found, please install it with "
                "`pip install pymupdf`"
            )
        super().__init__(file_path)

    def load(self, **kwargs: Optional[Any]) -> List[Document]:
        """Load the file and return one document per page.

        Args:
            **kwargs: Forwarded unchanged to each page's ``get_text`` call.

        Returns:
            A list of documents in page order. Each document's metadata holds
            ``file_path`` (the original URL if the file was downloaded,
            otherwise the local path), the one-based ``page_number``,
            ``total_pages``, and every document-level metadata entry whose
            value is a ``str`` or an ``int``.
        """
        import fitz

        doc = fitz.open(self.file_path)  # open document
        try:
            # Report the URL the caller supplied rather than the temp file name.
            file_path = self.file_path if self.web_path is None else self.web_path

            # Both values are the same for every page, so compute them once
            # instead of once per page.
            total_pages = len(doc)
            doc_metadata = {
                key: value
                for key, value in doc.metadata.items()
                if isinstance(value, (str, int))
            }

            return [
                Document(
                    # NOTE(review): the text is passed as UTF-8 bytes, exactly as
                    # before; presumably Document coerces it back to str - confirm.
                    page_content=page.get_text(**kwargs).encode("utf-8"),
                    metadata={
                        "file_path": file_path,
                        "page_number": page.number + 1,
                        "total_pages": total_pages,
                        # Document-level entries come last so they keep taking
                        # precedence on a key clash, as in the original merge.
                        **doc_metadata,
                    },
                )
                for page in doc
            ]
        finally:
            # Release the underlying file handle once all page text is extracted.
            doc.close()