"""Loader that uses unstructured to load files.""" from abc import ABC, abstractmethod from typing import IO, Any, List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader def satisfies_min_unstructured_version(min_version: str) -> bool: """Checks to see if the installed unstructured version exceeds the minimum version for the feature in question.""" from unstructured.__version__ import __version__ as __unstructured_version__ min_version_tuple = tuple([int(x) for x in min_version.split(".")]) # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release # versions of unstructured like 0.4.17-dev1 _unstructured_version = __unstructured_version__.split("-")[0] unstructured_version_tuple = tuple( [int(x) for x in _unstructured_version.split(".")] ) return unstructured_version_tuple >= min_version_tuple class UnstructuredBaseLoader(BaseLoader, ABC): """Loader that uses unstructured to load files.""" def __init__(self, mode: str = "single", **unstructured_kwargs: Any): """Initialize with file path.""" try: import unstructured # noqa:F401 except ImportError: raise ValueError( "unstructured package not found, please install it with " "`pip install unstructured`" ) _valid_modes = {"single", "elements"} if mode not in _valid_modes: raise ValueError( f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" ) self.mode = mode if not satisfies_min_unstructured_version("0.5.4"): if "strategy" in unstructured_kwargs: unstructured_kwargs.pop("strategy") self.unstructured_kwargs = unstructured_kwargs @abstractmethod def _get_elements(self) -> List: """Get elements.""" @abstractmethod def _get_metadata(self) -> dict: """Get metadata.""" def load(self) -> List[Document]: """Load file.""" elements = self._get_elements() if self.mode == "elements": docs: List[Document] = list() for element in elements: metadata = self._get_metadata() # NOTE(MthwRobinson) - the attribute check is for backward compatibility # with unstructured<0.4.9. The metadata attributed was added in 0.4.9. if hasattr(element, "metadata"): metadata.update(element.metadata.to_dict()) if hasattr(element, "category"): metadata["category"] = element.category docs.append(Document(page_content=str(element), metadata=metadata)) elif self.mode == "single": metadata = self._get_metadata() text = "\n\n".join([str(el) for el in elements]) docs = [Document(page_content=text, metadata=metadata)] else: raise ValueError(f"mode of {self.mode} not supported.") return docs class UnstructuredFileLoader(UnstructuredBaseLoader): """Loader that uses unstructured to load files.""" def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any ): """Initialize with file path.""" self.file_path = file_path super().__init__(mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: from unstructured.partition.auto import partition return partition(filename=self.file_path, **self.unstructured_kwargs) def _get_metadata(self) -> dict: return {"source": self.file_path} class UnstructuredFileIOLoader(UnstructuredBaseLoader): """Loader that uses unstructured to load file IO objects.""" def __init__(self, file: IO, mode: str = "single", **unstructured_kwargs: Any): """Initialize with file path.""" self.file = file super().__init__(mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: from unstructured.partition.auto import partition return partition(file=self.file, **self.unstructured_kwargs) def _get_metadata(self) -> dict: return {}