Spaces:
Runtime error
Runtime error
| """Loader that uses unstructured to load files.""" | |
| from abc import ABC, abstractmethod | |
| from typing import IO, Any, List | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders.base import BaseLoader | |
def _release_numbers(version: str) -> List[int]:
    """Return the leading numeric release components of a version string."""
    numbers: List[int] = []
    # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
    # versions of unstructured like 0.4.17-dev1
    release = version.split("-")[0]
    for component in release.split("."):
        digits = ""
        for char in component.strip():
            if not "0" <= char <= "9":
                break
            digits += char
        if not digits:
            # A component such as "dev1" starts the pre-release suffix.
            break
        numbers.append(int(digits))
        if digits != component.strip():
            # A component such as "4rc1" ends the numeric release part.
            break
    return numbers


def satisfies_min_unstructured_version(min_version: str) -> bool:
    """Check whether the installed unstructured version meets a minimum version.

    Only the numeric release part of each version is compared; pre-release
    suffixes (``-dev1``, ``.dev1``, ``rc1``) are ignored, so ``0.5.4-dev1``
    counts as ``0.5.4``. Versions with different numbers of components are
    compared after padding the shorter one with zeros, so ``1.0`` satisfies
    a minimum of ``1.0.0``.

    Args:
        min_version: Dotted minimum version, e.g. ``"0.5.4"``.

    Returns:
        True if the installed version is greater than or equal to
        ``min_version``, False otherwise.

    Raises:
        ImportError: If ``unstructured`` cannot be imported.
    """
    from unstructured.__version__ import __version__ as __unstructured_version__

    required = _release_numbers(min_version)
    installed = _release_numbers(__unstructured_version__)

    # Pad to a common length; otherwise [1, 0] would compare below [1, 0, 0].
    width = max(len(required), len(installed))
    required += [0] * (width - len(required))
    installed += [0] * (width - len(installed))
    return installed >= required
class UnstructuredBaseLoader(BaseLoader, ABC):
    """Base class for loaders that use unstructured to load files.

    Subclasses provide the partitioned elements (``_get_elements``) and the
    base metadata (``_get_metadata``); ``load`` turns them into ``Document``
    objects according to ``mode``.
    """

    def __init__(self, mode: str = "single", **unstructured_kwargs: Any):
        """Validate ``mode`` and store the extra keyword arguments.

        Args:
            mode: ``"single"`` to join every element into one document, or
                ``"elements"`` to produce one document per element.
            **unstructured_kwargs: Extra keyword arguments kept on
                ``self.unstructured_kwargs`` for subclasses to use.

        Raises:
            ValueError: If ``unstructured`` is not installed, or if ``mode``
                is not one of the supported values.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Exception type kept as ValueError for callers that catch it;
            # the original ImportError is chained for easier debugging.
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        _valid_modes = {"single", "elements"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
            )
        self.mode = mode

        # NOTE(review): "strategy" is dropped for unstructured<0.5.4,
        # presumably because older releases do not accept it - confirm.
        if not satisfies_min_unstructured_version("0.5.4"):
            unstructured_kwargs.pop("strategy", None)

        self.unstructured_kwargs = unstructured_kwargs

    # Declared abstract so an incomplete subclass fails at instantiation
    # instead of failing later inside load() on an implicit None.
    @abstractmethod
    def _get_elements(self) -> List:
        """Get elements."""

    @abstractmethod
    def _get_metadata(self) -> dict:
        """Get metadata."""

    def load(self) -> List[Document]:
        """Load file.

        Returns:
            One ``Document`` per element in ``"elements"`` mode, or a single
            ``Document`` whose text is all elements joined by blank lines in
            ``"single"`` mode.

        Raises:
            ValueError: If ``self.mode`` is not a supported mode.
        """
        elements = self._get_elements()
        if self.mode == "elements":
            docs: List[Document] = []
            for element in elements:
                # Fetched per element because the dict is mutated below.
                metadata = self._get_metadata()
                # NOTE(MthwRobinson) - the attribute check is for backward compatibility
                # with unstructured<0.4.9. The metadata attributed was added in 0.4.9.
                if hasattr(element, "metadata"):
                    metadata.update(element.metadata.to_dict())
                if hasattr(element, "category"):
                    metadata["category"] = element.category
                docs.append(Document(page_content=str(element), metadata=metadata))
        elif self.mode == "single":
            metadata = self._get_metadata()
            text = "\n\n".join([str(el) for el in elements])
            docs = [Document(page_content=text, metadata=metadata)]
        else:
            # Only reachable if ``mode`` was changed after construction.
            raise ValueError(f"mode of {self.mode} not supported.")
        return docs
class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Loader that partitions a file, identified by its path, with unstructured."""

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """Store ``file_path`` and hand the remaining options to the base class."""
        self.file_path = file_path
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` and return its elements."""
        from unstructured.partition.auto import partition

        elements = partition(filename=self.file_path, **self.unstructured_kwargs)
        return elements

    def _get_metadata(self) -> dict:
        """Return a fresh metadata dict naming the file path as the source."""
        return dict(source=self.file_path)
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Loader that partitions an already-open file object with unstructured."""

    def __init__(self, file: IO, mode: str = "single", **unstructured_kwargs: Any):
        """Store the file object and hand the remaining options to the base class."""
        self.file = file
        super().__init__(mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition ``self.file`` and return its elements."""
        from unstructured.partition.auto import partition

        elements = partition(file=self.file, **self.unstructured_kwargs)
        return elements

    def _get_metadata(self) -> dict:
        """Return a fresh, empty metadata dict."""
        return dict()