Spaces:
Runtime error
Runtime error
| """Base class for all loaders that use the O365 package.""" | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| import tempfile | |
| from abc import abstractmethod | |
| from enum import Enum | |
| from pathlib import Path | |
| from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union | |
| from langchain_core.pydantic_v1 import ( | |
| BaseModel, | |
| BaseSettings, | |
| Field, | |
| FilePath, | |
| SecretStr, | |
| ) | |
| from langchain.document_loaders.base import BaseLoader | |
| from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader | |
| from langchain.document_loaders.blob_loaders.schema import Blob | |
| if TYPE_CHECKING: | |
| from O365 import Account | |
| from O365.drive import Drive, Folder | |
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Default number of bytes requested per download call: 5 MiB.
CHUNK_SIZE = 5 * 1024 * 1024
class _O365Settings(BaseSettings):
    """Credentials for the Office 365 API client.

    Both fields are required. They are declared with the environment variable
    names ``O365_CLIENT_ID`` and ``O365_CLIENT_SECRET``, and ``Config`` points
    at a local ``.env`` file as an additional source.
    """

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Previously misspelled as ``case_sentive``, which is not a recognised
        # option, so the intended setting was never actually applied.
        case_sensitive = False
        env_file = ".env"
class _O365TokenStorage(BaseSettings):
    """Settings that hold the location of the stored O365 token file."""

    # Defaults to ``<home>/.credentials/o365_token.txt``.
    token_path: FilePath = Path.home().joinpath(".credentials", "o365_token.txt")
class _FileType(str, Enum):
    """File types the O365 loaders know how to handle.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    DOC = "doc"
    DOCX = "docx"
    PDF = "pdf"
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Map each requested file type to its MIME type.

    Args:
        file_types: The file types to look up. Each entry's ``value`` is used
            as the key of the returned mapping.

    Returns:
        A dict from file-type value to MIME type, in the order the file types
        were given. File types without a known MIME type are left out.
    """
    known_mime_types = {
        "doc": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # noqa: E501
        "pdf": "application/pdf",
    }
    return {
        requested.value: known_mime_types[requested.value]
        for requested in file_types
        if requested.value in known_mime_types
    }
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that use the O365 package.

    Subclasses supply the file types they support (``_file_types``) and the
    scopes they require (``_scopes``). This class provides authentication and
    helpers that download matching files into a temporary directory and yield
    them as blobs.
    """

    settings: _O365Settings = Field(default_factory=_O365Settings)
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""

    # The three members below are read as attributes throughout this class
    # (``self._file_types``, ``self._fetch_mime_types``, ``self._scopes``), so
    # they must be properties rather than plain methods.
    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict of supported file types to corresponding mime types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files of a supported MIME type from a folder.

        Matching files are downloaded into a temporary directory, which is
        then handed to ``FileSystemBlobLoader``.

        Args:
            folder: The Folder instance whose items are to be loaded.

        Yields:
            Blob instances for the files downloaded from the folder.
        """
        # Build the lookup once instead of rebuilding a list for every item.
        supported_mime_types = set(self._fetch_mime_types.values())
        with tempfile.TemporaryDirectory() as temp_dir:
            for item in folder.get_items():
                if item.is_file and item.mime_type in supported_mime_types:
                    item.download(to_path=temp_dir, chunk_size=self.chunk_size)
            # NOTE(review): the temporary directory is removed once this
            # generator finishes; blobs presumably reference files inside it,
            # so confirm callers consume each blob while iterating.
            yield from FileSystemBlobLoader(path=temp_dir).yield_blobs()

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files specified by their object_ids from a drive.

        Matching files are downloaded into a temporary directory, which is
        then handed to ``FileSystemBlobLoader``. Object ids for which the
        drive returns nothing are logged as a warning and skipped.

        Args:
            drive: The Drive instance from which the files are to be loaded.
            object_ids: A list of object_id strings, each identifying an item
                in the drive.

        Yields:
            Blob instances for the files downloaded from the drive.
        """
        # Build the lookup once instead of rebuilding a list for every item.
        supported_mime_types = set(self._fetch_mime_types.values())
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                item = drive.get_item(object_id)
                if not item:
                    # Use the module logger (not the root logger); the message
                    # previously lacked a space before "object_id".
                    logger.warning(
                        "There isn't a file with object_id %s in drive %s.",
                        object_id,
                        drive,
                    )
                    continue
                if item.is_file and item.mime_type in supported_mime_types:
                    item.download(to_path=temp_dir, chunk_size=self.chunk_size)
            # NOTE(review): the temporary directory is removed once this
            # generator finishes; blobs presumably reference files inside it,
            # so confirm callers consume each blob while iterating.
            yield from FileSystemBlobLoader(path=temp_dir).yield_blobs()

    def _auth(self) -> Account:
        """Authenticate the O365 API client.

        When ``auth_with_token`` is set, the token backend points at the file
        given by ``_O365TokenStorage().token_path``; otherwise it points at the
        ``.credentials`` directory under the user's home directory.

        Returns:
            The Account object after ``authenticate()`` has been called on it.

        Raises:
            ImportError: If the ``O365`` package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError as exc:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from exc

        # Only the token backend differs between the two authentication modes.
        if self.auth_with_token:
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )

        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            # Forwarded to O365 as an extra keyword argument.
            **{"raise_http_errors": False},
        )
        account.authenticate()
        return account