Spaces:
Runtime error
Runtime error
"""Base class for all loaders that uses O365 Package""" | |
from __future__ import annotations | |
import logging | |
import os | |
import tempfile | |
from abc import abstractmethod | |
from enum import Enum | |
from pathlib import Path | |
from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union | |
from langchain_core.pydantic_v1 import ( | |
BaseModel, | |
BaseSettings, | |
Field, | |
FilePath, | |
SecretStr, | |
) | |
from langchain.document_loaders.base import BaseLoader | |
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader | |
from langchain.document_loaders.blob_loaders.schema import Blob | |
if TYPE_CHECKING: | |
from O365 import Account | |
from O365.drive import Drive, Folder | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default download chunk size in bytes (5 MiB); used as the default for
# ``O365BaseLoader.chunk_size``.
CHUNK_SIZE = 5 * 1024 * 1024
class _O365Settings(BaseSettings):
    """O365 application credentials, read from the environment or a ``.env`` file.

    Attributes:
        client_id: Application (client) id, read from ``O365_CLIENT_ID``.
        client_secret: Application secret, read from ``O365_CLIENT_SECRET``;
            held as a ``SecretStr``.
    """

    client_id: str = Field(..., env="O365_CLIENT_ID")
    client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")

    class Config:
        env_prefix = ""
        # Previously misspelled ``case_sentive``, which is not a recognised
        # settings option, so the intended value was never actually applied.
        case_sensitive = False
        env_file = ".env"
class _O365TokenStorage(BaseSettings):
    """Settings holding the location of the stored O365 token file.

    Attributes:
        token_path: Path of the token file; defaults to
            ``~/.credentials/o365_token.txt``.
    """

    token_path: FilePath = Path.home().joinpath(".credentials", "o365_token.txt")
class _FileType(str, Enum): | |
DOC = "doc" | |
DOCX = "docx" | |
PDF = "pdf" | |
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
    """Map each requested file type to its MIME type.

    Args:
        file_types: File types to look up; each item's ``value`` is the key.

    Returns:
        A dict from file-type value (``"doc"``, ``"docx"``, ``"pdf"``) to the
        matching MIME type string. Items with any other value are left out.
    """
    known_mime_types = {
        "doc": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",  # noqa: E501
        "pdf": "application/pdf",
    }
    return {
        entry.value: known_mime_types[entry.value]
        for entry in file_types
        if entry.value in known_mime_types
    }
class O365BaseLoader(BaseLoader, BaseModel):
    """Base class for all loaders that use the O365 package.

    Subclasses supply ``_file_types`` and ``_scopes``. This class provides
    authentication (``_auth``) and helpers that download matching files into a
    temporary directory and yield them as blobs.
    """

    settings: _O365Settings = Field(default_factory=_O365Settings)
    """Settings for the Office365 API client."""
    auth_with_token: bool = False
    """Whether to authenticate with a token or not. Defaults to False."""
    chunk_size: Union[int, str] = CHUNK_SIZE
    """Number of bytes to retrieve from each api call to the server. int or 'auto'."""

    # The three members below are read as attributes throughout this class
    # (``self._file_types``, ``self._fetch_mime_types``, ``self._scopes``), so
    # they must be properties rather than plain methods.
    @property
    @abstractmethod
    def _file_types(self) -> Sequence[_FileType]:
        """Return supported file types."""

    @property
    def _fetch_mime_types(self) -> Dict[str, str]:
        """Return a dict of supported file types to corresponding mime types."""
        return fetch_mime_types(self._file_types)

    @property
    @abstractmethod
    def _scopes(self) -> List[str]:
        """Return required scopes."""

    def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
        """Lazily load all files of a supported MIME type from a folder.

        Args:
            folder: The Folder instance whose items are to be loaded.

        Yields:
            Blob instances for the files downloaded from the folder.
        """
        # Build the lookup once instead of on every loop iteration.
        supported_mime_types = set(self._fetch_mime_types.values())
        items = folder.get_items()
        with tempfile.TemporaryDirectory() as temp_dir:
            for item in items:
                if item.is_file and item.mime_type in supported_mime_types:
                    item.download(to_path=temp_dir, chunk_size=self.chunk_size)
            # Yield inside the ``with`` block: the temporary directory is
            # removed as soon as this generator is exhausted or closed.
            # NOTE(review): if blobs reference the files by path, callers must
            # read each blob before the generator finishes - confirm.
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _load_from_object_ids(
        self, drive: Drive, object_ids: List[str]
    ) -> Iterable[Blob]:
        """Lazily load files specified by their object_ids from a drive.

        Ids for which the drive returns nothing are logged and skipped.

        Args:
            drive: The Drive instance from which the files are to be loaded.
            object_ids: A list of object_id strings, each identifying one item
                in the drive.

        Yields:
            Blob instances for the files downloaded from the drive.
        """
        # Build the lookup once instead of on every loop iteration.
        supported_mime_types = set(self._fetch_mime_types.values())
        with tempfile.TemporaryDirectory() as temp_dir:
            for object_id in object_ids:
                item = drive.get_item(object_id)
                if not item:
                    # Use the module logger (not the root logger); the message
                    # previously ran "with" and "object_id" together.
                    logger.warning(
                        "There isn't a file with object_id %s in drive %s.",
                        object_id,
                        drive,
                    )
                    continue
                if item.is_file and item.mime_type in supported_mime_types:
                    item.download(to_path=temp_dir, chunk_size=self.chunk_size)
            # Yield inside the ``with`` block: the temporary directory is
            # removed as soon as this generator is exhausted or closed.
            loader = FileSystemBlobLoader(path=temp_dir)
            yield from loader.yield_blobs()

    def _auth(self) -> Account:
        """Build the O365 ``Account`` and authenticate it when required.

        Returns:
            The Account object for the configured client id and secret.

        Raises:
            ImportError: If the ``O365`` package is not installed.
        """
        try:
            from O365 import Account, FileSystemTokenBackend
        except ImportError as err:
            raise ImportError(
                "O365 package not found, please install it with `pip install o365`"
            ) from err

        if self.auth_with_token:
            # Use the token file named by the token-storage settings.
            token_path = _O365TokenStorage().token_path
            token_backend = FileSystemTokenBackend(
                token_path=token_path.parent, token_filename=token_path.name
            )
        else:
            token_backend = FileSystemTokenBackend(
                token_path=Path.home() / ".credentials"
            )

        # Both modes build the account identically; only the backend differs.
        account = Account(
            credentials=(
                self.settings.client_id,
                self.settings.client_secret.get_secret_value(),
            ),
            scopes=self._scopes,
            token_backend=token_backend,
            raise_http_errors=False,
        )
        if not self.auth_with_token:
            # make the auth
            account.authenticate()
        return account