Spaces:
Runtime error
Runtime error
File size: 6,953 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
"""Base class for all loaders that uses O365 Package"""
from __future__ import annotations
import logging
import os
import tempfile
from abc import abstractmethod
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Dict, Iterable, List, Sequence, Union
from langchain_core.pydantic_v1 import (
BaseModel,
BaseSettings,
Field,
FilePath,
SecretStr,
)
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.blob_loaders.file_system import FileSystemBlobLoader
from langchain.document_loaders.blob_loaders.schema import Blob
if TYPE_CHECKING:
from O365 import Account
from O365.drive import Drive, Folder
logger = logging.getLogger(__name__)
CHUNK_SIZE = 1024 * 1024 * 5
class _O365Settings(BaseSettings):
client_id: str = Field(..., env="O365_CLIENT_ID")
client_secret: SecretStr = Field(..., env="O365_CLIENT_SECRET")
class Config:
env_prefix = ""
case_sentive = False
env_file = ".env"
class _O365TokenStorage(BaseSettings):
token_path: FilePath = Path.home() / ".credentials" / "o365_token.txt"
class _FileType(str, Enum):
DOC = "doc"
DOCX = "docx"
PDF = "pdf"
def fetch_mime_types(file_types: Sequence[_FileType]) -> Dict[str, str]:
"""Fetch the mime types for the specified file types."""
mime_types_mapping = {}
for file_type in file_types:
if file_type.value == "doc":
mime_types_mapping[file_type.value] = "application/msword"
elif file_type.value == "docx":
mime_types_mapping[
file_type.value
] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" # noqa: E501
elif file_type.value == "pdf":
mime_types_mapping[file_type.value] = "application/pdf"
return mime_types_mapping
class O365BaseLoader(BaseLoader, BaseModel):
"""Base class for all loaders that uses O365 Package"""
settings: _O365Settings = Field(default_factory=_O365Settings)
"""Settings for the Office365 API client."""
auth_with_token: bool = False
"""Whether to authenticate with a token or not. Defaults to False."""
chunk_size: Union[int, str] = CHUNK_SIZE
"""Number of bytes to retrieve from each api call to the server. int or 'auto'."""
@property
@abstractmethod
def _file_types(self) -> Sequence[_FileType]:
"""Return supported file types."""
@property
def _fetch_mime_types(self) -> Dict[str, str]:
"""Return a dict of supported file types to corresponding mime types."""
return fetch_mime_types(self._file_types)
@property
@abstractmethod
def _scopes(self) -> List[str]:
"""Return required scopes."""
def _load_from_folder(self, folder: Folder) -> Iterable[Blob]:
"""Lazily load all files from a specified folder of the configured MIME type.
Args:
folder: The Folder instance from which the files are to be loaded. This
Folder instance should represent a directory in a file system where the
files are stored.
Yields:
An iterator that yields Blob instances, which are binary representations of
the files loaded from the folder.
"""
file_mime_types = self._fetch_mime_types
items = folder.get_items()
with tempfile.TemporaryDirectory() as temp_dir:
os.makedirs(os.path.dirname(temp_dir), exist_ok=True)
for file in items:
if file.is_file:
if file.mime_type in list(file_mime_types.values()):
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
loader = FileSystemBlobLoader(path=temp_dir)
yield from loader.yield_blobs()
def _load_from_object_ids(
self, drive: Drive, object_ids: List[str]
) -> Iterable[Blob]:
"""Lazily load files specified by their object_ids from a drive.
Load files into the system as binary large objects (Blobs) and return Iterable.
Args:
drive: The Drive instance from which the files are to be loaded. This Drive
instance should represent a cloud storage service or similar storage
system where the files are stored.
object_ids: A list of object_id strings. Each object_id represents a unique
identifier for a file in the drive.
Yields:
An iterator that yields Blob instances, which are binary representations of
the files loaded from the drive using the specified object_ids.
"""
file_mime_types = self._fetch_mime_types
with tempfile.TemporaryDirectory() as temp_dir:
for object_id in object_ids:
file = drive.get_item(object_id)
if not file:
logging.warning(
"There isn't a file with"
f"object_id {object_id} in drive {drive}."
)
continue
if file.is_file:
if file.mime_type in list(file_mime_types.values()):
file.download(to_path=temp_dir, chunk_size=self.chunk_size)
loader = FileSystemBlobLoader(path=temp_dir)
yield from loader.yield_blobs()
def _auth(self) -> Account:
"""Authenticates the OneDrive API client
Returns:
The authenticated Account object.
"""
try:
from O365 import Account, FileSystemTokenBackend
except ImportError:
raise ImportError(
"O365 package not found, please install it with `pip install o365`"
)
if self.auth_with_token:
token_storage = _O365TokenStorage()
token_path = token_storage.token_path
token_backend = FileSystemTokenBackend(
token_path=token_path.parent, token_filename=token_path.name
)
account = Account(
credentials=(
self.settings.client_id,
self.settings.client_secret.get_secret_value(),
),
scopes=self._scopes,
token_backend=token_backend,
**{"raise_http_errors": False},
)
else:
token_backend = FileSystemTokenBackend(
token_path=Path.home() / ".credentials"
)
account = Account(
credentials=(
self.settings.client_id,
self.settings.client_secret.get_secret_value(),
),
scopes=self._scopes,
token_backend=token_backend,
**{"raise_http_errors": False},
)
# make the auth
account.authenticate()
return account
|