File size: 1,913 Bytes
129cd69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import logging
import os
import tempfile
from typing import Any, Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader

logger = logging.getLogger(__name__)


class BaiduBOSFileLoader(BaseLoader):
    """Load from `Baidu Cloud BOS` file.

    The object identified by ``bucket``/``key`` is downloaded into a temporary
    directory and handed to `UnstructuredFileLoader` for parsing.
    """

    def __init__(self, conf: Any, bucket: str, key: str):
        """Initialize with BOS config, bucket and key name.
        :param conf(BceClientConfiguration): BOS config.
        :param bucket(str): BOS bucket.
        :param key(str): BOS file key.
        """
        self.conf = conf
        self.bucket = bucket
        self.key = key

    def load(self) -> List[Document]:
        """Load the BOS object and return its documents as a list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Download the BOS object and return an iterator over its documents.

        The download and the parsing both happen inside this call; only the
        returned value is an iterator.

        :return: iterator over the documents produced by
            `UnstructuredFileLoader`; if parsing raises, the error is logged
            and an iterator over a single empty `Document` is returned.
        :raises ImportError: if the `baidubce` SDK is not installed.
        """
        try:
            # Imported here so the SDK is only required when a BOS file is
            # actually loaded.
            from baidubce.services.bos.bos_client import BosClient
        except ImportError as exc:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "Please using `pip install bce-python-sdk`"
                " before import bos related package."
            ) from exc

        # Initialize BOS Client
        client = BosClient(self.conf)
        with tempfile.TemporaryDirectory() as temp_dir:
            # Plain string joining is deliberate: os.path.join would drop
            # temp_dir if the key started with "/".
            file_path = f"{temp_dir}/{self.bucket}/{self.key}"
            # Keys may contain "/" separators, so create the parent folders.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            # Download the file to a destination
            logger.debug("get object key %s to file %s", self.key, file_path)
            client.get_object_to_file(self.bucket, self.key, file_path)
            try:
                # Parsing runs here, before the temporary directory (and the
                # downloaded file in it) is removed on leaving the `with`.
                documents = UnstructuredFileLoader(file_path).load()
            except Exception as ex:
                # Best-effort fallback kept from the original behaviour: a
                # parsing failure yields one empty document instead of raising.
                # exc_info=True records the traceback alongside the message.
                logger.error("load document error = %s", ex, exc_info=True)
                return iter([Document(page_content="")])
            return iter(documents)