Spaces:

X-Pipe
/

flash

Sleeping

File size: 1,911 Bytes

b5d3f34

import os
import pickle
from abc import abstractmethod, ABC
from typing import Optional, Sequence, List

from llama_hub.github_repo import GithubRepositoryReader, GithubClient
from llama_index import download_loader
from llama_index.readers.schema.base import Document

from core.lifecycle import Lifecycle


class WikiLoader(ABC):
    """Abstract base for document loaders.

    Concrete subclasses must override :meth:`load`; the class itself cannot
    be instantiated.
    """

    @abstractmethod
    def load(self) -> List[Document]:
        """Return the documents produced by this loader.

        Returns:
            A list of ``Document`` objects.
        """


class GithubLoader(WikiLoader, Lifecycle):
    """Load Markdown documents from a GitHub repository, with a pickle cache.

    ``load()`` first looks for a pickled document list at ``cache_path``. If
    one is found it is returned as-is; otherwise the ``.md`` files under the
    configured directories are fetched from GitHub and the result is pickled
    to ``cache_path`` for subsequent calls.

    NOTE: the cache file is not keyed by owner/repo/dirs/branch, so a cache
    written for one configuration is returned unchanged for any other. Remove
    the file, or pass a different ``cache_path``, to force a fresh download.
    """

    def __init__(
        self,
        github_owner: Optional[str] = None,
        repo: Optional[str] = None,
        dirs: Optional[Sequence[str]] = None,
        branch: str = "master",
        cache_path: str = "docs/docs.pkl",
    ):
        """Configure which repository to read and where to cache the result.

        Args:
            github_owner: Repository owner. Falls back to the ``GITHUB_OWNER``
                environment variable when ``None``.
            repo: Repository name. Falls back to the ``GITHUB_REPO``
                environment variable when ``None``.
            dirs: Directories to include. Defaults to ``[".", "doc"]`` when
                ``None``.
            branch: Branch to read from.
            cache_path: Path of the pickle file used as the local cache.

        Raises:
            KeyError: If ``github_owner`` or ``repo`` is ``None`` and the
                corresponding environment variable is not set.
        """
        super().__init__()
        self.owner = (
            github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
        )
        self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
        self.dirs = dirs if dirs is not None else [".", "doc"]
        self.branch = branch
        self.cache_path = cache_path

    def load(self) -> List[Document]:
        """Return the repository's documents, preferring the local cache.

        Returns:
            The cached document list if the cache file exists and holds a
            non-``None`` value; otherwise the documents freshly downloaded
            from GitHub (which are also written to the cache).
        """
        docs = self._read_cache()
        if docs is not None:
            return docs

        # Cache miss: download from GitHub and save the result locally.
        docs = self._download()
        self._write_cache(docs)
        return docs

    def _read_cache(self) -> Optional[List[Document]]:
        """Return the pickled documents at ``cache_path``, or None if absent."""
        try:
            cache_file = open(self.cache_path, "rb")
        except FileNotFoundError:
            return None
        with cache_file:
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file. Only point cache_path at files this process wrote itself.
            return pickle.load(cache_file)

    def _download(self) -> List[Document]:
        """Fetch the ``.md`` files under ``self.dirs`` from GitHub."""
        # The return value is unused (the reader class is imported directly at
        # module top); the call is kept for whatever setup it performs, but is
        # only made when a download is actually needed so that a cache hit
        # does not depend on it.
        download_loader("GithubRepositoryReader")

        github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
        reader = GithubRepositoryReader(
            github_client,
            owner=self.owner,
            repo=self.repo,
            filter_directories=(self.dirs, GithubRepositoryReader.FilterType.INCLUDE),
            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
            verbose=True,
            concurrent_requests=10,
        )
        return reader.load_data(branch=self.branch)

    def _write_cache(self, docs: List[Document]) -> None:
        """Pickle ``docs`` to ``cache_path``, creating its directory if needed."""
        cache_dir = os.path.dirname(self.cache_path)
        if cache_dir:
            # Without this, a missing directory would discard a completed
            # download with FileNotFoundError.
            os.makedirs(cache_dir, exist_ok=True)

        # Write to a sibling temp file and rename, so an interrupted dump
        # never leaves a truncated pickle at cache_path.
        tmp_path = self.cache_path + ".tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(docs, f)
        os.replace(tmp_path, self.cache_path)