flash / llama /data_loader.py
NickNYU's picture
upload github well compiled files (#1)
b5d3f34
raw
history blame
1.91 kB
import os
import pickle
from abc import abstractmethod, ABC
from typing import Optional, Sequence, List
from llama_hub.github_repo import GithubRepositoryReader, GithubClient
from llama_index import download_loader
from llama_index.readers.schema.base import Document
from core.lifecycle import Lifecycle
class WikiLoader(ABC):
    """Abstract interface for objects that can load a list of documents."""

    @abstractmethod
    def load(self) -> List[Document]:
        """Return the loaded documents; concrete subclasses must override this."""
class GithubLoader(WikiLoader, Lifecycle):
    """Load Markdown files from a GitHub repository, with a local pickle cache.

    The first successful download is pickled to ``docs/docs.pkl`` (relative to
    the current working directory); later calls to :meth:`load` return the
    cached documents without contacting GitHub.
    """

    def __init__(
        self,
        github_owner: Optional[str] = None,
        repo: Optional[str] = None,
        dirs: Optional[Sequence[str]] = None,
        branch: str = "master",
    ):
        """Configure which repository, directories and branch to read.

        Args:
            github_owner: Repository owner. Falls back to the ``GITHUB_OWNER``
                environment variable when ``None``.
            repo: Repository name. Falls back to the ``GITHUB_REPO``
                environment variable when ``None``.
            dirs: Directories to include. Defaults to ``[".", "doc"]``.
            branch: Branch to download from. Defaults to ``"master"``.

        Raises:
            KeyError: If an argument is omitted and the corresponding
                environment variable is not set.
        """
        super().__init__()
        self.owner = (
            github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
        )
        self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
        self.dirs = dirs if dirs is not None else [".", "doc"]
        self.branch = branch

    def load(self) -> List[Document]:
        """Return the repository's ``.md`` documents, using the cache if present.

        Returns:
            The cached documents when ``docs/docs.pkl`` exists and holds a
            non-``None`` value; otherwise the freshly downloaded documents,
            which are also written to the cache.
        """
        cache_path = "docs/docs.pkl"

        # NOTE(review): the cache file is not keyed by owner/repo/dirs/branch,
        # so loaders configured differently share (and may return) the same
        # cached documents.
        docs = None
        if os.path.exists(cache_path):
            # NOTE(security): pickle.load can execute arbitrary code; the cache
            # file must only ever come from a trusted source.
            with open(cache_path, "rb") as f:
                docs = pickle.load(f)
        if docs is not None:
            return docs

        # Cache miss: download from GitHub and save the result locally.
        # The return value of download_loader is unused; the call is kept only
        # for whatever side effects it has.
        # NOTE(review): likely redundant, since GithubRepositoryReader is
        # imported directly at module level - confirm before removing.
        download_loader("GithubRepositoryReader")
        github_client = GithubClient(os.getenv("GITHUB_TOKEN"))
        loader = GithubRepositoryReader(
            github_client,
            owner=self.owner,
            repo=self.repo,
            filter_directories=(self.dirs, GithubRepositoryReader.FilterType.INCLUDE),
            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
            verbose=True,
            concurrent_requests=10,
        )
        docs = loader.load_data(branch=self.branch)

        # Create the cache directory first; without it the open() below raises
        # FileNotFoundError on a fresh checkout and the download is lost.
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(docs, f)
        return docs