Chen committed
Commit: 6c20719
Parent(s): 175a385
.gitignore ADDED
@@ -0,0 +1,149 @@
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
.pre-commit-config.yaml ADDED
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.0.243
+    hooks:
+      - id: ruff
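Of note, this hook runs ruff over staged files at commit time. A hypothetical snippet of the kind of issue it blocks (ruff rule F401, unused import; not part of this commit):

```python
# Hypothetical example: ruff reports `os` as F401 ("imported but unused"),
# so the pre-commit hook fails until the import is removed.
import os


def greet() -> str:
    return "hello"
```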
Makefile ADDED
@@ -0,0 +1,14 @@
+.PHONY: format lint
+
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+format:
+	black .
+
+lint:
+	mypy .
+	black . --check
+	ruff check .
+
+test:
+	pytest tests
app.py CHANGED
@@ -15,11 +15,14 @@ import sys
 
 
 load_dotenv()
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # logging.DEBUG for more verbose output
+logging.basicConfig(
+    stream=sys.stdout, level=logging.DEBUG
+)  # logging.DEBUG for more verbose output
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 
+
 def main():
-    documents = SimpleDirectoryReader('./data').load_data()
+    documents = SimpleDirectoryReader("./data").load_data()
 
     # index = VectorStoreIndex.from_documents(documents)
 
@@ -28,15 +31,19 @@ def main():
     # index = VectorStoreIndex(nodes)
 
     # define embedding
-    embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size = 1))
+    embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size=1))
     # define LLM
-    llm_predictor = LLMPredictor(llm=AzureOpenAI(
-        engine="text-davinci-003",
-        model_name="text-davinci-003",
-    ))
+    llm_predictor = LLMPredictor(
+        llm=AzureOpenAI(
+            engine="text-davinci-003",
+            model_name="text-davinci-003",
+        )
+    )
 
     # configure service context
-    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embedding)
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, embed_model=embedding
+    )
 
     # build index
     index = VectorStoreIndex.from_documents(
@@ -46,19 +53,18 @@ def main():
 
     index.storage_context.persist(persist_dir="./dataset")
     storage_context = StorageContext.from_defaults(persist_dir="./dataset")
-    index = load_index_from_storage(storage_context=storage_context, service_context=service_context)
+    index = load_index_from_storage(
+        storage_context=storage_context, service_context=service_context
+    )
 
     # index.vector_store.persist("./dataset")
     # query with embed_model specified
     query_engine = index.as_query_engine(
-        retriever_mode="embedding",
-        verbose=True,
-        service_context=service_context
+        retriever_mode="embedding", verbose=True, service_context=service_context
     )
     response = query_engine.query("请帮忙推荐一杯咖啡给我,我喜欢咖啡因")
     print(response)
 
 
-
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main()
core/lifecycle.py CHANGED
@@ -6,7 +6,6 @@ from core import logger_factory
 
 
 class Initializable(ABC):
-
     @abstractmethod
     def initialize(self) -> None:
        pass
@@ -19,21 +18,18 @@ class Startable(ABC):
 
 
 class Stoppable(ABC):
-
     @abstractmethod
     def stop(self) -> None:
         pass
 
 
 class Disposable(ABC):
-
     @abstractmethod
     def dispose(self) -> None:
         pass
 
 
 class LifecycleAware(ABC):
-
     def __init__(self, state):
         self.state = state
 
@@ -42,7 +38,6 @@ class LifecycleAware(ABC):
 
 
 class Lifecycle(Initializable, Startable, Stoppable, Disposable, LifecycleAware, ABC):
-
     def __init__(self):
         self.logger = logger_factory.get_logger(self.__class__.__name__)
         self.lifecycle_state = LifecycleState(lifecycle=self)
@@ -108,27 +103,27 @@ class LifecyclePhase(enum.Enum):
 
 
 class LifecycleController(ABC):
-
     def can_initialize(self, phase: [LifecyclePhase]) -> bool:
         return phase is None or phase == LifecyclePhase.DISPOSED
 
     def can_start(self, phase: [LifecyclePhase]) -> bool:
         return phase is not None and (
-            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED)
+            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
+        )
 
     def can_stop(self, phase: [LifecyclePhase]) -> bool:
         return phase is not None and phase == LifecyclePhase.STARTED
 
     def can_dispose(self, phase: [LifecyclePhase]) -> bool:
         return phase is not None and (
-            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED)
+            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
+        )
 
 
 LS = TypeVar("LS", bound=Lifecycle)
 
 
 class LifecycleState(LifecycleController, ABC):
-
     def __init__(self, lifecycle: [LS]):
         self.phase = None
         self.prev_phase = None
@@ -164,7 +159,11 @@ class LifecycleState(LifecycleController, ABC):
 
     def set_phase(self, phase: [LifecyclePhase]) -> None:
         prev = "None" if self.phase is None else self.phase.name
-        self.logger.info("[setPhaseName][{}]{} --> {}".format(self.lifecycle.__class__.__name__, prev, phase.name))
+        self.logger.info(
+            "[setPhaseName][{}]{} --> {}".format(
+                self.lifecycle.__class__.__name__, prev, phase.name
+            )
+        )
         self.phase = phase
 
     def rollback(self, err: [Exception]) -> None:
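Aside: the `can_*` guards above encode a small state machine. A minimal, self-contained sketch of those transition rules (the enum member names mirror the ones referenced in the diff; everything else is illustrative, not part of the commit):

```python
# Sketch of the phase-transition rules in LifecycleController.
# Enum member names mirror the diff; the values are arbitrary.
import enum
from typing import Optional


class LifecyclePhase(enum.Enum):
    INITIALIZED = enum.auto()
    STARTED = enum.auto()
    STOPPED = enum.auto()
    DISPOSED = enum.auto()


def can_initialize(phase: Optional[LifecyclePhase]) -> bool:
    # a fresh object (phase None) or a disposed one may (re)initialize
    return phase is None or phase == LifecyclePhase.DISPOSED


def can_start(phase: Optional[LifecyclePhase]) -> bool:
    # starting is legal only from INITIALIZED or STOPPED
    return phase in (LifecyclePhase.INITIALIZED, LifecyclePhase.STOPPED)


# Happy path: None -> INITIALIZED -> STARTED -> STOPPED -> DISPOSED
assert can_initialize(None)
assert can_start(LifecyclePhase.INITIALIZED)
assert not can_start(LifecyclePhase.STARTED)
```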
core/logger_factory.py CHANGED
@@ -3,16 +3,14 @@ from logging import handlers
 from typing import Optional
 
 
-def get_logger(name: [str],
-               file_name: Optional[str] = None) -> logging.Logger:
+def get_logger(name: [str], file_name: Optional[str] = None) -> logging.Logger:
     logger = logging.getLogger(name)
     if file_name is None:
-        file_name = 'app-default.log'
-    handler = handlers.TimedRotatingFileHandler(filename=file_name,
-                                                when='d',
-                                                backupCount=21,
-                                                encoding='UTF-8')
-    formatter = logging.Formatter('[%(asctime)s][%(levelname)s][%(message)s]')
+        file_name = "app-default.log"
+    handler = handlers.TimedRotatingFileHandler(
+        filename=file_name, when="d", backupCount=21, encoding="UTF-8"
+    )
+    formatter = logging.Formatter("[%(asctime)s][%(levelname)s][%(message)s]")
     handler.setFormatter(formatter)
     logger.addHandler(handler)
     logger.setLevel(logging.INFO)
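For reference, a hypothetical caller of the reformatted `get_logger` (mirroring how `core/lifecycle.py` consumes it; the logger name and message here are illustrative):

```python
# Usage sketch: with file_name omitted, the code above falls back to
# "app-default.log", rotating daily and keeping 21 backups.
from core import logger_factory

logger = logger_factory.get_logger("demo")
logger.info("service starting")  # emitted as [asctime][INFO][service starting]
```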
core/test_lifecycle.py CHANGED
@@ -7,7 +7,6 @@ logging.basicConfig()
 
 
 class SubLifecycle(Lifecycle):
-
     def __init__(self):
         super().__init__()
         self.init_counter = 0
github_retriever.py CHANGED
@@ -8,17 +8,23 @@ from llama_index import StorageContext, load_index_from_storage
 from dotenv import load_dotenv
 import os
 import pickle
+
+
 def main():
     # define embedding
     embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size=1))
     # define LLM
-    llm_predictor = LLMPredictor(llm=AzureOpenAI(
-        engine="text-davinci-003",
-        model_name="text-davinci-003",
-    ))
+    llm_predictor = LLMPredictor(
+        llm=AzureOpenAI(
+            engine="text-davinci-003",
+            model_name="text-davinci-003",
+        )
+    )
 
     # configure service context
-    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embedding)
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, embed_model=embedding
+    )
     download_loader("GithubRepositoryReader")
     docs = None
     if os.path.exists("docs/docs.pkl"):
@@ -31,7 +37,10 @@ def main():
         github_client,
         owner="ctripcorp",
         repo="x-pipe",
-        filter_directories=([".", "doc"], GithubRepositoryReader.FilterType.INCLUDE),
+        filter_directories=(
+            [".", "doc"],
+            GithubRepositoryReader.FilterType.INCLUDE,
+        ),
         filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
         verbose=True,
         concurrent_requests=10,
@@ -49,7 +58,6 @@ def main():
     print(response)
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     load_dotenv()
-    main()
+    main()
langchain/manager.py CHANGED
@@ -22,7 +22,6 @@ class LangChainManager(Lifecycle, ABC):
 
 
 class LangChainAzureManager(LangChainManager):
-
     def __init__(self):
         super().__init__()
 
llama/context.py CHANGED
@@ -5,7 +5,6 @@ from langchain.manager import LangChainManager
 
 
 class ServiceContextManager(Lifecycle):
-
     def __init__(self, manager: [LangChainManager]):
         super().__init__()
         self.manager = manager
@@ -13,7 +12,9 @@ class ServiceContextManager(Lifecycle):
 
     def get_service_context(self) -> ServiceContext:
         if self.lifecycle_state.is_started():
-            raise Exception("incorrect lifecycle state: {}".format(self.get_lifecycle_state()))
+            raise Exception(
+                "incorrect lifecycle state: {}".format(self.get_lifecycle_state())
+            )
         return self.service_context
 
     def do_init(self):
@@ -22,7 +23,9 @@ class ServiceContextManager(Lifecycle):
         # define LLM
         llm_predictor = LLMPredictor(llm=self.manager.get_llm())
         # configure service context
-        self.service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embedding)
+        self.service_context = ServiceContext.from_defaults(
+            llm_predictor=llm_predictor, embed_model=embedding
+        )
 
     def do_start(self):
         pass
@@ -35,8 +38,7 @@ class ServiceContextManager(Lifecycle):
 
 
 class StorageContextManager(Lifecycle):
-
-    def __init__(self, dataset_path: [str] = './dataset'):
+    def __init__(self, dataset_path: [str] = "./dataset"):
         super().__init__()
         self.dataset_path = dataset_path
 
llama/data_loader.py CHANGED
@@ -17,12 +17,16 @@ class WikiLoader(ABC):
 
 
 class GithubLoader(WikiLoader, Lifecycle):
-
-    def __init__(self, github_owner: Optional[str] = None,
-                 repo: Optional[str] = None,
-                 dirs: Optional[Sequence[str]] = None):
+    def __init__(
+        self,
+        github_owner: Optional[str] = None,
+        repo: Optional[str] = None,
+        dirs: Optional[Sequence[str]] = None,
+    ):
         super().__init__()
-        self.owner = github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
+        self.owner = (
+            github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
+        )
         self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
         self.dirs = dirs if dirs is not None else [".", "doc"]
 
llama/index.py CHANGED
@@ -3,7 +3,6 @@ from llama.context import ServiceContextManager
 
 
 class IndexManager(Lifecycle):
-
     def __init__(self, context_manager: [ServiceContextManager]):
         super().__init__()
         self.index = None
llama/vector_storage.py CHANGED
@@ -2,7 +2,6 @@ from core.lifecycle import Lifecycle
 
 
 class VectorStorageManager(Lifecycle):
-
     def __init__(self):
         super().__init__()
 
pyproject.toml ADDED
@@ -0,0 +1,17 @@
+[tool.mypy]
+ignore_missing_imports = "True"
+disallow_untyped_defs = "True"
+exclude = ["notebooks", "build", "examples"]
+
+[tool.ruff]
+exclude = [
+    ".venv",
+    "__pycache__",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".ruff_cache",
+    "examples",
+    "notebooks",
+    "docs",
+    "dataset",
+]
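Worth noting: `disallow_untyped_defs = "True"` makes mypy reject any untyped function definition outside the excluded directories. A hypothetical illustration (not part of the commit):

```python
# Under [tool.mypy] above, the first definition fails with
# "error: Function is missing a type annotation"; the second passes.


def implicit(x):
    return x


def explicit(x: int) -> int:
    return x
```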