Chen committed
Commit: 6c20719
Parent(s): 175a385
.gitignore ADDED
@@ -0,0 +1,149 @@
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
.pre-commit-config.yaml ADDED
@@ -0,0 +1,5 @@
+repos:
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.0.243
+    hooks:
+      - id: ruff
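Of note, this hook runs ruff over staged files at commit time. A hypothetical snippet of the kind of issue it blocks (ruff rule F401, unused import; not part of this commit):

```python
# Hypothetical example: ruff reports `os` as F401 ("imported but unused"),
# so the pre-commit hook fails until the import is removed.
import os


def greet() -> str:
    return "hello"
```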
Makefile ADDED
@@ -0,0 +1,14 @@
+.PHONY: format lint
+
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+format:
+	black .
+
+lint:
+	mypy .
+	black . --check
+	ruff check .
+
+test:
+	pytest tests
app.py CHANGED
@@ -15,11 +15,14 @@ import sys
 
 
 load_dotenv()
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # logging.DEBUG for more verbose output
+logging.basicConfig(
+    stream=sys.stdout, level=logging.DEBUG
+)  # logging.DEBUG for more verbose output
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 
+
 def main():
-    documents = SimpleDirectoryReader('./data').load_data()
+    documents = SimpleDirectoryReader("./data").load_data()
 
     # index = VectorStoreIndex.from_documents(documents)
 
@@ -28,15 +31,19 @@ def main():
     # index = VectorStoreIndex(nodes)
 
     # define embedding
-    embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size = 1))
+    embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size=1))
     # define LLM
-    llm_predictor = LLMPredictor(llm=AzureOpenAI(
-        engine="text-davinci-003",
-        model_name="text-davinci-003",
-    ))
+    llm_predictor = LLMPredictor(
+        llm=AzureOpenAI(
+            engine="text-davinci-003",
+            model_name="text-davinci-003",
+        )
+    )
 
     # configure service context
-    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embedding)
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, embed_model=embedding
+    )
 
     # build index
     index = VectorStoreIndex.from_documents(
@@ -46,19 +53,18 @@ def main():
 
     index.storage_context.persist(persist_dir="./dataset")
     storage_context = StorageContext.from_defaults(persist_dir="./dataset")
-    index = load_index_from_storage(storage_context=storage_context, service_context=service_context)
+    index = load_index_from_storage(
+        storage_context=storage_context, service_context=service_context
+    )
 
     # index.vector_store.persist("./dataset")
     # query with embed_model specified
     query_engine = index.as_query_engine(
-        retriever_mode="embedding",
-        verbose=True,
-        service_context=service_context
+        retriever_mode="embedding", verbose=True, service_context=service_context
     )
     response = query_engine.query("请帮忙推荐一杯咖啡给我,我喜欢咖啡因")
     print(response)
 
 
-
-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main()
core/lifecycle.py CHANGED
@@ -6,7 +6,6 @@ from core import logger_factory
 
 
 class Initializable(ABC):
-
     @abstractmethod
     def initialize(self) -> None:
        pass
@@ -19,21 +18,18 @@ class Startable(ABC):
 
 
 class Stoppable(ABC):
-
     @abstractmethod
     def stop(self) -> None:
         pass
 
 
 class Disposable(ABC):
-
     @abstractmethod
     def dispose(self) -> None:
         pass
 
 
 class LifecycleAware(ABC):
-
     def __init__(self, state):
         self.state = state
 
@@ -42,7 +38,6 @@ class LifecycleAware(ABC):
 
 
 class Lifecycle(Initializable, Startable, Stoppable, Disposable, LifecycleAware, ABC):
-
     def __init__(self):
         self.logger = logger_factory.get_logger(self.__class__.__name__)
         self.lifecycle_state = LifecycleState(lifecycle=self)
@@ -108,27 +103,27 @@ class LifecyclePhase(enum.Enum):
 
 
 class LifecycleController(ABC):
-
     def can_initialize(self, phase: [LifecyclePhase]) -> bool:
         return phase is None or phase == LifecyclePhase.DISPOSED
 
     def can_start(self, phase: [LifecyclePhase]) -> bool:
         return phase is not None and (
-            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED)
+            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
+        )
 
     def can_stop(self, phase: [LifecyclePhase]) -> bool:
         return phase is not None and phase == LifecyclePhase.STARTED
 
     def can_dispose(self, phase: [LifecyclePhase]) -> bool:
         return phase is not None and (
-            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED)
+            phase == LifecyclePhase.INITIALIZED or phase == LifecyclePhase.STOPPED
+        )
 
 
 LS = TypeVar("LS", bound=Lifecycle)
 
 
 class LifecycleState(LifecycleController, ABC):
-
     def __init__(self, lifecycle: [LS]):
         self.phase = None
         self.prev_phase = None
@@ -164,7 +159,11 @@ class LifecycleState(LifecycleController, ABC):
 
     def set_phase(self, phase: [LifecyclePhase]) -> None:
         prev = "None" if self.phase is None else self.phase.name
-        self.logger.info("[setPhaseName][{}]{} --> {}".format(self.lifecycle.__class__.__name__, prev, phase.name))
+        self.logger.info(
+            "[setPhaseName][{}]{} --> {}".format(
+                self.lifecycle.__class__.__name__, prev, phase.name
+            )
+        )
         self.phase = phase
 
     def rollback(self, err: [Exception]) -> None:
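Aside: the `can_*` guards above encode a small state machine. A minimal, self-contained sketch of those transition rules (the enum member names mirror the ones referenced in the diff; everything else is illustrative, not part of the commit):

```python
# Sketch of the phase-transition rules in LifecycleController.
# Enum member names mirror the diff; the values are arbitrary.
import enum
from typing import Optional


class LifecyclePhase(enum.Enum):
    INITIALIZED = enum.auto()
    STARTED = enum.auto()
    STOPPED = enum.auto()
    DISPOSED = enum.auto()


def can_initialize(phase: Optional[LifecyclePhase]) -> bool:
    # a fresh object (phase None) or a disposed one may (re)initialize
    return phase is None or phase == LifecyclePhase.DISPOSED


def can_start(phase: Optional[LifecyclePhase]) -> bool:
    # starting is legal only from INITIALIZED or STOPPED
    return phase in (LifecyclePhase.INITIALIZED, LifecyclePhase.STOPPED)


# Happy path: None -> INITIALIZED -> STARTED -> STOPPED -> DISPOSED
assert can_initialize(None)
assert can_start(LifecyclePhase.INITIALIZED)
assert not can_start(LifecyclePhase.STARTED)
```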
core/logger_factory.py CHANGED
@@ -3,16 +3,14 @@ from logging import handlers
 from typing import Optional
 
 
-def get_logger(name: [str],
-               file_name: Optional[str] = None) -> logging.Logger:
+def get_logger(name: [str], file_name: Optional[str] = None) -> logging.Logger:
     logger = logging.getLogger(name)
     if file_name is None:
-        file_name = 'app-default.log'
-    handler = handlers.TimedRotatingFileHandler(filename=file_name,
-                                                when='d',
-                                                backupCount=21,
-                                                encoding='UTF-8')
-    formatter = logging.Formatter('[%(asctime)s][%(levelname)s][%(message)s]')
+        file_name = "app-default.log"
+    handler = handlers.TimedRotatingFileHandler(
+        filename=file_name, when="d", backupCount=21, encoding="UTF-8"
+    )
+    formatter = logging.Formatter("[%(asctime)s][%(levelname)s][%(message)s]")
     handler.setFormatter(formatter)
     logger.addHandler(handler)
     logger.setLevel(logging.INFO)
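For reference, a hypothetical caller of the reformatted `get_logger` (mirroring how `core/lifecycle.py` consumes it; the logger name and message here are illustrative):

```python
# Usage sketch: with file_name omitted, the code above falls back to
# "app-default.log", rotating daily and keeping 21 backups.
from core import logger_factory

logger = logger_factory.get_logger("demo")
logger.info("service starting")  # emitted as [asctime][INFO][service starting]
```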
core/test_lifecycle.py CHANGED
@@ -7,7 +7,6 @@ logging.basicConfig()
 
 
 class SubLifecycle(Lifecycle):
-
     def __init__(self):
         super().__init__()
         self.init_counter = 0
github_retriever.py CHANGED
@@ -8,17 +8,23 @@ from llama_index import StorageContext, load_index_from_storage
 from dotenv import load_dotenv
 import os
 import pickle
+
+
 def main():
     # define embedding
     embedding = LangchainEmbedding(OpenAIEmbeddings(chunk_size=1))
     # define LLM
-    llm_predictor = LLMPredictor(llm=AzureOpenAI(
-        engine="text-davinci-003",
-        model_name="text-davinci-003",
-    ))
+    llm_predictor = LLMPredictor(
+        llm=AzureOpenAI(
+            engine="text-davinci-003",
+            model_name="text-davinci-003",
+        )
+    )
 
     # configure service context
-    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embedding)
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor, embed_model=embedding
+    )
     download_loader("GithubRepositoryReader")
     docs = None
     if os.path.exists("docs/docs.pkl"):
@@ -31,7 +37,10 @@ def main():
         github_client,
         owner="ctripcorp",
         repo="x-pipe",
-        filter_directories=([".", "doc"], GithubRepositoryReader.FilterType.INCLUDE),
+        filter_directories=(
+            [".", "doc"],
+            GithubRepositoryReader.FilterType.INCLUDE,
+        ),
         filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
         verbose=True,
         concurrent_requests=10,
@@ -49,7 +58,6 @@ def main():
     print(response)
 
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     load_dotenv()
-    main()
+    main()
langchain/manager.py CHANGED
@@ -22,7 +22,6 @@ class LangChainManager(Lifecycle, ABC):
 
 
 class LangChainAzureManager(LangChainManager):
-
     def __init__(self):
         super().__init__()
 
llama/context.py CHANGED
@@ -5,7 +5,6 @@ from langchain.manager import LangChainManager
 
 
 class ServiceContextManager(Lifecycle):
-
     def __init__(self, manager: [LangChainManager]):
         super().__init__()
         self.manager = manager
@@ -13,7 +12,9 @@ class ServiceContextManager(Lifecycle):
 
     def get_service_context(self) -> ServiceContext:
         if self.lifecycle_state.is_started():
-            raise Exception("incorrect lifecycle state: {}".format(self.get_lifecycle_state()))
+            raise Exception(
+                "incorrect lifecycle state: {}".format(self.get_lifecycle_state())
+            )
         return self.service_context
 
     def do_init(self):
@@ -22,7 +23,9 @@ class ServiceContextManager(Lifecycle):
         # define LLM
         llm_predictor = LLMPredictor(llm=self.manager.get_llm())
         # configure service context
-        self.service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, embed_model=embedding)
+        self.service_context = ServiceContext.from_defaults(
+            llm_predictor=llm_predictor, embed_model=embedding
+        )
 
     def do_start(self):
         pass
@@ -35,8 +38,7 @@ class ServiceContextManager(Lifecycle):
 
 
 class StorageContextManager(Lifecycle):
-
-    def __init__(self, dataset_path: [str] = './dataset'):
+    def __init__(self, dataset_path: [str] = "./dataset"):
         super().__init__()
         self.dataset_path = dataset_path
 
llama/data_loader.py CHANGED
@@ -17,12 +17,16 @@ class WikiLoader(ABC):
 
 
 class GithubLoader(WikiLoader, Lifecycle):
-
-    def __init__(self, github_owner: Optional[str] = None,
-                 repo: Optional[str] = None,
-                 dirs: Optional[Sequence[str]] = None):
+    def __init__(
+        self,
+        github_owner: Optional[str] = None,
+        repo: Optional[str] = None,
+        dirs: Optional[Sequence[str]] = None,
+    ):
         super().__init__()
-        self.owner = github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
+        self.owner = (
+            github_owner if github_owner is not None else os.environ["GITHUB_OWNER"]
+        )
         self.repo = repo if repo is not None else os.environ["GITHUB_REPO"]
         self.dirs = dirs if dirs is not None else [".", "doc"]
 
llama/index.py CHANGED
@@ -3,7 +3,6 @@ from llama.context import ServiceContextManager
 
 
 class IndexManager(Lifecycle):
-
     def __init__(self, context_manager: [ServiceContextManager]):
         super().__init__()
         self.index = None
llama/vector_storage.py CHANGED
@@ -2,7 +2,6 @@ from core.lifecycle import Lifecycle
 
 
 class VectorStorageManager(Lifecycle):
-
     def __init__(self):
         super().__init__()
 
pyproject.toml ADDED
@@ -0,0 +1,17 @@
+[tool.mypy]
+ignore_missing_imports = "True"
+disallow_untyped_defs = "True"
+exclude = ["notebooks", "build", "examples"]
+
+[tool.ruff]
+exclude = [
+    ".venv",
+    "__pycache__",
+    ".ipynb_checkpoints",
+    ".mypy_cache",
+    ".ruff_cache",
+    "examples",
+    "notebooks",
+    "docs",
+    "dataset",
+]
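Worth noting: `disallow_untyped_defs = "True"` makes mypy reject any untyped function definition outside the excluded directories. A hypothetical illustration (not part of the commit):

```python
# Under [tool.mypy] above, the first definition fails with
# "error: Function is missing a type annotation"; the second passes.


def implicit(x):
    return x


def explicit(x: int) -> int:
    return x
```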