andreped committed
Commit f80c824 · unverified · 2 Parent(s): 1980b84 3dc7ebb

Merge pull request #1 from andreped/linting


Added linting; refactored code; changed environ-import order

.github/workflows/linting.yml ADDED
@@ -0,0 +1,26 @@
+name: Linting
+
+on:
+  push:
+    branches:
+      - '*'
+  pull_request:
+    branches:
+      - '*'
+  workflow_dispatch:
+
+jobs:
+  build:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v1
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+
+      - name: Install lint dependencies
+        run: pip install wheel setuptools black==22.3.0 isort==5.10.1 flake8==4.0.1
+
+      - name: Lint the code
+        run: sh shell/lint.sh
README.md CHANGED
@@ -23,6 +23,7 @@ app_file: knowledge_gpt/main.py
 | - | - |
 | **HF Deploy** | [![Deploy](https://github.com/andreped/referencebot/workflows/Deploy/badge.svg)](https://github.com/andreped/referencebot/actions) |
 | **File size check** | [![Filesize](https://github.com/andreped/referencebot/workflows/Check%20file%20size/badge.svg)](https://github.com/andreped/referencebot/actions) |
+| **Formatting check** | [![Filesize](https://github.com/andreped/ReferenceBot/workflows/Linting/badge.svg)](https://github.com/andreped/ReferenceBot/actions) |
 
 ## [Installation](https://github.com/andreped/referencebot#installation)
 
knowledge_gpt/components/sidebar.py CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-
 from dotenv import load_dotenv
 
 load_dotenv()
knowledge_gpt/core/caching.py CHANGED
@@ -1,9 +1,9 @@
 import streamlit as st
 from streamlit.runtime.caching.hashing import HashFuncsDict
 
-import knowledge_gpt.core.parsing as parsing
 import knowledge_gpt.core.chunking as chunking
 import knowledge_gpt.core.embedding as embedding
+import knowledge_gpt.core.parsing as parsing
 from knowledge_gpt.core.parsing import File
 
 
@@ -18,16 +18,10 @@ def bootstrap_caching():
 
     # Get all substypes of File from module
    file_subtypes = [
-        cls
-        for cls in vars(parsing).values()
-        if isinstance(cls, type) and issubclass(cls, File) and cls != File
+        cls for cls in vars(parsing).values() if isinstance(cls, type) and issubclass(cls, File) and cls != File
    ]
    file_hash_funcs: HashFuncsDict = {cls: file_hash_func for cls in file_subtypes}
 
    parsing.read_file = st.cache_data(show_spinner=False)(parsing.read_file)
-    chunking.chunk_file = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(
-        chunking.chunk_file
-    )
-    embedding.embed_files = st.cache_data(
-        show_spinner=False, hash_funcs=file_hash_funcs
-    )(embedding.embed_files)
+    chunking.chunk_file = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(chunking.chunk_file)
+    embedding.embed_files = st.cache_data(show_spinner=False, hash_funcs=file_hash_funcs)(embedding.embed_files)
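For context: `st.cache_data` cannot hash custom classes such as `File` on its own, which is why the code above registers `file_hash_funcs`. A minimal, self-contained sketch of the same pattern, assuming a Streamlit version whose `st.cache_data` accepts `hash_funcs` (the `Report` class and its hash function are hypothetical, not from this repo):

```python
import streamlit as st


class Report:  # hypothetical stand-in for a knowledge_gpt File subclass
    def __init__(self, uid: str, text: str):
        self.uid = uid
        self.text = text


# Hash Report arguments by their uid so Streamlit can cache calls on them,
# instead of failing to hash the object itself.
@st.cache_data(show_spinner=False, hash_funcs={Report: lambda r: r.uid})
def summarize(report: Report) -> str:
    return report.text[:100]
```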
 
 
 
 
knowledge_gpt/core/chunking.py CHANGED
@@ -1,11 +1,10 @@
 from langchain.docstore.document import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+
 from knowledge_gpt.core.parsing import File
 
 
-def chunk_file(
-    file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo"
-) -> File:
+def chunk_file(file: File, chunk_size: int, chunk_overlap: int = 0, model_name="gpt-3.5-turbo") -> File:
     """Chunks each document in a file into smaller documents
     according to the specified chunk size and overlap
     where the size is determined by the number of tokens for the specified model.
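For context, a minimal sketch of token-based splitting with the `RecursiveCharacterTextSplitter` imported above (values are illustrative; this is not necessarily the body of `chunk_file`, and it assumes `tiktoken` is installed):

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Count chunk sizes in tokens for the given model rather than raw characters.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo",
    chunk_size=300,  # illustrative values; the app passes its own
    chunk_overlap=0,
)
chunks = splitter.split_text("some long document text ...")
```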
knowledge_gpt/core/debug.py CHANGED
@@ -1,10 +1,13 @@
-from langchain.vectorstores import VectorStore
-from typing import Iterable, List, Any
+from typing import Any
+from typing import Iterable
+from typing import List
+from typing import Optional
+
+from langchain.chat_models.fake import FakeListChatModel
 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.embeddings.fake import FakeEmbeddings as FakeEmbeddingsBase
-from langchain.chat_models.fake import FakeListChatModel
-from typing import Optional
+from langchain.vectorstores import VectorStore
 
 
 class FakeChatModel(FakeListChatModel):
@@ -24,9 +27,7 @@ class FakeVectorStore(VectorStore):
     def __init__(self, texts: List[str]):
         self.texts: List[str] = texts
 
-    def add_texts(
-        self, texts: Iterable[str], metadatas: List[dict] | None = None, **kwargs: Any
-    ) -> List[str]:
+    def add_texts(self, texts: Iterable[str], metadatas: List[dict] | None = None, **kwargs: Any) -> List[str]:
         self.texts.extend(texts)
         return self.texts
 
@@ -40,10 +41,5 @@ class FakeVectorStore(VectorStore):
     ) -> "FakeVectorStore":
         return cls(texts=list(texts))
 
-    def similarity_search(
-        self, query: str, k: int = 4, **kwargs: Any
-    ) -> List[Document]:
-        return [
-            Document(page_content=text, metadata={"source": f"{i+1}-{1}"})
-            for i, text in enumerate(self.texts)
-        ]
+    def similarity_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
+        return [Document(page_content=text, metadata={"source": f"{i+1}-{1}"}) for i, text in enumerate(self.texts)]
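For context, `FakeListChatModel` (which `FakeChatModel` above extends) replays canned responses in order, so chains can be exercised without real API calls. A minimal sketch of assumed usage:

```python
from langchain.chat_models.fake import FakeListChatModel

# Each call returns the next canned response from the list.
fake_llm = FakeListChatModel(responses=["canned answer 1", "canned answer 2"])
print(fake_llm.predict("any prompt"))  # -> "canned answer 1"
```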
 
 
 
 
 
knowledge_gpt/core/embedding.py CHANGED
@@ -1,11 +1,15 @@
-from langchain.vectorstores import VectorStore
-from knowledge_gpt.core.parsing import File
-from langchain.vectorstores.faiss import FAISS
+from typing import List
+from typing import Type
+
+from langchain.docstore.document import Document
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.embeddings.base import Embeddings
-from typing import List, Type
-from langchain.docstore.document import Document
-from knowledge_gpt.core.debug import FakeVectorStore, FakeEmbeddings
+from langchain.vectorstores import VectorStore
+from langchain.vectorstores.faiss import FAISS
+
+from knowledge_gpt.core.debug import FakeEmbeddings
+from knowledge_gpt.core.debug import FakeVectorStore
+from knowledge_gpt.core.parsing import File
 
 
 class FolderIndex:
@@ -30,9 +34,7 @@ class FolderIndex:
         return all_texts
 
     @classmethod
-    def from_files(
-        cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]
-    ) -> "FolderIndex":
+    def from_files(cls, files: List[File], embeddings: Embeddings, vector_store: Type[VectorStore]) -> "FolderIndex":
         """Creates an index from files."""
 
         all_docs = cls._combine_files(files)
@@ -45,9 +47,7 @@ class FolderIndex:
         return cls(files=files, index=index)
 
 
-def embed_files(
-    files: List[File], embedding: str, vector_store: str, **kwargs
-) -> FolderIndex:
+def embed_files(files: List[File], embedding: str, vector_store: str, **kwargs) -> FolderIndex:
     """Embeds a collection of files and stores them in a FolderIndex."""
 
     supported_embeddings: dict[str, Type[Embeddings]] = {
@@ -69,6 +69,4 @@ def embed_files(
     else:
         raise NotImplementedError(f"Vector store {vector_store} not supported.")
 
-    return FolderIndex.from_files(
-        files=files, embeddings=_embeddings, vector_store=_vector_store
-    )
+    return FolderIndex.from_files(files=files, embeddings=_embeddings, vector_store=_vector_store)
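For context, a minimal sketch of the FAISS branch that `embed_files` dispatches to (documents and query are illustrative; a real `OPENAI_API_KEY` is assumed in the environment):

```python
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

docs = [Document(page_content="hello world", metadata={"source": "1-1"})]
index = FAISS.from_documents(docs, OpenAIEmbeddings())  # embeds and indexes the docs
hits = index.similarity_search("hello", k=1)
```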
 
 
knowledge_gpt/core/parsing.py CHANGED
@@ -1,14 +1,16 @@
-from io import BytesIO
-from typing import List, Any, Optional
 import re
+from abc import ABC
+from abc import abstractmethod
+from copy import deepcopy
+from hashlib import md5
+from io import BytesIO
+from typing import Any
+from typing import List
+from typing import Optional
 
 import docx2txt
-from langchain.docstore.document import Document
 import fitz
-from hashlib import md5
-
-from abc import abstractmethod, ABC
-from copy import deepcopy
+from langchain.docstore.document import Document
 
 
 class File(ABC):
@@ -32,10 +34,7 @@ class File(ABC):
         """Creates a File from a BytesIO object"""
 
     def __repr__(self) -> str:
-        return (
-            f"File(name={self.name}, id={self.id},"
-            " metadata={self.metadata}, docs={self.docs})"
-        )
+        return f"File(name={self.name}, id={self.id}," " metadata={self.metadata}, docs={self.docs})"
 
     def __str__(self) -> str:
         return f"File(name={self.name}, id={self.id}, metadata={self.metadata})"
knowledge_gpt/core/prompts.py CHANGED
@@ -26,6 +26,4 @@ QUESTION: {question}
 =========
 FINAL ANSWER:"""
 
-STUFF_PROMPT = PromptTemplate(
-    template=template, input_variables=["summaries", "question"]
-)
+STUFF_PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])
knowledge_gpt/core/qa.py CHANGED
@@ -1,10 +1,12 @@
 from typing import List
+
 from langchain.chains.qa_with_sources import load_qa_with_sources_chain
-from knowledge_gpt.core.prompts import STUFF_PROMPT
+from langchain.chat_models.base import BaseChatModel
 from langchain.docstore.document import Document
-from knowledge_gpt.core.embedding import FolderIndex
 from pydantic import BaseModel
-from langchain.chat_models.base import BaseChatModel
+
+from knowledge_gpt.core.embedding import FolderIndex
+from knowledge_gpt.core.prompts import STUFF_PROMPT
 
 
 class AnswerWithSources(BaseModel):
@@ -39,9 +41,7 @@ def query_folder(
     )
 
     relevant_docs = folder_index.index.similarity_search(query, k=5)
-    result = chain(
-        {"input_documents": relevant_docs, "question": query}, return_only_outputs=True
-    )
+    result = chain({"input_documents": relevant_docs, "question": query}, return_only_outputs=True)
     sources = relevant_docs
 
     if not return_all:
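For context, a minimal sketch of the stuff-chain call pattern used in `query_folder`, made runnable with the fake model from `core/debug.py` (the canned response and documents are illustrative):

```python
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models.fake import FakeListChatModel
from langchain.docstore.document import Document

llm = FakeListChatModel(responses=["Paris.\nSOURCES: 1-1"])  # stand-in for AzureChatOpenAI
chain = load_qa_with_sources_chain(llm, chain_type="stuff")
docs = [Document(page_content="The capital of France is Paris.", metadata={"source": "1-1"})]
result = chain({"input_documents": docs, "question": "What is the capital of France?"}, return_only_outputs=True)
print(result["output_text"])
```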
knowledge_gpt/core/utils.py CHANGED
@@ -1,15 +1,14 @@
 from typing import List
+
 from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.chat_models import ChatOpenAI
+from langchain.chat_models.base import BaseChatModel
 from langchain.docstore.document import Document
 
-from langchain.chat_models import ChatOpenAI
 from knowledge_gpt.core.debug import FakeChatModel
-from langchain.chat_models.base import BaseChatModel
 
 
-def pop_docs_upto_limit(
-    query: str, chain: StuffDocumentsChain, docs: List[Document], max_len: int
-) -> List[Document]:
+def pop_docs_upto_limit(query: str, chain: StuffDocumentsChain, docs: List[Document], max_len: int) -> List[Document]:
     """Pops documents from a list until the final prompt length is less
     than the max length."""
 
knowledge_gpt/main.py CHANGED
@@ -1,31 +1,27 @@
 import os
-os.environ["OPENAI_API_TYPE"] = "azure"  # configure API to Azure OpenAI
 
 import streamlit as st
-st.set_page_config(page_title="ReferenceBot", page_icon="📖", layout="wide")
-
-# add all secrets into environmental variables
-if os.path.exists(os.path.dirname(os.path.abspath(__file__)) + "/../.streamlit/secrets.toml"):  # to avoid redundant print by calling st.secrets
-    for key, value in st.secrets.items():
-        os.environ[key] = value
+from langchain.chat_models import AzureChatOpenAI
 
 from knowledge_gpt.components.sidebar import sidebar
-
-from knowledge_gpt.ui import (
-    wrap_doc_in_html,
-    is_query_valid,
-    is_file_valid,
-    display_file_read_error,
-)
-
 from knowledge_gpt.core.caching import bootstrap_caching
-
-from knowledge_gpt.core.parsing import read_file
 from knowledge_gpt.core.chunking import chunk_file
 from knowledge_gpt.core.embedding import embed_files
+from knowledge_gpt.core.parsing import read_file
 from knowledge_gpt.core.qa import query_folder
+from knowledge_gpt.ui import display_file_read_error
+from knowledge_gpt.ui import is_file_valid
+from knowledge_gpt.ui import is_query_valid
+from knowledge_gpt.ui import wrap_doc_in_html
 
-from langchain.chat_models import AzureChatOpenAI
+st.set_page_config(page_title="ReferenceBot", page_icon="📖", layout="wide")
+
+# add all secrets into environmental variables
+if os.path.exists(
+    os.path.dirname(os.path.abspath(__file__)) + "/../.streamlit/secrets.toml"
+):  # to avoid redundant print by calling st.secrets
+    for key, value in st.secrets.items():
+        os.environ[key] = value
 
 
 def main():
@@ -78,7 +74,7 @@ def main():
         openai_api_key=os.environ["OPENAI_API_KEY"],
         openai_api_base=os.environ["OPENAI_API_BASE"],
         openai_api_type="azure",
-        chunk_size = 1,
+        chunk_size=1,
     )
 
     with st.form(key="qa_form"):
@@ -106,7 +102,7 @@ def main():
         openai_api_type="azure",
         temperature=0,
     )
-
+
     with st.spinner("Querying folder to get result..."):
         result = query_folder(
             folder_index=folder_index,
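A note on the reordering above: an `os.environ` write only affects code that reads the variable afterwards, so the secrets are now copied into the environment right after the imports, before `main()` runs, and the removed module-level `OPENAI_API_TYPE` write is covered by the explicit `openai_api_type="azure"` keyword arguments. A hypothetical illustration of why the order matters (names are made up):

```python
import os

os.environ["SERVICE_MODE"] = "azure"  # must run first ...

import service_client  # ... if this (made-up) module reads SERVICE_MODE at import time
```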
knowledge_gpt/ui.py CHANGED
@@ -1,9 +1,11 @@
 from typing import List
+from typing import NoReturn
+
 import streamlit as st
 from langchain.docstore.document import Document
-from knowledge_gpt.core.parsing import File
 from streamlit.logger import get_logger
-from typing import NoReturn
+
+from knowledge_gpt.core.parsing import File
 
 logger = get_logger(__name__)
 
@@ -25,10 +27,7 @@ def is_query_valid(query: str) -> bool:
 
 
 def is_file_valid(file: File) -> bool:
-    if (
-        len(file.docs) == 0
-        or "".join([doc.page_content for doc in file.docs]).strip() == ""
-    ):
+    if len(file.docs) == 0 or "".join([doc.page_content for doc in file.docs]).strip() == "":
         st.error("Cannot read document! Make sure the document has selectable text")
         logger.error("Cannot read document")
         return False
setup.cfg ADDED
@@ -0,0 +1,14 @@
+[metadata]
+description-file = README.md
+
+[isort]
+force_single_line=True
+known_first_party=aeropath
+line_length=160
+profile=black
+
+[flake8]
+# imported but unused in __init__.py, that's ok.
+per-file-ignores=*__init__.py:F401
+ignore=E203,W503,W605,F632,E266,E731,E712,E741
+max-line-length=120
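For context, `force_single_line=True` is what produces the one-import-per-line style seen throughout the refactored modules above. A minimal before/after illustration:

```python
# Before isort with force_single_line=True:
from typing import List, Any, Optional

# After: one name per line, alphabetized.
from typing import Any
from typing import List
from typing import Optional
```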
shell/format.sh ADDED
@@ -0,0 +1,4 @@
+#!/bin/bash
+isort --sl knowledge_gpt/
+black --line-length 120 knowledge_gpt/
+flake8 knowledge_gpt/
shell/lint.sh ADDED
@@ -0,0 +1,23 @@
+#!/bin/bash
+isort --check --sl -c knowledge_gpt/
+if ! [ $? -eq 0 ]
+then
+    echo "Please run \"sh shell/format.sh\" to format the code."
+    exit 1
+fi
+echo "no issues with isort"
+flake8 knowledge_gpt/
+if ! [ $? -eq 0 ]
+then
+    echo "Please fix the code style issue."
+    exit 1
+fi
+echo "no issues with flake8"
+black --check --line-length 120 knowledge_gpt/
+if ! [ $? -eq 0 ]
+then
+    echo "Please run \"sh shell/format.sh\" to format the code."
+    exit 1
+fi
+echo "no issues with black"
+echo "linting success!"