jerpint committed
Commit b4b5bdf
Parent: c525408

Deeplake support (#1)


* add deeplake support

* add helper functions to ingest the data via deeplake

* add script to embed documents to deeplake format

Files changed (5)
  1. cfg.py +9 -7
  2. embed_documents.py +61 -0
  3. gradio_app.py +4 -5
  4. requirements.txt +1 -0
  5. utils.py +53 -0
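
Taken together, the new files form a small pipeline: embed_documents.py embeds the preprocessed chunks into a local Deep Lake vector store and zips it with the helpers in utils.py, the archive is published to the Hub repo referenced in cfg.py, and cfg.py downloads and extracts it when the app starts. The upload step itself is not part of this commit; below is a minimal sketch of how the archive could be pushed with huggingface_hub (repo id and token mirror cfg.py; treat it as an assumption, not the author's actual publishing step).

# Hypothetical publishing step (not included in this commit): push the zipped
# Deep Lake store produced by embed_documents.py to the Hub repo that cfg.py
# downloads from at startup.
import os

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="deeplake_store.zip",     # produced by embed_documents.py
    path_in_repo="deeplake_store.zip",        # matches HUB_DB_FILE in cfg.py
    repo_id="jerpint/towardsai-buster-data",  # matches REPO_ID in cfg.py
    token=os.getenv("HUB_TOKEN"),             # matches HUB_TOKEN in cfg.py
)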
cfg.py CHANGED
@@ -1,16 +1,16 @@
-import os
 import logging
-
-from huggingface_hub import hf_hub_download
+import os

 from buster.busterbot import Buster, BusterConfig
 from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
 from buster.formatters.documents import DocumentsFormatter
 from buster.formatters.prompts import PromptFormatter
-from buster.retriever import Retriever, SQLiteRetriever
+from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
 from buster.validators import QuestionAnswerValidator, Validator
+from huggingface_hub import hf_hub_download

+from utils import extract_zip

 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -18,7 +18,7 @@ logging.basicConfig(level=logging.INFO)

 HUB_TOKEN = os.getenv("HUB_TOKEN")
 REPO_ID = "jerpint/towardsai-buster-data"
-HUB_DB_FILE = "documents.db"
+HUB_DB_FILE = "deeplake_store.zip"
 logger.info(f"Downloading {HUB_DB_FILE} from hub...")
 hf_hub_download(
     repo_id=REPO_ID,
@@ -28,6 +28,8 @@ hf_hub_download(
     local_dir=".",
 )

+extract_zip(zip_file_path="deeplake_store.zip", output_path="deeplake_store")
+

 buster_cfg = BusterConfig(
     validator_cfg={
@@ -61,7 +63,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
         },
     },
     retriever_cfg={
-        "db_path": "./documents.db",
+        "path": "./deeplake_store",
         "top_k": 3,
         "thresh": 0.7,
         "max_tokens": 2000,
@@ -115,7 +117,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if

 # initialize buster with the config in cfg.py (adapt to your needs) ...
 # buster_cfg = cfg.buster_cfg
-retriever: Retriever = SQLiteRetriever(**buster_cfg.retriever_cfg)
+retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
 tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
 document_answerer: DocumentAnswerer = DocumentAnswerer(
     completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
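
The cfg.py change swaps the SQLite retriever for Buster's DeepLakeRetriever and points retriever_cfg["path"] at the directory extracted from deeplake_store.zip. One way to sanity-check the extracted store independently of Buster is to query it with Deep Lake's own VectorStore API; the sketch below assumes that API and reuses the embedding function from embed_documents.py (the query string is made up).

# Sketch: query the extracted store directly with deeplake (not part of the commit).
from deeplake.core.vectorstore import VectorStore

from embed_documents import embedding_function  # same embedding fn used at ingestion

store = VectorStore(path="./deeplake_store", read_only=True)
results = store.search(
    embedding_data="How do I fine-tune a transformer?",  # hypothetical query
    embedding_function=embedding_function,
    k=3,
)
print(results["text"])      # top-3 retrieved chunks
print(results["metadata"])  # their url / source / title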
embed_documents.py ADDED
@@ -0,0 +1,61 @@
+import openai
+import pandas as pd
+from deeplake.core.vectorstore import VectorStore
+
+from utils import zip_contents
+
+
+def embedding_function(texts, model="text-embedding-ada-002"):
+    if isinstance(texts, str):
+        texts = [texts]
+
+    texts = [t.replace("\n", " ") for t in texts]
+    return [
+        data["embedding"]
+        for data in openai.Embedding.create(input=texts, model=model)["data"]
+    ]
+
+
+def extract_metadata(df: pd.DataFrame) -> dict:
+    """extract the metadata from the dataframe in deeplake dict format"""
+    metadata = df.apply(
+        lambda x: {
+            "url": x.url,
+            "source": x.source,
+            "title": x.title,
+        },
+        axis=1,
+    ).to_list()
+    return metadata
+
+
+if __name__ == "__main__":
+    vector_store_path = "deeplake_store"
+    chunk_file = "data/chunks_preprocessed.csv"
+    overwrite = True
+    df = pd.read_csv(chunk_file)
+
+    for col in ["url", "source", "title", "content"]:
+        assert col in df.columns
+
+    # extract the text + metadata
+    metadata = extract_metadata(df)
+    chunked_text = df.content.to_list()
+
+    # init the vector store
+    vector_store = VectorStore(
+        path=vector_store_path,
+        overwrite=True,
+    )
+
+    # add the embeddings
+    vector_store.add(
+        text=chunked_text,
+        embedding_function=embedding_function,
+        embedding_data=chunked_text,
+        metadata=metadata,
+    )
+
+    # save the deeplake folder to a zip file
+    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
+    print(f"Contents zipped to: {zipped_file_path}")
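
embed_documents.py sends the full list of chunks to openai.Embedding.create in a single request (the legacy pre-1.0 OpenAI client), which works for modest corpora but will hit the endpoint's per-request input limit on larger ones; it also hardcodes overwrite=True, so re-running the script rebuilds the store from scratch. A batched variant of the embedding function could look like the sketch below (the batch size is an arbitrary assumption).

# Sketch: batched embedding calls for larger chunk lists (not part of the commit).
import openai


def batched_embedding_function(texts, model="text-embedding-ada-002", batch_size=256):
    if isinstance(texts, str):
        texts = [texts]
    texts = [t.replace("\n", " ") for t in texts]

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        response = openai.Embedding.create(input=batch, model=model)
        embeddings.extend(data["embedding"] for data in response["data"])
    return embeddings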
gradio_app.py CHANGED
@@ -1,12 +1,11 @@
+import logging
 import os

-import cfg
 import gradio as gr
 import pandas as pd
-from cfg import buster
-

-import logging

+import cfg
+from cfg import buster

 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -86,7 +85,7 @@ with block:
            placeholder="Ask a question to AI stackoverflow here...",
            lines=1,
        )
-        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+        submit = gr.Button(value="Send", variant="secondary")

        examples = gr.Examples(
            examples=[
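
Dropping .style(full_width=False) keeps the app compatible with newer Gradio releases, where .style() is deprecated and layout options live on the component constructor. If the narrower button is still wanted, something like the following should work on Gradio 4.x (the surrounding layout here is an assumption, not the app's actual structure).

# Sketch (assumes Gradio 4.x): scale=0 keeps the button from stretching full width.
import gradio as gr

with gr.Blocks() as block:
    with gr.Row():
        question = gr.Textbox(
            placeholder="Ask a question to AI stackoverflow here...",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary", scale=0)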
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 git+https://github.com/jerpint/buster@v1.0.14
 gradio
+deeplake
utils.py ADDED
@@ -0,0 +1,53 @@
+import os
+import zipfile
+
+
+def zip_contents(input_path, output_path):
+    """
+    Zips the entire contents of a given path to a custom output path.
+
+    Authored by ChatGPT
+
+    Args:
+        input_path (str): The path of the directory to be zipped.
+        output_path (str): The path where the zip file will be created.
+
+    Returns:
+        str: The path of the created zip file.
+    """
+    if not os.path.exists(input_path):
+        raise ValueError("The specified input path does not exist.")
+
+    zip_file_name = f"{os.path.basename(input_path)}.zip"
+    zip_file_path = os.path.join(output_path, zip_file_name)
+
+    with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+        for root, _, files in os.walk(input_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                arcname = os.path.relpath(file_path, input_path)
+                zipf.write(file_path, arcname=arcname)
+
+    return zip_file_path
+
+
+def extract_zip(zip_file_path, output_path):
+    """
+    Extracts the contents of a zip file to a custom output path.
+
+    Authored by ChatGPT
+
+    Args:
+        zip_file_path (str): The path of the zip file to be extracted.
+        output_path (str): The path where the zip contents will be extracted.
+
+    Returns:
+        str: The path of the directory where the zip contents are extracted.
+    """
+    if not os.path.exists(zip_file_path):
+        raise ValueError("The specified zip file does not exist.")
+
+    with zipfile.ZipFile(zip_file_path, "r") as zipf:
+        zipf.extractall(output_path)
+
+    return output_path
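
zip_contents and extract_zip are the glue between embed_documents.py (which zips the freshly built store) and cfg.py (which unzips the downloaded archive). A quick round trip, for illustration:

# Illustrative round trip of the two helpers.
from utils import extract_zip, zip_contents

zip_path = zip_contents(input_path="deeplake_store", output_path=".")  # -> ./deeplake_store.zip
restored = extract_zip(zip_file_path=zip_path, output_path="deeplake_store_copy")
print(zip_path, restored)

Note that zip_contents only walks files, so empty subdirectories are not preserved in the archive.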