Deeplake support (#1)
* add deeplake support
* add helper functions to ingest the data via deeplake
* add script to embed documents to deeplake format
- cfg.py +9 -7
- embed_documents.py +61 -0
- gradio_app.py +4 -5
- requirements.txt +1 -0
- utils.py +53 -0
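
Taken together, the flow is: embed the document chunks into a local deeplake store (embed_documents.py), zip the store (utils.py), push the zip to the hub, and have the Space download and extract it at startup (cfg.py). The upload step itself is not in this diff; a minimal sketch, assuming a write token in HUB_TOKEN and that the data lives in a dataset repo:

import os

from huggingface_hub import upload_file

# Hypothetical upload step (not part of this PR): push the zipped store
# to the repo that cfg.py later downloads from. repo_type="dataset" is
# an assumption about how the repo is hosted.
upload_file(
    path_or_fileobj="deeplake_store.zip",
    path_in_repo="deeplake_store.zip",
    repo_id="jerpint/towardsai-buster-data",
    repo_type="dataset",
    token=os.getenv("HUB_TOKEN"),
)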
cfg.py
CHANGED
@@ -1,16 +1,16 @@
-import os
 import logging
-
-from huggingface_hub import hf_hub_download
+import os
 
 from buster.busterbot import Buster, BusterConfig
 from buster.completers import ChatGPTCompleter, Completer, DocumentAnswerer
 from buster.formatters.documents import DocumentsFormatter
 from buster.formatters.prompts import PromptFormatter
-from buster.retriever import
+from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
 from buster.validators import QuestionAnswerValidator, Validator
+from huggingface_hub import hf_hub_download
 
+from utils import extract_zip
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -18,7 +18,7 @@ logging.basicConfig(level=logging.INFO)
 
 HUB_TOKEN = os.getenv("HUB_TOKEN")
 REPO_ID = "jerpint/towardsai-buster-data"
-HUB_DB_FILE = "
+HUB_DB_FILE = "deeplake_store.zip"
 logger.info(f"Downloading {HUB_DB_FILE} from hub...")
 hf_hub_download(
     repo_id=REPO_ID,
@@ -28,6 +28,8 @@ hf_hub_download(
     local_dir=".",
 )
 
+extract_zip(zip_file_path="deeplake_store.zip", output_path="deeplake_store")
+
 
 buster_cfg = BusterConfig(
     validator_cfg={
@@ -61,7 +63,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
     },
     },
     retriever_cfg={
-        "
+        "path": "./deeplake_store",
         "top_k": 3,
         "thresh": 0.7,
         "max_tokens": 2000,
@@ -115,7 +117,7 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
 
 # initialize buster with the config in cfg.py (adapt to your needs) ...
 # buster_cfg = cfg.buster_cfg
-retriever: Retriever =
+retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
 tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
 document_answerer: DocumentAnswerer = DocumentAnswerer(
     completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
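
The last hunk above is cut off mid-setup by the diff view; for orientation, here is a rough sketch of how these pieces are typically wired together in buster's example apps (constructor kwargs are assumptions and may differ in v1.0.14):

# Sketch only: follows buster's example apps, not verified against v1.0.14.
validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
buster: Buster = Buster(
    retriever=retriever,
    document_answerer=document_answerer,
    validator=validator,
)

# The retriever then serves the top_k=3 chunks scoring above the 0.7
# similarity threshold from ./deeplake_store on every question.
completion = buster.process_input("What is deep learning?")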
embed_documents.py
ADDED
@@ -0,0 +1,61 @@
+import openai
+import pandas as pd
+from deeplake.core.vectorstore import VectorStore
+
+from utils import zip_contents
+
+
+def embedding_function(texts, model="text-embedding-ada-002"):
+    if isinstance(texts, str):
+        texts = [texts]
+
+    texts = [t.replace("\n", " ") for t in texts]
+    return [
+        data["embedding"]
+        for data in openai.Embedding.create(input=texts, model=model)["data"]
+    ]
+
+
+def extract_metadata(df: pd.DataFrame) -> dict:
+    """extract the metadata from the dataframe in deeplake dict format"""
+    metadata = df.apply(
+        lambda x: {
+            "url": x.url,
+            "source": x.source,
+            "title": x.title,
+        },
+        axis=1,
+    ).to_list()
+    return metadata
+
+
+if __name__ == "__main__":
+    vector_store_path = "deeplake_store"
+    chunk_file = "data/chunks_preprocessed.csv"
+    overwrite = True
+    df = pd.read_csv(chunk_file)
+
+    for col in ["url", "source", "title", "content"]:
+        assert col in df.columns
+
+    # extract the text + metadata
+    metadata = extract_metadata(df)
+    chunked_text = df.content.to_list()
+
+    # init the vector store
+    vector_store = VectorStore(
+        path=vector_store_path,
+        overwrite=True,
+    )
+
+    # add the embeddings
+    vector_store.add(
+        text=chunked_text,
+        embedding_function=embedding_function,
+        embedding_data=chunked_text,
+        metadata=metadata,
+    )
+
+    # save the deeplake folder to a zip file
+    zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
+    print(f"Contents zipped to: {zipped_file_path}")
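
After running this script, one way to sanity-check the store is to query it back with the same embedding function. A quick sketch, assuming deeplake's VectorStore.search accepts embedding_data and embedding_function as in recent 3.x releases:

from deeplake.core.vectorstore import VectorStore

from embed_documents import embedding_function

# Reopen the freshly built store and pull the closest chunks to a probe
# question; confirms the embeddings and metadata landed correctly.
vector_store = VectorStore(path="deeplake_store", read_only=True)
results = vector_store.search(
    embedding_data="What is a transformer?",
    embedding_function=embedding_function,
    k=3,
)
print(results["text"])
print(results["metadata"])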
gradio_app.py
CHANGED
@@ -1,12 +1,11 @@
+import logging
 import os
 
-import cfg
 import gradio as gr
 import pandas as pd
-from cfg import buster
-
 
-import
+import cfg
+from cfg import buster
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
@@ -86,7 +85,7 @@ with block:
         placeholder="Ask a question to AI stackoverflow here...",
         lines=1,
     )
-    submit = gr.Button(value="Send", variant="secondary")
+    submit = gr.Button(value="Send", variant="secondary")
 
     examples = gr.Examples(
         examples=[
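
The moved submit button is a layout change only; its event wiring is unchanged. For reference, a minimal sketch of the usual gradio pattern (the chat handler below is a stand-in, not this app's actual function):

import gradio as gr


def chat(question):
    # Stand-in handler; the real app routes the question through buster.
    return f"You asked: {question}"


with gr.Blocks() as block:
    question = gr.Textbox(
        placeholder="Ask a question to AI stackoverflow here...",
        lines=1,
    )
    answer = gr.Textbox(label="Answer")
    submit = gr.Button(value="Send", variant="secondary")
    # Both pressing Enter and clicking Send invoke the handler.
    question.submit(chat, inputs=question, outputs=answer)
    submit.click(chat, inputs=question, outputs=answer)

block.launch()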
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
 git+https://github.com/jerpint/buster@v1.0.14
 gradio
+deeplake
utils.py
ADDED
@@ -0,0 +1,53 @@
+import os
+import zipfile
+
+
+def zip_contents(input_path, output_path):
+    """
+    Zips the entire contents of a given path to a custom output path.
+
+    Authored by ChatGPT
+
+    Args:
+        input_path (str): The path of the directory to be zipped.
+        output_path (str): The path where the zip file will be created.
+
+    Returns:
+        str: The path of the created zip file.
+    """
+    if not os.path.exists(input_path):
+        raise ValueError("The specified input path does not exist.")
+
+    zip_file_name = f"{os.path.basename(input_path)}.zip"
+    zip_file_path = os.path.join(output_path, zip_file_name)
+
+    with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+        for root, _, files in os.walk(input_path):
+            for file in files:
+                file_path = os.path.join(root, file)
+                arcname = os.path.relpath(file_path, input_path)
+                zipf.write(file_path, arcname=arcname)
+
+    return zip_file_path
+
+
+def extract_zip(zip_file_path, output_path):
+    """
+    Extracts the contents of a zip file to a custom output path.
+
+    Authored by ChatGPT
+
+    Args:
+        zip_file_path (str): The path of the zip file to be extracted.
+        output_path (str): The path where the zip contents will be extracted.
+
+    Returns:
+        str: The path of the directory where the zip contents are extracted.
+    """
+    if not os.path.exists(zip_file_path):
+        raise ValueError("The specified zip file does not exist.")
+
+    with zipfile.ZipFile(zip_file_path, "r") as zipf:
+        zipf.extractall(output_path)
+
+    return output_path
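
A quick round-trip of the two helpers (paths are illustrative):

from utils import extract_zip, zip_contents

# Zip a directory, then unpack the archive somewhere else; both helpers
# return the path they produced.
zip_path = zip_contents(input_path="deeplake_store", output_path=".")
out_dir = extract_zip(zip_file_path=zip_path, output_path="deeplake_store_copy")
print(zip_path, out_dir)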