jerpint committed on
Commit
51727c4
1 Parent(s): fbf9436

Update prompts (#3)

Browse files

* update prompt

* use buster for adding documents

* refactor

* add README for spaces

* add .gitignore and gitattributes

* install buster from main branch

Files changed (7) hide show
  1. .gitattributes +35 -0
  2. .gitignore +5 -0
  3. README.md +10 -0
  4. cfg.py +51 -35
  5. embed_documents.py +12 -48
  6. gradio_app.py +8 -6
  7. requirements.txt +1 -1
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ *.csv
2
+ *.zip
3
+ deeplake_store/
4
+ .DS_Store
5
+ __pycache__/
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: TowardsAI 🤝 Buster
3
+ emoji: 🤖
4
+ colorFrom: pink
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.39.0
8
+ app_file: gradio_app.py
9
+ pinned: false
10
+ ---
cfg.py CHANGED
@@ -15,20 +15,27 @@ from utils import extract_zip
15
  logger = logging.getLogger(__name__)
16
  logging.basicConfig(level=logging.INFO)
17
 
 
 
 
18
 
19
  HUB_TOKEN = os.getenv("HUB_TOKEN")
20
  REPO_ID = "jerpint/towardsai-buster-data"
21
  HUB_DB_FILE = "deeplake_store.zip"
22
- logger.info(f"Downloading {HUB_DB_FILE} from hub...")
23
- hf_hub_download(
24
- repo_id=REPO_ID,
25
- repo_type="dataset",
26
- filename=HUB_DB_FILE,
27
- token=HUB_TOKEN,
28
- local_dir=".",
29
- )
30
 
31
- extract_zip(zip_file_path="deeplake_store.zip", output_path="deeplake_store")
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
 
34
  buster_cfg = BusterConfig(
@@ -90,26 +97,31 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
90
  "max_tokens": 3500,
91
  "text_before_docs": (
92
  "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
 
 
93
  "If the answer is in the documentation, summarize it in a helpful way to the user. "
94
- "If it isn't, simply reply that you cannot answer the question. "
95
- "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
96
- "Here is the documentation: "
97
  "<DOCUMENTS> "
98
  ),
99
  "text_after_docs": (
100
  "<\DOCUMENTS>\n"
101
  "REMEMBER:\n"
102
  "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
 
103
  "Here are the rules you must follow:\n"
104
- "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
105
- "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
106
- "3) Do not reference any links, urls or hyperlinks in your answers.\n"
107
- "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
108
- "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
109
- "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
 
 
 
110
  "For example:\n"
111
  "What is the meaning of life for a qa bot?\n"
112
- "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
113
  "Now answer the following question:\n"
114
  ),
115
  },
@@ -117,19 +129,23 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
117
 
118
  # initialize buster with the config in cfg.py (adapt to your needs) ...
119
  # buster_cfg = cfg.buster_cfg
120
- retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
121
- tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
122
- document_answerer: DocumentAnswerer = DocumentAnswerer(
123
- completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
124
- documents_formatter=DocumentsFormatter(
125
- tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
126
- ),
127
- prompt_formatter=PromptFormatter(
128
- tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
129
- ),
130
- **buster_cfg.documents_answerer_cfg,
131
- )
132
- validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
133
- buster: Buster = Buster(
134
- retriever=retriever, document_answerer=document_answerer, validator=validator
135
- )
 
 
 
 
 
15
  logger = logging.getLogger(__name__)
16
  logging.basicConfig(level=logging.INFO)
17
 
18
+ # For authentication
19
+ USERNAME = os.getenv("BUSTER_USERNAME")
20
+ PASSWORD = os.getenv("BUSTER_PASSWORD")
21
 
22
  HUB_TOKEN = os.getenv("HUB_TOKEN")
23
  REPO_ID = "jerpint/towardsai-buster-data"
24
  HUB_DB_FILE = "deeplake_store.zip"
 
 
 
 
 
 
 
 
25
 
26
+ if os.path.exists(HUB_DB_FILE):
27
+ logger.info(f"Using local {HUB_DB_FILE}...")
28
+ else:
29
+ logger.info(f"Downloading {HUB_DB_FILE} from hub...")
30
+ hf_hub_download(
31
+ repo_id=REPO_ID,
32
+ repo_type="dataset",
33
+ filename=HUB_DB_FILE,
34
+ token=HUB_TOKEN,
35
+ local_dir=".",
36
+ )
37
+
38
+ extract_zip(zip_file_path=HUB_DB_FILE, output_path="deeplake_store")
39
 
40
 
41
  buster_cfg = BusterConfig(
 
97
  "max_tokens": 3500,
98
  "text_before_docs": (
99
  "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
100
+ "You are provided information found in the <DOCUMENTS> tag. "
101
+ "Only respond with information inside the <DOCUMENTS> tag. DO NOT use additional information, even if you know the answer. "
102
  "If the answer is in the documentation, summarize it in a helpful way to the user. "
103
+ "If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
104
+ "Here is the information you can use: "
 
105
  "<DOCUMENTS> "
106
  ),
107
  "text_after_docs": (
108
  "</DOCUMENTS>\n"
109
  "REMEMBER:\n"
110
  "You are a chatbot assistant answering users' questions about towardsAI content, a blog about applied artificial intelligence (AI)."
111
+ "You are provided information found in the <DOCUMENTS> tag. "
112
  "Here are the rules you must follow:\n"
113
+ "* Only respond with information inside the <DOCUMENTS> tag. DO NOT provide additional information, even if you know the answer. "
114
+ "* If the answer is in the documentation, summarize it in a helpful way to the user. "
115
+ "* If the documentation does not discuss the topic related to the question, kindly respond that you cannot answer the question because it is not part of your knowledge. "
116
+ "* Only summarize the information in the <DOCUMENTS> tag, do not respond otherwise. "
117
+ "* Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
118
+ "* Do not reference any links, urls or hyperlinks in your answers.\n"
119
+ "* Make sure to format your answers in Markdown format, including code block and snippets.\n"
120
+ "* If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
121
+ "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?'"
122
  "For example:\n"
123
  "What is the meaning of life for a qa bot?\n"
124
+ "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the topics I'm trained on. Is there anything else I can assist you with?"
125
  "Now answer the following question:\n"
126
  ),
127
  },
 
129
 
130
  # initialize buster with the config in cfg.py (adapt to your needs) ...
131
  # buster_cfg = cfg.buster_cfg
132
+
133
+
134
+ def setup_buster(buster_cfg):
135
+ retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
136
+ tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
137
+ document_answerer: DocumentAnswerer = DocumentAnswerer(
138
+ completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
139
+ documents_formatter=DocumentsFormatter(
140
+ tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
141
+ ),
142
+ prompt_formatter=PromptFormatter(
143
+ tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
144
+ ),
145
+ **buster_cfg.documents_answerer_cfg,
146
+ )
147
+ validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
148
+ buster: Buster = Buster(
149
+ retriever=retriever, document_answerer=document_answerer, validator=validator
150
+ )
151
+ return buster
embed_documents.py CHANGED
@@ -1,61 +1,25 @@
1
  import openai
2
  import pandas as pd
3
- from deeplake.core.vectorstore import VectorStore
4
 
5
  from utils import zip_contents
6
 
7
 
8
- def embedding_function(texts, model="text-embedding-ada-002"):
9
- if isinstance(texts, str):
10
- texts = [texts]
11
-
12
- texts = [t.replace("\n", " ") for t in texts]
13
- return [
14
- data["embedding"]
15
- for data in openai.Embedding.create(input=texts, model=model)["data"]
16
- ]
17
-
18
-
19
- def extract_metadata(df: pd.DataFrame) -> dict:
20
- """extract the metadata from the dataframe in deeplake dict format"""
21
- metadata = df.apply(
22
- lambda x: {
23
- "url": x.url,
24
- "source": x.source,
25
- "title": x.title,
26
- },
27
- axis=1,
28
- ).to_list()
29
- return metadata
30
 
31
 
32
  if __name__ == "__main__":
33
  vector_store_path = "deeplake_store"
34
- chunk_file = "data/chunks_preprocessed.csv"
35
  overwrite = True
36
- df = pd.read_csv(chunk_file)
37
-
38
- for col in ["url", "source", "title", "content"]:
39
- assert col in df.columns
40
-
41
- # extract the text + metadata
42
- metadata = extract_metadata(df)
43
- chunked_text = df.content.to_list()
44
-
45
- # init the vector store
46
- vector_store = VectorStore(
47
- path=vector_store_path,
48
- overwrite=True,
49
- )
50
-
51
- # add the embeddings
52
- vector_store.add(
53
- text=chunked_text,
54
- embedding_function=embedding_function,
55
- embedding_data=chunked_text,
56
- metadata=metadata,
57
- )
58
 
59
- # save the deeplake folder to a zip file
60
- zipped_file_path = zip_contents(input_path=vector_store_path, output_path=".")
 
61
  print(f"Contents zipped to: {zipped_file_path}")
 
1
  import openai
2
  import pandas as pd
3
+ from buster.documents import DeepLakeDocumentsManager
4
 
5
  from utils import zip_contents
6
 
7
 
8
+ def read_csv(filename: str):
9
+ """Assumes a pre-chunked csv file is provided with expected columns."""
10
+ df = pd.read_csv(filename)
11
+ for col in ["url", "source", "title", "content"]:
12
+ assert col in df.columns
13
+ return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
  if __name__ == "__main__":
17
  vector_store_path = "deeplake_store"
18
+ chunk_file = "data/outputs.csv"
19
  overwrite = True
20
+ df = read_csv(chunk_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ dm = DeepLakeDocumentsManager(vector_store_path, overwrite=overwrite)
23
+ dm.add(df)
24
+ zipped_file_path = dm.to_zip()
25
  print(f"Contents zipped to: {zipped_file_path}")
gradio_app.py CHANGED
@@ -5,18 +5,20 @@ import gradio as gr
5
  import pandas as pd
6
 
7
  import cfg
8
- from cfg import buster
 
 
 
 
 
9
 
10
  logger = logging.getLogger(__name__)
11
  logging.basicConfig(level=logging.INFO)
12
 
13
- USERNAME = os.getenv("BUSTER_USERNAME")
14
- PASSWORD = os.getenv("BUSTER_PASSWORD")
15
-
16
 
17
  def check_auth(username: str, password: str) -> bool:
18
- valid_user = username == USERNAME
19
- valid_password = password == PASSWORD
20
  is_auth = valid_user and valid_password
21
  logger.info(f"Log-in attempted by {username=}. {is_auth=}")
22
  return is_auth
 
5
  import pandas as pd
6
 
7
  import cfg
8
+ from cfg import setup_buster
9
+
10
+ buster = setup_buster(cfg.buster_cfg)
11
+
12
+ # suppress httpx logs they are spammy and uninformative
13
+ logging.getLogger("httpx").setLevel(logging.WARNING)
14
 
15
  logger = logging.getLogger(__name__)
16
  logging.basicConfig(level=logging.INFO)
17
 
 
 
 
18
 
19
  def check_auth(username: str, password: str) -> bool:
20
+ valid_user = username == cfg.USERNAME
21
+ valid_password = password == cfg.PASSWORD
22
  is_auth = valid_user and valid_password
23
  logger.info(f"Log-in attempted by {username=}. {is_auth=}")
24
  return is_auth
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- git+https://github.com/jerpint/buster@v1.0.14
2
  gradio
3
  deeplake
 
1
+ git+https://github.com/jerpint/buster@main
2
  gradio
3
  deeplake