jerpint committed
Commit 44ee439 (1 parent: c6dd20e)

Add quickstart (#73)


* rename buster.py to busterbot.py; was causing conflicts
* add easy script for generating embeddings
* add README.md with instructions
* add entrypoint
* refactor
* add quickstart example
* ignore .db files

.gitignore CHANGED
@@ -1,3 +1,6 @@
+# database files
+*.db
+
 buster/apps/data/
 # Byte-compiled / optimized / DLL files
 __pycache__/
README.md CHANGED
@@ -11,12 +11,38 @@ pinned: false
 
 # Buster, the QA documentation chatbot!
 
-Buster is a question-answering chatbot that can be tuned to specific documentations. You can try it [here](https://huggingface.co/spaces/jerpint/buster), where it will answer questions about [🤗 Transformers](https://huggingface.co/docs/transformers/index).
-
-![Question: How do I load a Huggingface model?](buster/imgs/qa_web_load.png)
-
-![Question: My code is crashing with "CUDA out of memory". What can I do to solve this?](buster/imgs/qa_web_oom.png)
+Buster is a question-answering chatbot that can be tuned to any source of documentation.
+
+# Demo
+
+You can try out our [live demo here](https://huggingface.co/spaces/jerpint/buster), where it will answer questions about a bunch of libraries we've already scraped, including [🤗 Transformers](https://huggingface.co/docs/transformers/index).
+
+# Quickstart
+
+Here is a quick guide to help you deploy buster on your own dataset!
+
+First, install buster locally. Note that buster requires python>=3.10.
+
+```
+git clone https://github.com/jerpint/buster.git
+pip install -e .
+```
+
+Then, go to the examples folder. We've attached a sample `stackoverflow.csv` file to help you get started. You will convert the .csv to a `documents.db` file:
+
+```
+buster_csv_parser stackoverflow.csv --output-filepath documents.db
+```
+
+This will generate the embeddings and save them locally. Finally, run
+
+```
+gradio gradio_app.py
+```
+
+This will launch the gradio app locally, which you should be able to view at [localhost](http://127.0.0.1:7860).
 
 ## How does Buster work?
 
@@ -31,7 +57,7 @@ Finally, we craft the prompt:
 
 We send the prompt to the [OpenAI API](https://beta.openai.com/docs/api-reference/completions), and display the answer to the user!
 
-### Currently used models
+### Currently available models
 
 - For embeddings: "text-embedding-ada-002"
 - For completion: We support both "text-davinci-003" and "gpt-3.5-turbo"
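Note on the quickstart above: `buster_csv_parser` expects the input `.csv` to have `source`, `title`, `content`, and `url` columns, matching the bundled `stackoverflow.csv` sample and the column check in `buster/docparser.py` further down. A minimal sketch of preparing your own dataset (the file name and document values here are hypothetical):

```
import pandas as pd

# Hypothetical documents: one row per chunk of your documentation.
docs = pd.DataFrame(
    {
        "source": ["my_docs", "my_docs"],  # groups documents in the database
        "title": ["Install guide", "API reference"],
        "content": ["To install, run pip install my-lib ...", "my_lib.run() launches ..."],
        "url": ["https://example.com/install", "https://example.com/api"],
    }
)
docs.to_csv("my_docs.csv")  # then: buster_csv_parser my_docs.csv --output-filepath documents.db
```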
buster/apps/bot_configs.py CHANGED
@@ -1,4 +1,4 @@
-from buster.buster import BusterConfig
+from buster.busterbot import BusterConfig
 
 huggingface_cfg = BusterConfig(
     unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
buster/apps/gradio_app.py CHANGED
@@ -4,7 +4,7 @@ import pathlib
 import gradio as gr
 
 from buster.apps.bot_configs import available_configs
-from buster.buster import Buster, BusterConfig
+from buster.busterbot import Buster, BusterConfig
 from buster.documents.base import DocumentsManager
 from buster.documents.utils import download_db, get_documents_manager_from_extension
 
buster/apps/slackbot.py CHANGED
@@ -3,7 +3,7 @@ import os
 
 from slack_bolt import App
 
-from buster.buster import Buster, BusterConfig
+from buster.busterbot import Buster, BusterConfig
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
buster/{buster.py → busterbot.py} RENAMED
File without changes
buster/docparser.py CHANGED
@@ -1,7 +1,9 @@
 import glob
+import logging
 import os
 from typing import Type
 
+import click
 import numpy as np
 import pandas as pd
 import tiktoken
@@ -11,6 +13,9 @@ from openai.embeddings_utils import get_embedding
 from buster.documents import get_documents_manager_from_extension
 from buster.parser import HuggingfaceParser, Parser, SphinxParser
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002
 
@@ -84,25 +89,89 @@ def get_all_documents(
     return documents_df
 
 
-def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
-    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
+def compute_n_tokens(
+    df: pd.DataFrame, embedding_encoding: str = EMBEDDING_ENCODING, col: str = "content"
+) -> pd.DataFrame:
+    """Count the tokens in the content column and add the count to a n_tokens column."""
+    logger.info("Computing token counts...")
+    encoding = tiktoken.get_encoding(encoding_name=embedding_encoding)
     # TODO are there unexpected consequences of allowing endoftext?
-    df["n_tokens"] = df.content.apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
+    df["n_tokens"] = df[col].apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
     return df
 
 
-def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
-    df["embedding"] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=EMBEDDING_MODEL), dtype=np.float32))
+def max_word_count(df: pd.DataFrame, max_words: int, col: str = "content") -> pd.DataFrame:
+    """Trim the word count of an entry to max_words."""
+    assert df[col].apply(lambda s: isinstance(s, str)).all(), f"Column {col} must contain only strings"
+    word_counts_before = df[col].apply(lambda x: len(x.split()))
+    df[col] = df[col].apply(lambda x: " ".join(x.split()[:max_words]))
+    word_counts_after = df[col].apply(lambda x: len(x.split()))
+
+    trimmed = df[word_counts_before != word_counts_after]
+    logger.info(f"trimmed {len(trimmed)} documents to {max_words} words.")
+
     return df
 
 
-def generate_embeddings(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
-    # Get all documents and precompute their embeddings
+def compute_embeddings(df: pd.DataFrame, engine: str = EMBEDDING_MODEL, col="embedding") -> pd.DataFrame:
+    logger.info(f"Computing embeddings for {len(df)} documents...")
+    df[col] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=engine), dtype=np.float32))
+    logger.info(f"Done computing embeddings for {len(df)} documents.")
+    return df
+
+
+def generate_embeddings_parser(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
     documents = get_all_documents(root_dir, supported_docs[source]["base_url"], supported_docs[source]["parser"])
-    documents = compute_n_tokens(documents)
-    documents = precompute_embeddings(documents)
+    return generate_embeddings(documents, output_filepath)
+
 
+def documents_to_db(documents: pd.DataFrame, output_filepath: str):
+    logger.info("Preparing database...")
     documents_manager = get_documents_manager_from_extension(output_filepath)(output_filepath)
-    documents_manager.add(source, documents)
+    sources = documents["source"].unique()
+    for source in sources:
+        documents_manager.add(source, documents)
+    logger.info(f"Documents saved to: {output_filepath}")
+
+
+def generate_embeddings(
+    documents: pd.DataFrame,
+    output_filepath: str = "documents.db",
+    max_words=500,
+    embedding_engine: str = EMBEDDING_MODEL,
+) -> pd.DataFrame:
+    # check that we have the appropriate columns in our dataframe
+    assert set(required_cols := ["content", "title", "url"]).issubset(
+        set(documents.columns)
+    ), f"Your dataframe must contain {required_cols}."
+
+    # trim word counts, count tokens, and compute embeddings for all documents
+    documents = max_word_count(documents, max_words=max_words)
+    documents = compute_n_tokens(documents)
+    documents = compute_embeddings(documents, engine=embedding_engine)
+
+    # save the documents to a db for later use
+    documents_to_db(documents, output_filepath)
 
     return documents
+
+
+@click.command()
+@click.argument("documents-csv")
+@click.option(
+    "--output-filepath", default="documents.db", help='Where your database will be saved. Default is "documents.db"'
)
+@click.option(
+    "--max-words", default=500, help="Number of maximum allowed words per document, excess is trimmed. Default is 500"
+)
+@click.option(
+    "--embeddings-engine", default=EMBEDDING_MODEL, help=f"Embedding model to use. Default is {EMBEDDING_MODEL}"
+)
+def main(documents_csv: str, output_filepath: str, max_words: int, embeddings_engine: str):
+    documents = pd.read_csv(documents_csv)
+    documents = generate_embeddings(documents, output_filepath, max_words, embeddings_engine)
+
+
+if __name__ == "__main__":
+    main()
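Since `generate_embeddings` now takes a dataframe directly, it can also be driven from Python rather than through the `buster_csv_parser` entry point. A small usage sketch, assuming an OpenAI API key is set in the environment and `my_docs.csv` is a hypothetical file with the schema described earlier:

```
import pandas as pd

from buster.docparser import generate_embeddings

# compute_embeddings calls the OpenAI embeddings API for every row,
# so OPENAI_API_KEY must be set in the environment.
documents = pd.read_csv("my_docs.csv")
documents = generate_embeddings(documents, output_filepath="documents.db", max_words=500)
```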
buster/examples/cfg.py ADDED
@@ -0,0 +1,43 @@
+from buster.busterbot import BusterConfig
+
+documents_filepath = "./documents.db"
+buster_cfg = BusterConfig(
+    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_words=3000,
+    completer_cfg={
+        "name": "ChatGPT",
+        "text_before_documents": (
+            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_before_prompt": (
+            "</DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
+            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
+            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "For example:\n"
+            "What is the meaning of life for a QA bot?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?\n"
+            "Now answer the following question:\n"
+        ),
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+        },
+    },
+    response_format="gradio",
+    source="stackoverflow",
+)
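The example app below reads this file as-is, so pointing Buster at your own documents should only require editing `cfg.py`. A sketch of the kind of edits you might make; the field names come from the config above, while the `my_docs` values, prompts, and inline comments are illustrative guesses at each field's role:

```
from buster.busterbot import BusterConfig

# Illustrative variant of the example config, retargeted at a hypothetical "my_docs" source.
buster_cfg = BusterConfig(
    unknown_prompt="I'm sorry, I can only answer questions about my_docs.",
    embedding_model="text-embedding-ada-002",
    top_k=3,  # number of matched documents to retrieve
    thresh=0.7,  # similarity threshold below which matches are discarded
    max_words=3000,  # cap on how much documentation is fed into the prompt
    completer_cfg={
        "name": "ChatGPT",
        "text_before_documents": "You are a chatbot answering questions about my_docs. Here is the documentation: ",
        "text_before_prompt": "Answer using only the documentation above. Now answer the following question:\n",
        "completion_kwargs": {"model": "gpt-3.5-turbo"},
    },
    response_format="gradio",
    source="my_docs",  # must match the "source" column of your csv
)
```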
buster/examples/gradio_app.py ADDED
@@ -0,0 +1,59 @@
+import cfg
+import gradio as gr
+
+from buster.busterbot import Buster
+from buster.documents.base import DocumentsManager
+from buster.documents.utils import get_documents_manager_from_extension
+
+# initialize buster with the config in cfg.py (adapt to your needs) ...
+documents: DocumentsManager = get_documents_manager_from_extension(cfg.documents_filepath)(cfg.documents_filepath)
+buster: Buster = Buster(cfg=cfg.buster_cfg, documents=documents)
+
+
+def chat(question, history):
+    history = history or []
+    answer = buster.process_input(question)
+
+    # formatting hack for code blocks to render properly every time
+    answer = answer.replace("```", "\n```\n")
+
+    history.append((question, answer))
+    return history, history
+
+
+block = gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}")
+
+with block:
+    with gr.Row():
+        gr.Markdown("<h3><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h3>")
+
+    chatbot = gr.Chatbot()
+
+    with gr.Row():
+        message = gr.Textbox(
+            label="What's your question?",
+            placeholder="Ask a question to AI stackoverflow here...",
+            lines=1,
+        )
+        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+
+    examples = gr.Examples(
+        examples=[
+            "How can I perform backpropagation?",
+            "How do I deal with noisy data?",
+        ],
+        inputs=message,
+    )
+
+    gr.Markdown("This application uses GPT to search the docs for relevant info and answer questions.")
+
+    gr.HTML("<center> Created with ❤️ by @jerpint and @hadrienbertrand")
+
+    state = gr.State()
+    agent_state = gr.State()
+
+    submit.click(chat, inputs=[message, state], outputs=[chatbot, state])
+    message.submit(chat, inputs=[message, state], outputs=[chatbot, state])
+
+
+block.launch(debug=True, share=False)
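The `chat` callback above is plain Python, so the pipeline can also be smoke-tested without the UI. A minimal sketch, assuming `documents.db` was already generated as in the quickstart and an OpenAI key is configured:

```
import cfg

from buster.busterbot import Buster
from buster.documents.utils import get_documents_manager_from_extension

# Same setup as gradio_app.py, minus the UI.
documents = get_documents_manager_from_extension(cfg.documents_filepath)(cfg.documents_filepath)
buster = Buster(cfg=cfg.buster_cfg, documents=documents)

# One-off question, printed to stdout instead of rendered in a chatbox.
print(buster.process_input("How can I perform backpropagation?"))
```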
buster/examples/stackoverflow.csv ADDED
@@ -0,0 +1,52 @@
+,source,title,content,url
+0,stackoverflow,stackoverflow question #1,"""Backprop"" is the same as ""backpropagation"": it's just a shorter way to say it. It is sometimes abbreviated as ""BP"".
+",https://ai.stackexchange.com/questions/1
+1,stackoverflow,stackoverflow question #2,"Noise in the data, to a reasonable amount, may help the network to generalize better. Sometimes, it has the opposite effect. It partly depends on the kind of noise (""true"" vs. artificial).
+The AI FAQ on ANN gives a good overview. Excerpt:
+
+Noise in the actual data is never a good thing, since it limits the accuracy of generalization that can be achieved no matter how extensive the training set is. On the other hand, injecting artificial noise (jitter) into the inputs during training is one of several ways to improve generalization for smooth functions when you have a small training set.
+
+In some field, such as computer vision, it's common to increase the size of the training set by copying some samples and adding some noises or other transformation.
+",https://ai.stackexchange.com/questions/2
+2,stackoverflow,stackoverflow question #4,"There is no direct way to find the optimal number of them: people empirically try and see (e.g., using cross-validation). The most common search techniques are random, manual, and grid searches.
+There exist more advanced techniques such as Gaussian processes, e.g. Optimizing Neural Network Hyperparameters with Gaussian Processes for Dialog Act Classification, IEEE SLT 2016.
+",https://ai.stackexchange.com/questions/4
+3,stackoverflow,stackoverflow question #6,"It rather depends on how one defines several of the terms used. For example:
+
+Whether the term ""expected"" is interpreted in a formal (i.e.
+statistical) sense.
+Whether it's assumed that humans have any kind of utilitarian
+""performance measure"".
+
+The motivation for this description of ""agent"" arose from a desire to have a quantitative model - it's not clear that such a model is a good fit for human cognition.
+However, there are alternative definitions of agents, for example the BDI model, which are rather more open-ended and hence more obviously applicable to humans.
+",https://ai.stackexchange.com/questions/6
+4,stackoverflow,stackoverflow question #7,"
+To put it simply in layman terms, what are the possible threats from AI?
+
+Currently, there are no threat.
+The threat comes if humans create a so-called ultraintelligent machine, a machine that can surpass all intellectual activities by any human. This would be the last invention man would need to do, since this machine is better in inventing machines than humans are (since that is an intellectual activity). However, this could cause the machine to invent machines that can destruct humans, and we can't stop them because they are so much smarter than we are.
+This is all hypothetical, no one has even a clue of what an ultraintelligent machine looks like.
+
+If we know that AI is so dangerous why are we still promoting it? Why is it not banned?
+
+As I said before, the existence of a ultraintelligent machine is hypothetical. Artificial Intelligence has lots of useful applications (more than this answer can contain), and if we develop it, we get even more useful applications. We just have to be careful that the machines won't overtake us.
+",https://ai.stackexchange.com/questions/7
+5,stackoverflow,stackoverflow question #10,"It's analogous to analogue versus digital, or the many shades of gray in between black and white: when evaluating the truthiness of a result, in binary boolean it's either true or false (0 or 1), but when utilizing fuzzy logic, it's an estimated probability between 0 and 1 (such as 0.75 being mostly probably true). It's useful for making calculated decisions when all information needed isn't necessarily available.
+Wikipedia has a fantastic page for this.
+",https://ai.stackexchange.com/questions/10
+6,stackoverflow,stackoverflow question #15,"The problem of the Turing Test is that it tests the machines ability to resemble humans. Not necessarily every form of AI has to resemble humans. This makes the Turing Test less reliable. However, it is still useful since it is an actual test. It is also noteworthy that there is a prize for passing or coming closest to passing the Turing Test, the Loebner Prize.
+The intelligent agent definition of intelligence states that an agent is intelligent if it acts so to maximize the expected value of a performance measure based on past experience and knowledge. (paraphrased from Wikipedia). This definition is used more often and does not depend on the ability to resemble humans. However, it is harder to test this.
+",https://ai.stackexchange.com/questions/15
+7,stackoverflow,stackoverflow question #17,"The concept of ""the singularity"" is when machines outsmart the humans. Although Stephen Hawking opinion is that this situation is inevitable, but I think it'll be very difficult to reach that point, because every A.I. algorithm needs to be programmed by humans, therefore it would be always more limited than its creator.
+We would probably know when that point when humanity will lose control over Artificial Intelligence where super-smart AI would be in competition with humans and maybe creating more sophisticated intelligent beings occurred, but currently, it's more like science fiction (aka Terminator's Skynet).
+The risk could involve killing people (like self-flying war drones making their own decision), destroying countries or even the whole planet (like A.I. connected to the nuclear weapons (aka WarGames movie), but it doesn't prove the point that the machines would be smarter than humans.
+",https://ai.stackexchange.com/questions/17
+8,stackoverflow,stackoverflow question #26,"I think your question fits nowadays more in the field of Human-Robot Interaction, which relies largely on vision for recognition of gestures and follow movements, as well as soft, natural movements as a response. Note that the movements of the face and hands belong to the most complex tasks, involving many muscles at a time.
+I strongly recommend the film Plug & Pray to have an idea of what people are researching in this area.
+You may also find Eliza (which you can try here) interesting. It is classical in the history of AI and pretends to mimic an analyst (psychology). (I am thinking of Eliza not because of its emotional intelligence, but because it was apparently taken seriously by a couple of humans. Could this be taken as a sort of (approved) Turing test? What does it say about the humans it met?)
+On the purely human end of the scale, I sometimes wonder about our (my) emotional intelligence myself. Would I want to implement such an intelligence in an artificial agent at all?
+",https://ai.stackexchange.com/questions/26
+9,stackoverflow,stackoverflow question #28,"This is probably more a question of philosophy than anything. In terms of how things are commonly defined, I'll say ""yes, genetic algorithms are part of AI"". If you pick up a comprehensive book on artificial intelligence, there will probably be a chapter on genetic algorithms (or more broadly, evolutionary algorithms).
+One area that has been extensively studied in the past is the idea of using genetic algorithms to train neural networks. I don't know if people are still actively researching this topic or not, but it at least illustrates that GA's are part of the overall rubric of AI in one regard.
+",https://ai.stackexchange.com/questions/28
pyproject.toml CHANGED
@@ -10,6 +10,9 @@ readme = "README.md"
 requires-python = ">=3.10"
 dynamic = ["dependencies"]
 
+[project.scripts]
+buster_csv_parser = "buster.docparser:main"
+
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
 
@@ -22,3 +25,4 @@ line-length = 120
 [tool.pytest.ini_options]
 log_cli = true
 log_cli_level = "INFO"
+
requirements.txt CHANGED
@@ -8,6 +8,7 @@ tiktoken
 promptlayer
 pytest
 openai
+click
 
 # all openai[embeddings] deps, their list breaks our CI, see: https://github.com/openai/openai-python/issues/210
 
tests/test_chatbot.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 
-from buster.buster import Buster, BusterConfig
+from buster.busterbot import Buster, BusterConfig
 from buster.completers.base import Completer
 from buster.documents import DocumentsManager, get_documents_manager_from_extension
 from buster.formatter.base import Response
@@ -60,7 +60,7 @@ logging.basicConfig(level=logging.INFO)
 def test_chatbot_mock_data(tmp_path, monkeypatch):
     gpt_expected_answer = "this is GPT answer"
     monkeypatch.setattr(Buster, "get_embedding", lambda self, prompt, engine: get_fake_embedding())
-    monkeypatch.setattr("buster.buster.get_completer", lambda x: MockCompleter(expected_answer=gpt_expected_answer))
+    monkeypatch.setattr("buster.busterbot.get_completer", lambda x: MockCompleter(expected_answer=gpt_expected_answer))
 
     hf_transformers_cfg = BusterConfig(
         unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
tests/test_docparser.py CHANGED
@@ -7,7 +7,9 @@ from buster.documents import get_documents_manager_from_extension
 
 def test_generate_embeddings(tmp_path, monkeypatch):
     # Create fake data
-    data = pd.DataFrame.from_dict({"title": ["test"], "url": ["http://url.com"], "content": ["cool text"]})
+    data = pd.DataFrame.from_dict(
+        {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
+    )
 
     # Patch the get_embedding function to return a fixed embedding
     monkeypatch.setattr("buster.docparser.get_embedding", lambda x, engine: [-0.005, 0.0018])
@@ -15,10 +17,10 @@ def test_generate_embeddings(tmp_path, monkeypatch):
 
     # Generate embeddings, store in a file
     output_file = tmp_path / "test_document_embeddings.tar.gz"
-    df = generate_embeddings(tmp_path, output_file, source="mila")
+    df = generate_embeddings(data, output_file)
 
     # Read the embeddings from the file
-    read_df = get_documents_manager_from_extension(output_file)(output_file).get_documents("mila")
+    read_df = get_documents_manager_from_extension(output_file)(output_file).get_documents("my_source")
 
     # Check all the values are correct across the files
     assert df["title"].iloc[0] == data["title"].iloc[0] == read_df["title"].iloc[0]