Spaces:

jerpint
/

buster-dev

Runtime error

App Files Files Community

jerpint committed on Mar 14, 2023

Commit

c6dd20e

•

1 Parent(s): 2642581

Use dropdown to select source (#71)

Browse files

* add dropdown menu for switching data sources

* Add ability to update Buster's config on the fly

* Add lightning, godot documentation sources

* add download script for the weights (from huggingface dataset)

* update tests

* Add logging to pytest

* Fix source titles when returning results

* return percentages instead of cosine score

* change source directly when you call chat

Files changed (14) hide show

.gitattributes +0 -1
.gitignore +1 -0
buster/apps/bot_configs.py +175 -0
buster/apps/gradio_app.py +33 -50
buster/buster.py +47 -29
buster/data/documents.db +0 -3
buster/documents/utils.py +13 -0
buster/formatter/base.py +3 -2
buster/formatter/gradio.py +1 -1
buster/formatter/html.py +1 -1
buster/formatter/markdown.py +1 -1
buster/formatter/slack.py +1 -1
pyproject.toml +4 -0
tests/test_chatbot.py +33 -14

.gitattributes DELETED Viewed

	@@ -1 +0,0 @@
1	- *.db filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

+buster/apps/data/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

buster/apps/bot_configs.py ADDED Viewed

	@@ -0,0 +1,175 @@

+from buster.buster import BusterConfig
+huggingface_cfg = BusterConfig(
+    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_words=3000,
+    completer_cfg={
+        "name": "ChatGPT",
+        "text_before_documents": (
+            "You are a chatbot assistant answering technical questions about huggingface transformers, a library to train transformers in python. "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_before_prompt": (
+            "<\DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot assistant answering technical questions about huggingface transformers, a library to train transformers in python. "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
+            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
+            "For example:\n"
+            "What is the meaning of life for huggingface?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
+            "Now answer the following question:\n"
+        ),
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+        },
+    },
+    response_format="gradio",
+    source="huggingface",
+)
+pytorch_cfg = BusterConfig(
+    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_words=3000,
+    completer_cfg={
+        "name": "ChatGPT",
+        "text_before_documents": (
+            "You are a chatbot assistant answering technical questions about pytorch, a library to train neural networks in python. "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_before_prompt": (
+            "<\DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot assistant answering technical questions about pytorch transformers, a library to train neural networks in python. "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "3) Do not include any links, urls or hyperlinks in your answers.\n"
+            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
+            "For example:\n"
+            "What is the meaning of life for pytorch?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
+            "Now answer the following question:\n"
+        ),
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+        },
+    },
+    response_format="gradio",
+    source="pytorch",
+)
+lightning_cfg = BusterConfig(
+    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch lightning library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_words=3000,
+    completer_cfg={
+        "name": "ChatGPT",
+        "text_before_documents": (
+            "You are a chatbot assistant answering technical questions about pytorch lightning, a library to train neural networks in python. "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_before_prompt": (
+            "<\DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot assistant answering technical questions about pytorch lightning transformers, a library to train neural networks in python. "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "3) Do not include any links, urls or hyperlinks in your answers.\n"
+            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch lightning library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
+            "For example:\n"
+            "What is the meaning of life for pytorch lightning?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch lightning library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
+            "Now answer the following question:\n"
+        ),
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+        },
+    },
+    response_format="gradio",
+    source="lightning",
+)
+godot_cfg = BusterConfig(
+    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the godot library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_words=3000,
+    completer_cfg={
+        "name": "ChatGPT",
+        "text_before_documents": (
+            "You are a chatbot assistant answering technical questions about godot, a game-engine library. "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_before_prompt": (
+            "<\DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot assistant answering technical questions about godot, a game-engine library."
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "3) Do not include any links, urls or hyperlinks in your answers.\n"
+            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to the godot library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
+            "For example:\n"
+            "What is the meaning of life for godot?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to the pytorch lightning library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
+            "Now answer the following question:\n"
+        ),
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+        },
+    },
+    response_format="gradio",
+    source="godot",
+)
+available_configs = {  # lookup of the ready-made BusterConfig objects above, by name
+    "huggingface": huggingface_cfg,
+    "pytorch": pytorch_cfg,
+    "pytorch-lightning": lightning_cfg,  # NOTE: key differs from lightning_cfg's source="lightning"
+    "godot": godot_cfg,
+}

buster/apps/gradio_app.py CHANGED Viewed

@@ -3,53 +3,27 @@ import pathlib
 import gradio as gr
 from buster.buster import Buster, BusterConfig
-DATA_DIR = pathlib.Path(__file__).parent.parent.resolve() / "data"  # points to ../data/
-buster_cfg = BusterConfig(
-    documents_file=os.path.join(DATA_DIR, "document_embeddings_huggingface.tar.gz"),
-    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
-    embedding_model="text-embedding-ada-002",
-    top_k=3,
-    thresh=0.7,
-    max_words=3000,
-    completer_cfg={
-        "name": "ChatGPT",
-        "text_before_documents": (
-            "You are a chatbot assistant answering technical questions about huggingface transformers, a library to train transformers in python. "
-            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
-            "If it isn't, simply reply that you cannot answer the question. "
-            "Here is the documentation: "
-            "<BEGIN_DOCUMENTATION> "
-        ),
-        "text_before_prompt": (
-            "<\END_DOCUMENTATION>\n"
-            "REMINDER:\n"
-            "You are a chatbot assistant answering technical questions about huggingface transformers, a library to train transformers in python. "
-            "Here are the rules you must follow:\n"
-            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
-            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
-            "3) Do not include any links to urls or hyperlinks in your answers.\n"
-            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
-            "'I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
-            "For example:\n"
-            "What is the meaning of life for huggingface?\n"
-            "I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
-            "Now answer the following question:\n"
-        ),
-        "completion_kwargs": {
-            "model": "gpt-3.5-turbo",
-        },
-    },
-    response_format="gradio",
-)
-buster = Buster(buster_cfg)
-def chat(question, history):
-    history = history or []
     answer = buster.process_input(question)
     # formatting hack for code blocks to render properly every time
@@ -59,11 +33,20 @@ def chat(question, history):
     return history, history
-block = gr.Blocks(css=".gradio-container {background-color: lightgray}")
 with block:
     with gr.Row():
-        gr.Markdown("<h3><center>Buster 🤖: A Question-Answering Bot for Huggingface 🤗 Transformers </center></h3>")
     chatbot = gr.Chatbot()
@@ -75,11 +58,11 @@ with block:
         )
         submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
-    gr.Examples(
         examples=[
             "What kind of models should I use for images and text?",
             "When should I finetune a model vs. training it form scratch?",
-            "How can I deploy my trained huggingface model?",
             "Can you give me some python code to quickly finetune a model on my sentiment analysis dataset?",
         ],
         inputs=message,
@@ -95,8 +78,8 @@ with block:
     state = gr.State()
     agent_state = gr.State()
-    submit.click(chat, inputs=[message, state], outputs=[chatbot, state])
-    message.submit(chat, inputs=[message, state], outputs=[chatbot, state])
 block.launch(debug=True)

 import gradio as gr
+from buster.apps.bot_configs import available_configs
 from buster.buster import Buster, BusterConfig
+from buster.documents.base import DocumentsManager
+from buster.documents.utils import download_db, get_documents_manager_from_extension
+DEFAULT_CONFIG = "huggingface"
+DB_URL = "https://huggingface.co/datasets/jerpint/buster-data/resolve/main/documents.db"
+# Download the db...
+documents_filepath = download_db(db_url=DB_URL, output_dir="./data")
+documents: DocumentsManager = get_documents_manager_from_extension(documents_filepath)(documents_filepath)
+# initialize buster with the default config...
+default_cfg: BusterConfig = available_configs.get(DEFAULT_CONFIG)
+buster = Buster(cfg=default_cfg, documents=documents)
+def chat(question, history, bot_source):
+    history = history or []
+    cfg = available_configs.get(bot_source)
+    buster.update_cfg(cfg)
     answer = buster.process_input(question)
     # formatting hack for code blocks to render properly every time
     return history, history
+block = gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}")
 with block:
     with gr.Row():
+        gr.Markdown("<h3><center>Buster 🤖: A Question-Answering Bot for open-source libraries </center></h3>")
+    doc_source = gr.Dropdown(
+        choices=sorted(list(available_configs.keys())),
+        value=DEFAULT_CONFIG,
+        interactive=True,
+        multiselect=False,
+        label="Source of Documentation",
+        info="The source of documentation to select from",
+    )
     chatbot = gr.Chatbot()
         )
         submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+    examples = gr.Examples(
+        # TODO: seems not possible (for now) to update examples on change...
         examples=[
             "What kind of models should I use for images and text?",
             "When should I finetune a model vs. training it form scratch?",
             "Can you give me some python code to quickly finetune a model on my sentiment analysis dataset?",
         ],
         inputs=message,
     state = gr.State()
     agent_state = gr.State()
+    submit.click(chat, inputs=[message, state, doc_source], outputs=[chatbot, state])
+    message.submit(chat, inputs=[message, state, doc_source], outputs=[chatbot, state])
 block.launch(debug=True)

buster/buster.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import logging
 from dataclasses import dataclass, field
 import numpy as np
 import pandas as pd
 from openai.embeddings_utils import cosine_similarity, get_embedding
 from buster.completers import get_completer
-from buster.documents import get_documents_manager_from_extension
 from buster.formatter import (
     Response,
     ResponseFormatter,
@@ -33,6 +33,7 @@ class BusterConfig:
     unknown_prompt: Prompt to use to generate the "I don't know" embedding to compare to.
     text_before_prompt: Text to prompt GPT with before the user prompt, but after the documentation.
     reponse_footnote: Generic response to add the the chatbot's reply.
     """
     documents_file: str = "buster/data/document_embeddings.tar.gz"
@@ -60,34 +61,45 @@ class BusterConfig:
     response_format: str = "slack"
     unknown_prompt: str = "I Don't know how to answer your question."
     response_footnote: str = "I'm a bot 🤖 and not always perfect."
 class Buster:
-    def __init__(self, cfg: BusterConfig):
-        # TODO: right now, the cfg is being passed as an omegaconf, is this what we want?
         self.cfg = cfg
-        self.completer = get_completer(cfg.completer_cfg)
-        self._init_documents()
-        self._init_unk_embedding()
-        self._init_response_formatter()
-    def _init_response_formatter(self):
         self.response_formatter = response_formatter_factory(
             format=self.cfg.response_format, response_footnote=self.cfg.response_footnote
         )
-    def _init_documents(self):
-        filepath = self.cfg.documents_file
-        logger.info(f"loading embeddings from {filepath}...")
-        self.documents = get_documents_manager_from_extension(filepath)(filepath)
-        logger.info(f"embeddings loaded.")
-    def _init_unk_embedding(self):
-        logger.info("Generating UNK embedding...")
-        self.unk_embedding = get_embedding(
-            self.cfg.unknown_prompt,
-            engine=self.cfg.embedding_model,
-        )
     def rank_documents(
         self,
@@ -95,16 +107,17 @@ class Buster:
         top_k: float,
         thresh: float,
         engine: str,
     ) -> pd.DataFrame:
         """
         Compare the question to the series of documents and return the best matching documents.
         """
-        query_embedding = get_embedding(
             query,
             engine=engine,
         )
-        matched_documents = self.documents.retrieve(query_embedding, top_k)
         # log matched_documents to the console
         logger.info(f"matched documents before thresh: {matched_documents}")
@@ -119,7 +132,9 @@ class Buster:
     def prepare_documents(self, matched_documents: pd.DataFrame, max_words: int) -> str:
         # gather the documents in one large plaintext variable
         documents_list = matched_documents.content.to_list()
-        documents_str = " ".join(documents_list)
         # truncate the documents to fit
         # TODO: increase to actual token count
@@ -135,11 +150,13 @@ class Buster:
         self,
         response,
         matched_documents: pd.DataFrame,
-        unknown_prompt: str,
     ):
         logger.info(f"GPT Response:\n{response.text}")
         sources = (
-            Source(dct["source"], dct["url"], dct["similarity"]) for dct in matched_documents.to_dict(orient="records")
         )
         return sources
@@ -154,7 +171,7 @@ class Buster:
         set the unk_threshold to 0 to essentially turn off this feature.
         """
-        response_embedding = get_embedding(
             completion,
             engine=engine,
         )
@@ -180,17 +197,18 @@ class Buster:
             top_k=self.cfg.top_k,
             thresh=self.cfg.thresh,
             engine=self.cfg.embedding_model,
         )
         if len(matched_documents) == 0:
-            response = Response("I did not find any sources to answer your question.")
             sources = tuple()
             return self.response_formatter(response, sources)
         # generate a completion
         documents: str = self.prepare_documents(matched_documents, max_words=self.cfg.max_words)
-        response = self.completer.generate_response(user_input, documents)
-        sources = self.add_sources(response, matched_documents, self.cfg.unknown_prompt)
         # check for relevance
         relevant = self.check_response_relevance(

 import logging
 from dataclasses import dataclass, field
+from functools import lru_cache
 import numpy as np
 import pandas as pd
 from openai.embeddings_utils import cosine_similarity, get_embedding
 from buster.completers import get_completer
 from buster.formatter import (
     Response,
     ResponseFormatter,
     unknown_prompt: Prompt to use to generate the "I don't know" embedding to compare to.
     text_before_prompt: Text to prompt GPT with before the user prompt, but after the documentation.
     reponse_footnote: Generic response to add the the chatbot's reply.
+    source: the source of the document to consider
     """
     documents_file: str = "buster/data/document_embeddings.tar.gz"
     response_format: str = "slack"
     unknown_prompt: str = "I Don't know how to answer your question."
     response_footnote: str = "I'm a bot 🤖 and not always perfect."
+    source: str = ""
+from buster.documents.base import DocumentsManager
 class Buster:
+    def __init__(self, cfg: BusterConfig, documents: DocumentsManager):
+        self._unk_embedding = None
         self.cfg = cfg
+        self.update_cfg(cfg)
+        self.documents = documents
+    @property
+    def unk_embedding(self):
+        return self._unk_embedding
+    @unk_embedding.setter
+    def unk_embedding(self, embedding):
+        logger.info("Setting new UNK embedding...")
+        self._unk_embedding = embedding
+        return self._unk_embedding
+    def update_cfg(self, cfg: BusterConfig):
+        """Every time we set a new config, we update the things that need to be updated."""
+        logger.info(f"Updating config to {cfg.source}:\n{cfg}")
+        self.cfg = cfg
+        self.completer = get_completer(cfg.completer_cfg)
+        self.unk_embedding = self.get_embedding(self.cfg.unknown_prompt, engine=self.cfg.embedding_model)
         self.response_formatter = response_formatter_factory(
             format=self.cfg.response_format, response_footnote=self.cfg.response_footnote
         )
+        logger.info(f"Config Updated.")
+    @lru_cache
+    def get_embedding(self, query: str, engine: str):
+        logger.info("generating embedding")
+        return get_embedding(query, engine=engine)
     def rank_documents(
         self,
         top_k: float,
         thresh: float,
         engine: str,
+        source: str,
     ) -> pd.DataFrame:
         """
         Compare the question to the series of documents and return the best matching documents.
         """
+        query_embedding = self.get_embedding(
             query,
             engine=engine,
         )
+        matched_documents = self.documents.retrieve(query_embedding, top_k=top_k, source=source)
         # log matched_documents to the console
         logger.info(f"matched documents before thresh: {matched_documents}")
     def prepare_documents(self, matched_documents: pd.DataFrame, max_words: int) -> str:
         # gather the documents in one large plaintext variable
         documents_list = matched_documents.content.to_list()
+        documents_str = ""
+        for idx, doc in enumerate(documents_list):
+            documents_str += f"<DOCUMENT> {doc} <\DOCUMENT>"
         # truncate the documents to fit
         # TODO: increase to actual token count
         self,
         response,
         matched_documents: pd.DataFrame,
     ):
         logger.info(f"GPT Response:\n{response.text}")
         sources = (
+            Source(
+                source=dct["source"], title=dct["title"], url=dct["url"], question_similarity=dct["similarity"] * 100
+            )
+            for dct in matched_documents.to_dict(orient="records")
         )
         return sources
         set the unk_threshold to 0 to essentially turn off this feature.
         """
+        response_embedding = self.get_embedding(
             completion,
             engine=engine,
         )
             top_k=self.cfg.top_k,
             thresh=self.cfg.thresh,
             engine=self.cfg.embedding_model,
+            source=self.cfg.source,
         )
         if len(matched_documents) == 0:
+            response = Response(self.cfg.unknown_prompt)
             sources = tuple()
             return self.response_formatter(response, sources)
         # generate a completion
         documents: str = self.prepare_documents(matched_documents, max_words=self.cfg.max_words)
+        response: Response = self.completer.generate_response(user_input, documents)
+        sources = self.add_sources(response, matched_documents)
         # check for relevance
         relevant = self.check_response_relevance(

buster/data/documents.db DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b86c2b4f5a2ec410c2b9132ed62213528ba10c0dc260162f689e30ba677815f1
-size 244338688

buster/documents/utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 from typing import Type
 from buster.documents.base import DocumentsManager
@@ -12,6 +13,18 @@ def get_file_extension(filepath: str) -> str:
     return os.path.splitext(filepath)[1]
 def get_documents_manager_from_extension(filepath: str) -> Type[DocumentsManager]:
     ext = get_file_extension(filepath)

 import os
+import urllib.request
 from typing import Type
 from buster.documents.base import DocumentsManager
     return os.path.splitext(filepath)[1]
+def download_db(db_url: str, output_dir: str):
+    os.makedirs(output_dir, exist_ok=True)
+    fname = os.path.join(output_dir, "documents.db")
+    if not os.path.exists(fname):
+        print(f"Downloading db file from {db_url} to {fname}...")
+        urllib.request.urlretrieve(db_url, fname)
+        print("Downloaded.")
+    else:
+        print("File already exists. Skipping.")
+    return fname
 def get_documents_manager_from_extension(filepath: str) -> Type[DocumentsManager]:
     ext = get_file_extension(filepath)

buster/formatter/base.py CHANGED Viewed

@@ -4,9 +4,10 @@ from typing import Iterable, NamedTuple
 # Should be from the `documents` module.
 class Source(NamedTuple):
-    source: str
     url: str
     question_similarity: float
     # TODO Add answer similarity.
     # answer_similarity: float
@@ -22,7 +23,7 @@ class Response:
 @dataclass
 class ResponseFormatter:
     response_footnote: str
-    source_template: str = "{source.name} (relevance: {source.question_similarity:2.3f})"
     error_msg_template: str = """Something went wrong:\n{response.error_msg}"""
     error_fallback_template: str = "Something went very wrong."
     sourced_answer_template: str = (

 # Should be from the `documents` module.
 class Source(NamedTuple):  # one document reference listed alongside an answer
+    title: str  # link text used by the gradio/markdown/slack source templates
     url: str  # link target used by those templates
     question_similarity: float  # Buster.add_sources passes the document's similarity * 100 here
+    source: str = ""  # filled from the document's "source" field by Buster.add_sources; empty by default
     # TODO Add answer similarity.
     # answer_similarity: float
 @dataclass
 class ResponseFormatter:
     response_footnote: str
+    source_template: str = "{source.name} (relevance: {source.question_similarity:2.1f})"
     error_msg_template: str = """Something went wrong:\n{response.error_msg}"""
     error_fallback_template: str = "Something went very wrong."
     sourced_answer_template: str = (

buster/formatter/gradio.py CHANGED Viewed

@@ -17,7 +17,7 @@ class GradioResponseFormatter(ResponseFormatter):
         """{footnote}"""
     )
     unsourced_answer_template: str = "{response.text}<br><br>{footnote}"
-    source_template: str = """[🔗 {source.source}]({source.url}), relevance: {source.question_similarity:2.3f}"""
     def sources_list(self, sources: Iterable[Source]) -> str | None:
         """Format sources into a list."""

         """{footnote}"""
     )
     unsourced_answer_template: str = "{response.text}<br><br>{footnote}"
+    source_template: str = """[🔗 {source.title}]({source.url}), relevance: {source.question_similarity:2.1f} %"""
     def sources_list(self, sources: Iterable[Source]) -> str | None:
         """Format sources into a list."""

buster/formatter/html.py CHANGED Viewed

@@ -37,5 +37,5 @@ class HTMLResponseFormatter(ResponseFormatter):
             response.error,
             html.escape(response.error_msg) if response.error_msg else response.error_msg,
         )
-        sources = (Source(html.escape(source.source), source.url, source.question_similarity) for source in sources)
         return super().__call__(response, sources)

             response.error,
             html.escape(response.error_msg) if response.error_msg else response.error_msg,
         )
+        sources = (Source(html.escape(source.title), source.url, source.question_similarity) for source in sources)
         return super().__call__(response, sources)

buster/formatter/markdown.py CHANGED Viewed

@@ -8,7 +8,7 @@ from buster.formatter.base import ResponseFormatter, Source
 class MarkdownResponseFormatter(ResponseFormatter):
     """Format the answer in markdown."""
-    source_template: str = """[🔗 {source.source}]({source.url}), relevance: {source.question_similarity:2.3f}"""
     def sources_list(self, sources: Iterable[Source]) -> str | None:
         """Format sources into a list."""

 class MarkdownResponseFormatter(ResponseFormatter):
     """Format the answer in markdown."""
+    source_template: str = """[🔗 {source.title}]({source.url}), relevance: {source.question_similarity:2.3f}"""
     def sources_list(self, sources: Iterable[Source]) -> str | None:
         """Format sources into a list."""

buster/formatter/slack.py CHANGED Viewed

@@ -8,7 +8,7 @@ from buster.formatter import ResponseFormatter, Source
 class SlackResponseFormatter(ResponseFormatter):
     """Format the answer for Slack."""
-    source_template: str = """<{source.url}|🔗 {source.source}>, relevance: {source.question_similarity:2.3f}"""
     def sources_list(self, sources: Iterable[Source]) -> str | None:
         """Format sources into a list."""

 class SlackResponseFormatter(ResponseFormatter):
     """Format the answer for Slack."""
+    source_template: str = """<{source.url}|🔗 {source.title}>, relevance: {source.question_similarity:2.3f}"""
     def sources_list(self, sources: Iterable[Source]) -> str | None:
         """Format sources into a list."""

pyproject.toml CHANGED Viewed

@@ -18,3 +18,7 @@ profile = "black"
 [tool.black]
 line-length = 120

 [tool.black]
 line-length = 120
+[tool.pytest.ini_options]
+log_cli = true
+log_cli_level = "INFO"

tests/test_chatbot.py CHANGED Viewed

@@ -5,7 +5,9 @@ import numpy as np
 import pandas as pd
 from buster.buster import Buster, BusterConfig
-from buster.documents import DocumentsManager
 TEST_DATA_DIR = Path(__file__).resolve().parent / "data"
 DOCUMENTS_FILE = os.path.join(str(TEST_DATA_DIR), "document_embeddings_huggingface_subset.tar.gz")
@@ -16,6 +18,17 @@ def get_fake_embedding(length=1536):
     return list(rng.random(length, dtype=np.float32))
 class DocumentsMock(DocumentsManager):
     def __init__(self, filepath):
         self.filepath = filepath
@@ -39,20 +52,24 @@ class DocumentsMock(DocumentsManager):
         return self.documents
 def test_chatbot_mock_data(tmp_path, monkeypatch):
     gpt_expected_answer = "this is GPT answer"
-    monkeypatch.setattr("buster.buster.get_documents_manager_from_extension", lambda filepath: DocumentsMock)
-    monkeypatch.setattr("buster.buster.get_embedding", lambda x, engine: get_fake_embedding())
-    monkeypatch.setattr("openai.Completion.create", lambda **kwargs: {"choices": [{"text": gpt_expected_answer}]})
     hf_transformers_cfg = BusterConfig(
-        documents_file=tmp_path / "not_a_real_file.tar.gz",
         unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
         embedding_model="text-embedding-ada-002",
         top_k=3,
-        thresh=0.7,
         max_words=3000,
         response_format="slack",
         completer_cfg={
             "name": "GPT3",
             "text_before_prompt": (
@@ -72,7 +89,9 @@ def test_chatbot_mock_data(tmp_path, monkeypatch):
             },
         },
     )
-    buster = Buster(hf_transformers_cfg)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
     assert answer.startswith(gpt_expected_answer)
@@ -80,7 +99,6 @@ def test_chatbot_mock_data(tmp_path, monkeypatch):
 def test_chatbot_real_data__chatGPT():
     hf_transformers_cfg = BusterConfig(
-        documents_file=DOCUMENTS_FILE,
         unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
         embedding_model="text-embedding-ada-002",
         top_k=3,
@@ -101,14 +119,14 @@ def test_chatbot_real_data__chatGPT():
             },
         },
     )
-    buster = Buster(hf_transformers_cfg)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
 def test_chatbot_real_data__chatGPT_OOD():
     buster_cfg = BusterConfig(
-        documents_file=DOCUMENTS_FILE,
         unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
         embedding_model="text-embedding-ada-002",
         top_k=3,
@@ -122,7 +140,7 @@ def test_chatbot_real_data__chatGPT_OOD():
                 """Do not include any links to urls or hyperlinks in your answers. """
                 """If you do not know the answer to a question, or if it is completely irrelevant to the library usage, let the user know you cannot answer. """
                 """Use this response: """
-                """I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"""
                 """For example:\n"""
                 """What is the meaning of life for huggingface?\n"""
                 """I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"""
@@ -135,7 +153,8 @@ def test_chatbot_real_data__chatGPT_OOD():
         },
         response_format="gradio",
     )
-    buster = Buster(buster_cfg)
     answer = buster.process_input("What is a good recipe for brocolli soup?")
     assert isinstance(answer, str)
     assert buster_cfg.unknown_prompt in answer
@@ -143,7 +162,6 @@ def test_chatbot_real_data__chatGPT_OOD():
 def test_chatbot_real_data__GPT():
     hf_transformers_cfg = BusterConfig(
-        documents_file=DOCUMENTS_FILE,
         unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
         embedding_model="text-embedding-ada-002",
         top_k=3,
@@ -169,6 +187,7 @@ def test_chatbot_real_data__GPT():
             },
         },
     )
-    buster = Buster(hf_transformers_cfg)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)

 import pandas as pd
 from buster.buster import Buster, BusterConfig
+from buster.completers.base import Completer
+from buster.documents import DocumentsManager, get_documents_manager_from_extension
+from buster.formatter.base import Response
 TEST_DATA_DIR = Path(__file__).resolve().parent / "data"
 DOCUMENTS_FILE = os.path.join(str(TEST_DATA_DIR), "document_embeddings_huggingface_subset.tar.gz")
     return list(rng.random(length, dtype=np.float32))
+class MockCompleter(Completer):
+    def __init__(self, expected_answer):
+        self.expected_answer = expected_answer
+    def complete(self):
+        return
+    def generate_response(self, user_input, documents) -> Response:
+        return Response(self.expected_answer)
 class DocumentsMock(DocumentsManager):
     def __init__(self, filepath):
         self.filepath = filepath
         return self.documents
+import logging
+logging.basicConfig(level=logging.INFO)
 def test_chatbot_mock_data(tmp_path, monkeypatch):
     gpt_expected_answer = "this is GPT answer"
+    monkeypatch.setattr(Buster, "get_embedding", lambda self, prompt, engine: get_fake_embedding())
+    monkeypatch.setattr("buster.buster.get_completer", lambda x: MockCompleter(expected_answer=gpt_expected_answer))
     hf_transformers_cfg = BusterConfig(
         unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
         embedding_model="text-embedding-ada-002",
         top_k=3,
+        thresh=0,
         max_words=3000,
         response_format="slack",
+        source="fake source",
         completer_cfg={
             "name": "GPT3",
             "text_before_prompt": (
             },
         },
     )
+    filepath = tmp_path / "not_a_real_file.tar.gz"
+    documents = DocumentsMock(filepath)
+    buster = Buster(cfg=hf_transformers_cfg, documents=documents)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
     assert answer.startswith(gpt_expected_answer)
 def test_chatbot_real_data__chatGPT():
     hf_transformers_cfg = BusterConfig(
         unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
         embedding_model="text-embedding-ada-002",
         top_k=3,
             },
         },
     )
+    documents = get_documents_manager_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
+    buster = Buster(cfg=hf_transformers_cfg, documents=documents)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
 def test_chatbot_real_data__chatGPT_OOD():
     buster_cfg = BusterConfig(
         unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
         embedding_model="text-embedding-ada-002",
         top_k=3,
                 """Do not include any links to urls or hyperlinks in your answers. """
                 """If you do not know the answer to a question, or if it is completely irrelevant to the library usage, let the user know you cannot answer. """
                 """Use this response: """
+                """'I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"""
                 """For example:\n"""
                 """What is the meaning of life for huggingface?\n"""
                 """I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"""
         },
         response_format="gradio",
     )
+    documents = get_documents_manager_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
+    buster = Buster(cfg=buster_cfg, documents=documents)
     answer = buster.process_input("What is a good recipe for brocolli soup?")
     assert isinstance(answer, str)
     assert buster_cfg.unknown_prompt in answer
 def test_chatbot_real_data__GPT():
     hf_transformers_cfg = BusterConfig(
         unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
         embedding_model="text-embedding-ada-002",
         top_k=3,
             },
         },
     )
+    documents = get_documents_manager_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
+    buster = Buster(cfg=hf_transformers_cfg, documents=documents)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)