Spaces:

jerpint
/

buster-dev

Runtime error

App Files Files Community

jerpint committed on Feb 2, 2023

Commit

c5f5dc3

unverified ·

1 Parent(s): 7d4662a

fix bug when reading csv (#19)

Browse files

* fix bug when reading csv

* add pytorch bot

* Log the actual prompt

* update pytorch prompt

Files changed (3) hide show

app.py +40 -3
buster/chatbot.py +7 -11
buster/docparser.py +4 -1

app.py CHANGED Viewed

@@ -6,10 +6,11 @@ from buster.chatbot import Chatbot, ChatbotConfig
 MILA_CLUSTER_CHANNEL = "C04LR4H9KQA"
 ORION_CHANNEL = "C04LYHGUYB0"
 buster_cfg = ChatbotConfig(
-    documents_csv="buster/data/document_embeddings.csv",
-    unknown_prompt="This doesn't seem to be related to cluster usage. I am not sure how to answer.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
     thresh=0.7,
@@ -44,7 +45,7 @@ buster_cfg = ChatbotConfig(
 buster_chatbot = Chatbot(buster_cfg)
 orion_cfg = ChatbotConfig(
-    documents_csv="buster/data/document_embeddings_orion.csv",
     unknown_prompt="This doesn't seem to be related to the orion library. I am not sure how to answer.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
@@ -76,6 +77,39 @@ orion_cfg = ChatbotConfig(
 )
 orion_chatbot = Chatbot(orion_cfg)
 app = App(token=os.environ.get("SLACK_BOT_TOKEN"), signing_secret=os.environ.get("SLACK_SIGNING_SECRET"))
@@ -93,6 +127,9 @@ def respond_to_question(event, say):
     elif channel == ORION_CHANNEL:
         print("*******using ORION********")
         answer = orion_chatbot.process_input(text)
     # responds to the message in the thread
     thread_ts = event["event_ts"]

 MILA_CLUSTER_CHANNEL = "C04LR4H9KQA"
 ORION_CHANNEL = "C04LYHGUYB0"
+PYTORCH_CHANNEL = "C04MEK6N882"
 buster_cfg = ChatbotConfig(
+    documents_file="buster/data/document_embeddings.csv",
+    unknown_prompt="This doesn't seem to be related to cluster usage.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
     thresh=0.7,
 buster_chatbot = Chatbot(buster_cfg)
 orion_cfg = ChatbotConfig(
+    documents_file="buster/data/document_embeddings_orion.csv",
     unknown_prompt="This doesn't seem to be related to the orion library. I am not sure how to answer.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
 )
 orion_chatbot = Chatbot(orion_cfg)
+pytorch_cfg = ChatbotConfig(
+    documents_file="buster/data/document_embeddings_pytorch.tar.gz",
+    unknown_prompt="This doesn't seem to be related to the pytorch library. I am not sure how to answer.",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_chars=3000,
+    completion_kwargs={
+        "engine": "text-davinci-003",
+        "max_tokens": 500,
+    },
+    separator="\n",
+    link_format="slack",
+    text_after_response="I'm a bot 🤖 and not always perfect.",
+    text_before_prompt="""You are a slack chatbot assistant answering technical questions about pytorch, a library to train neural networks written in python.
+    Make sure to format your answers in Markdown format, including code block and snippets.
+    Do not include any links to urls or hyperlinks in your answers.
+    If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:
+    'This doesn't seem to be related to the pytorch library.'
+    For example:
+    What is the meaning of life for pytorch?
+    This doesn't seem to be related to the pytorch library.
+    Now answer the following question:
+    """,
+)
+pytorch_chatbot = Chatbot(pytorch_cfg)
 app = App(token=os.environ.get("SLACK_BOT_TOKEN"), signing_secret=os.environ.get("SLACK_SIGNING_SECRET"))
     elif channel == ORION_CHANNEL:
         print("*******using ORION********")
         answer = orion_chatbot.process_input(text)
+    elif channel == PYTORCH_CHANNEL:
+        print("*******using PYTORCH********")
+        answer = pytorch_chatbot.process_input(text)
     # responds to the message in the thread
     thread_ts = event["event_ts"]

buster/chatbot.py CHANGED Viewed

@@ -7,20 +7,12 @@ import pandas as pd
 from omegaconf import OmegaConf
 from openai.embeddings_utils import cosine_similarity, get_embedding
-from buster.docparser import EMBEDDING_MODEL
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
-def load_documents(path: str) -> pd.DataFrame:
-    logger.info(f"loading embeddings from {path}...")
-    df = pd.read_csv(path)
-    df["embedding"] = df.embedding.apply(eval).apply(np.array)
-    logger.info(f"embeddings loaded.")
-    return df
 class Chatbot:
     def __init__(self, cfg: OmegaConf):
         # TODO: right now, the cfg is being passed as an omegaconf, is this what we want?
@@ -29,7 +21,10 @@ class Chatbot:
         self._init_unk_embedding()
     def _init_documents(self):
-        self.documents = load_documents(self.cfg.documents_csv)
     def _init_unk_embedding(self):
         logger.info("Generating UNK token...")
@@ -101,6 +96,7 @@ class Chatbot:
             return response_text
         logger.info(f"querying GPT...")
         # Call the API to generate a response
         try:
             completion_kwargs = self.cfg.completion_kwargs
@@ -198,7 +194,7 @@ class ChatbotConfig:
     text_after_response: Generic response to add to the chatbot's reply.
     """
-    documents_csv: str = "buster/data/document_embeddings.csv"
     embedding_model: str = "text-embedding-ada-002"
     top_k: int = 3
     thresh: float = 0.7

 from omegaconf import OmegaConf
 from openai.embeddings_utils import cosine_similarity, get_embedding
+from buster.docparser import EMBEDDING_MODEL, read_documents
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 class Chatbot:
     def __init__(self, cfg: OmegaConf):
         # TODO: right now, the cfg is being passed as an omegaconf, is this what we want?
         self._init_unk_embedding()
     def _init_documents(self):
+        filepath = self.cfg.documents_file
+        logger.info(f"loading embeddings from {filepath}...")
+        self.documents = read_documents(filepath)
+        logger.info(f"embeddings loaded.")
     def _init_unk_embedding(self):
         logger.info("Generating UNK token...")
             return response_text
         logger.info(f"querying GPT...")
+        logger.info(f"Prompt:  {prompt}")
         # Call the API to generate a response
         try:
             completion_kwargs = self.cfg.completion_kwargs
     text_after_response: Generic response to add to the chatbot's reply.
     """
+    documents_file: str = "buster/data/document_embeddings.csv"
     embedding_model: str = "text-embedding-ada-002"
     top_k: int = 3
     thresh: float = 0.7

buster/docparser.py CHANGED Viewed

@@ -3,6 +3,7 @@ import math
 import os
 import bs4
 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
@@ -126,7 +127,9 @@ def read_documents(filepath: str) -> pd.DataFrame:
     ext = get_file_extension(filepath)
     if ext == ".csv":
-        return pd.read_csv(filepath)
     elif ext in PICKLE_EXTENSIONS:
         return pd.read_pickle(filepath)
     else:

 import os
 import bs4
+import numpy as np
 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
     ext = get_file_extension(filepath)
     if ext == ".csv":
+        df = pd.read_csv(filepath)
+        df["embedding"] = df.embedding.apply(eval).apply(np.array)
+        return df
     elif ext in PICKLE_EXTENSIONS:
         return pd.read_pickle(filepath)
     else: