jerpint committed
Commit: f97aa81
1 Parent(s): 2c4fa53

development branch (#7)


* fix relative import

* add embeddings requirement

* update openai embeddings requirements...

* format responses appropriately

* add markdown response

* Fix newline formatting

* add threshold and top_k

* update response

* fix merge conflict

Files changed (1)
  1. buster/chatbot.py +41 -6
buster/chatbot.py CHANGED
@@ -12,13 +12,16 @@ logging.basicConfig(level=logging.INFO)
 
 
 # search through the reviews for a specific product
-def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
+def rank_documents(df: pd.DataFrame, query: str, top_k: int = 1, thresh: float = None) -> pd.DataFrame:
     product_embedding = get_embedding(
         query,
         engine=EMBEDDING_MODEL,
     )
     df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))
 
+    if thresh:
+        df = df[df.similarity > thresh]
+
     if top_k == -1:
         # return all results
         n = len(df)
@@ -28,13 +31,43 @@ def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame
 
 
 def engineer_prompt(question: str, documents: list[str]) -> str:
-    return " ".join(documents) + "\nNow answer the following question:\n" + question
+    documents_str = " ".join(documents)
+    if len(documents_str) > 3000:
+        logger.info("truncating documents to fit...")
+        documents_str = documents_str[0:3000]
+    return documents_str + "\nNow answer the following question:\n" + question
+
+
+def format_response(response_text, sources_url=None):
+
+    response = f"{response_text}\n"
+
+    if sources_url:
+        response += f"<br><br>Here are the sources I used to answer your question:\n"
+        for url in sources_url:
+            response += f"<br>[{url}]({url})\n"
 
+    response += "<br><br>"
+    response += """
+    ```
+    I'm a bot 🤖 and not always perfect.
+    For more info, view the full documentation here (https://docs.mila.quebec/) or contact support@mila.quebec
+    ```
+    """
+    return response
 
-def answer_question(question: str, df) -> str:
+
+def answer_question(question: str, df, top_k: int = 1, thresh: float = None) -> str:
     # rank the documents, get the highest scoring doc and generate the prompt
-    candidates = rank_documents(df, query=question, top_k=1)
+    candidates = rank_documents(df, query=question, top_k=top_k, thresh=thresh)
+
+    logger.info(f"candidate responses: {candidates}")
+
+    if len(candidates) == 0:
+        return format_response("I did not find any relevant documentation related to your question.")
+
     documents = candidates.text.to_list()
+    sources_url = candidates.url.to_list()
     prompt = engineer_prompt(question, documents)
 
     logger.info(f"querying GPT...")
@@ -58,12 +91,14 @@ def answer_question(question: str, df) -> str:
         GPT Response:\n{response_text}
         """
         )
-        return response_text
+        return format_response(response_text, sources_url)
+
     except Exception as e:
         import traceback
 
         logging.error(traceback.format_exc())
-        return "Oops, something went wrong. Try again later!"
+        response = "Oops, something went wrong. Try again later!"
+        return format_response(response)
 
 
 def load_embeddings(path: str) -> pd.DataFrame:
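
For reference, a minimal usage sketch of the entry points touched by this commit. The function names and signatures come from the diff above; the import path follows the repository layout, while the embeddings file, question text, threshold, and top_k values below are hypothetical stand-ins, not part of this change.

from buster.chatbot import load_embeddings, answer_question

# Load the precomputed document embeddings (hypothetical path).
df = load_embeddings("data/document_embeddings.csv")

# thresh drops candidates whose cosine similarity falls below 0.7, and
# top_k keeps only the 3 best-matching documents before prompting GPT.
answer = answer_question(
    "How do I submit a job to the cluster?",  # hypothetical question
    df,
    top_k=3,
    thresh=0.7,
)

# The result is passed through format_response: the answer text followed
# by the source URLs and the "I'm a bot" footer.
print(answer)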