jerpint committed
Commit 44ee439 (1 parent: c6dd20e)

Add quickstart (#73)


* rename buster.py to busterbot.py; was causing conflicts
* add easy script for generating embeddings
* add README.md with instructions
* add entrypoint
* refactor
* add quickstart example
* ignore .db files

.gitignore CHANGED
@@ -1,3 +1,6 @@
+# database files
+*.db
+
 buster/apps/data/
 # Byte-compiled / optimized / DLL files
 __pycache__/
README.md CHANGED
@@ -11,12 +11,38 @@ pinned: false
 
 # Buster, the QA documentation chatbot!
 
-Buster is a question-answering chatbot that can be tuned to specific documentations. You can try it [here](https://huggingface.co/spaces/jerpint/buster), where it will answer questions about [🤗 Transformers](https://huggingface.co/docs/transformers/index).
-
-![Question: How do I load a Huggingface model?](buster/imgs/qa_web_load.png)
-
-![Question: My code is crashing with "CUDA out of memory". What can I do to solve this?](buster/imgs/qa_web_oom.png)
+Buster is a question-answering chatbot that can be tuned to any source of documentation.
+
+# Demo
+
+You can try out our [live demo here](https://huggingface.co/spaces/jerpint/buster), where it will answer questions about a bunch of libraries we've already scraped, including [🤗 Transformers](https://huggingface.co/docs/transformers/index).
+
+# Quickstart
+
+Here is a quick guide to help you deploy buster on your own dataset!
+
+First, install buster locally. Note that buster requires python>=3.10.
+
+```
+git clone https://github.com/jerpint/buster.git
+pip install -e .
+```
+
+Then, go to the examples folder. We've attached a sample `stackoverflow.csv` file to help you get started. You will convert the .csv to a `documents.db` file:
+
+```
+buster_csv_parser stackoverflow.csv --output-filepath documents.db
+```
+
+This will generate the embeddings and save them locally. Finally, run
+
+```
+gradio gradio_app.py
+```
+
+This will launch the gradio app locally, which you should be able to view at [localhost](http://127.0.0.1:7860).
 
 ## How does Buster work?
 
@@ -31,7 +57,7 @@ Finally, we craft the prompt:
 
 We send the prompt to the [OpenAI API](https://beta.openai.com/docs/api-reference/completions), and display the answer to the user!
 
-### Currently used models
+### Currently available models
 
 - For embeddings: "text-embedding-ada-002"
 - For completion: We support both "text-davinci-003" and "gpt-3.5-turbo"
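Note on the quickstart above: `buster_csv_parser` expects the input `.csv` to have `source`, `title`, `content`, and `url` columns, matching the bundled `stackoverflow.csv` sample and the column check in `buster/docparser.py` further down. A minimal sketch of preparing your own dataset (the file name and document values here are hypothetical):

```
import pandas as pd

# Hypothetical documents: one row per chunk of your documentation.
docs = pd.DataFrame(
    {
        "source": ["my_docs", "my_docs"],  # groups documents in the database
        "title": ["Install guide", "API reference"],
        "content": ["To install, run pip install my-lib ...", "my_lib.run() launches ..."],
        "url": ["https://example.com/install", "https://example.com/api"],
    }
)
docs.to_csv("my_docs.csv")  # then: buster_csv_parser my_docs.csv --output-filepath documents.db
```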
buster/apps/bot_configs.py CHANGED
@@ -1,4 +1,4 @@
-from buster.buster import BusterConfig
+from buster.busterbot import BusterConfig
 
 huggingface_cfg = BusterConfig(
     unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to the huggingface transformers library. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
buster/apps/gradio_app.py CHANGED
@@ -4,7 +4,7 @@ import pathlib
 import gradio as gr
 
 from buster.apps.bot_configs import available_configs
-from buster.buster import Buster, BusterConfig
+from buster.busterbot import Buster, BusterConfig
 from buster.documents.base import DocumentsManager
 from buster.documents.utils import download_db, get_documents_manager_from_extension
 
buster/apps/slackbot.py CHANGED
@@ -3,7 +3,7 @@ import os
 
 from slack_bolt import App
 
-from buster.buster import Buster, BusterConfig
+from buster.busterbot import Buster, BusterConfig
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
buster/{buster.py → busterbot.py} RENAMED
File without changes
buster/docparser.py CHANGED
@@ -1,7 +1,9 @@
 import glob
+import logging
 import os
 from typing import Type
 
+import click
 import numpy as np
 import pandas as pd
 import tiktoken
@@ -11,6 +13,9 @@ from openai.embeddings_utils import get_embedding
 from buster.documents import get_documents_manager_from_extension
 from buster.parser import HuggingfaceParser, Parser, SphinxParser
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002
 
@@ -84,25 +89,89 @@ def get_all_documents(
     return documents_df
 
 
-def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
-    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
+def compute_n_tokens(
+    df: pd.DataFrame, embedding_encoding: str = EMBEDDING_ENCODING, col: str = "content"
+) -> pd.DataFrame:
+    """Count the tokens in the content column and add the count to a n_tokens column."""
+    logger.info("Computing token counts...")
+    encoding = tiktoken.get_encoding(encoding_name=embedding_encoding)
     # TODO are there unexpected consequences of allowing endoftext?
-    df["n_tokens"] = df.content.apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
+    df["n_tokens"] = df[col].apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
     return df
 
 
-def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
-    df["embedding"] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=EMBEDDING_MODEL), dtype=np.float32))
+def max_word_count(df: pd.DataFrame, max_words: int, col: str = "content") -> pd.DataFrame:
+    """Trim the word count of an entry to max_words."""
+    assert df[col].apply(lambda s: isinstance(s, str)).all(), f"Column {col} must contain only strings"
+    word_counts_before = df[col].apply(lambda x: len(x.split()))
+    df[col] = df[col].apply(lambda x: " ".join(x.split()[:max_words]))
+    word_counts_after = df[col].apply(lambda x: len(x.split()))
+
+    trimmed = df[word_counts_before != word_counts_after]
+    logger.info(f"trimmed {len(trimmed)} documents to {max_words} words.")
+
     return df
 
 
-def generate_embeddings(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
-    # Get all documents and precompute their embeddings
+def compute_embeddings(df: pd.DataFrame, engine: str = EMBEDDING_MODEL, col="embedding") -> pd.DataFrame:
+    logger.info(f"Computing embeddings for {len(df)} documents...")
+    df[col] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=engine), dtype=np.float32))
+    logger.info(f"Done computing embeddings for {len(df)} documents.")
+    return df
+
+
+def generate_embeddings_parser(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
     documents = get_all_documents(root_dir, supported_docs[source]["base_url"], supported_docs[source]["parser"])
-    documents = compute_n_tokens(documents)
-    documents = precompute_embeddings(documents)
+    return generate_embeddings(documents, output_filepath)
+
 
+def documents_to_db(documents: pd.DataFrame, output_filepath: str):
+    logger.info("Preparing database...")
     documents_manager = get_documents_manager_from_extension(output_filepath)(output_filepath)
-    documents_manager.add(source, documents)
+    sources = documents["source"].unique()
+    for source in sources:
+        documents_manager.add(source, documents)
+    logger.info(f"Documents saved to: {output_filepath}")
+
+
+def generate_embeddings(
+    documents: pd.DataFrame,
+    output_filepath: str = "documents.db",
+    max_words=500,
+    embedding_engine: str = EMBEDDING_MODEL,
+) -> pd.DataFrame:
+    # check that we have the appropriate columns in our dataframe
+    assert set(required_cols := ["content", "title", "url"]).issubset(
+        set(documents.columns)
+    ), f"Your dataframe must contain {required_cols}."
+
+    # trim word counts, count tokens, and compute embeddings for all documents
+    documents = max_word_count(documents, max_words=max_words)
+    documents = compute_n_tokens(documents)
+    documents = compute_embeddings(documents, engine=embedding_engine)
+
+    # save the documents to a db for later use
+    documents_to_db(documents, output_filepath)
 
     return documents
+
+
+@click.command()
+@click.argument("documents-csv")
+@click.option(
+    "--output-filepath", default="documents.db", help='Where your database will be saved. Default is "documents.db"'
)
+@click.option(
+    "--max-words", default=500, help="Number of maximum allowed words per document, excess is trimmed. Default is 500"
+)
+@click.option(
+    "--embeddings-engine", default=EMBEDDING_MODEL, help=f"Embedding model to use. Default is {EMBEDDING_MODEL}"
+)
+def main(documents_csv: str, output_filepath: str, max_words: int, embeddings_engine: str):
+    documents = pd.read_csv(documents_csv)
+    documents = generate_embeddings(documents, output_filepath, max_words, embeddings_engine)
+
+
+if __name__ == "__main__":
+    main()
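Since `generate_embeddings` now takes a dataframe directly, it can also be driven from Python rather than through the `buster_csv_parser` entry point. A small usage sketch, assuming an OpenAI API key is set in the environment and `my_docs.csv` is a hypothetical file with the schema described earlier:

```
import pandas as pd

from buster.docparser import generate_embeddings

# compute_embeddings calls the OpenAI embeddings API for every row,
# so OPENAI_API_KEY must be set in the environment.
documents = pd.read_csv("my_docs.csv")
documents = generate_embeddings(documents, output_filepath="documents.db", max_words=500)
```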
buster/examples/cfg.py ADDED
@@ -0,0 +1,43 @@
+from buster.busterbot import BusterConfig
+
+documents_filepath = "./documents.db"
+buster_cfg = BusterConfig(
+    unknown_prompt="I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+    embedding_model="text-embedding-ada-002",
+    top_k=3,
+    thresh=0.7,
+    max_words=3000,
+    completer_cfg={
+        "name": "ChatGPT",
+        "text_before_documents": (
+            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
+            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+            "If the answer is in the documentation, summarize it in a helpful way to the user. "
+            "If it isn't, simply reply that you cannot answer the question. "
+            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "Here is the documentation: "
+            "<DOCUMENTS> "
+        ),
+        "text_before_prompt": (
+            "</DOCUMENTS>\n"
+            "REMEMBER:\n"
+            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
+            "Here are the rules you must follow:\n"
+            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
+            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
+            "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
+            "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+            "For example:\n"
+            "What is the meaning of life for a QA bot?\n"
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?\n"
+            "Now answer the following question:\n"
+        ),
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+        },
+    },
+    response_format="gradio",
+    source="stackoverflow",
+)
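The example app below reads this file as-is, so pointing Buster at your own documents should only require editing `cfg.py`. A sketch of the kind of edits you might make; the field names come from the config above, while the `my_docs` values, prompts, and inline comments are illustrative guesses at each field's role:

```
from buster.busterbot import BusterConfig

# Illustrative variant of the example config, retargeted at a hypothetical "my_docs" source.
buster_cfg = BusterConfig(
    unknown_prompt="I'm sorry, I can only answer questions about my_docs.",
    embedding_model="text-embedding-ada-002",
    top_k=3,  # number of matched documents to retrieve
    thresh=0.7,  # similarity threshold below which matches are discarded
    max_words=3000,  # cap on how much documentation is fed into the prompt
    completer_cfg={
        "name": "ChatGPT",
        "text_before_documents": "You are a chatbot answering questions about my_docs. Here is the documentation: ",
        "text_before_prompt": "Answer using only the documentation above. Now answer the following question:\n",
        "completion_kwargs": {"model": "gpt-3.5-turbo"},
    },
    response_format="gradio",
    source="my_docs",  # must match the "source" column of your csv
)
```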
buster/examples/gradio_app.py ADDED
@@ -0,0 +1,59 @@
+import cfg
+import gradio as gr
+
+from buster.busterbot import Buster
+from buster.documents.base import DocumentsManager
+from buster.documents.utils import get_documents_manager_from_extension
+
+# initialize buster with the config in cfg.py (adapt to your needs) ...
+documents: DocumentsManager = get_documents_manager_from_extension(cfg.documents_filepath)(cfg.documents_filepath)
+buster: Buster = Buster(cfg=cfg.buster_cfg, documents=documents)
+
+
+def chat(question, history):
+    history = history or []
+    answer = buster.process_input(question)
+
+    # formatting hack for code blocks to render properly every time
+    answer = answer.replace("```", "\n```\n")
+
+    history.append((question, answer))
+    return history, history
+
+
+block = gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}")
+
+with block:
+    with gr.Row():
+        gr.Markdown("<h3><center>Buster 🤖: A Question-Answering Bot for your documentation</center></h3>")
+
+    chatbot = gr.Chatbot()
+
+    with gr.Row():
+        message = gr.Textbox(
+            label="What's your question?",
+            placeholder="Ask a question to AI stackoverflow here...",
+            lines=1,
+        )
+        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+
+    examples = gr.Examples(
+        examples=[
+            "How can I perform backpropagation?",
+            "How do I deal with noisy data?",
+        ],
+        inputs=message,
+    )
+
+    gr.Markdown("This application uses GPT to search the docs for relevant info and answer questions.")
+
+    gr.HTML("<center> Created with ❤️ by @jerpint and @hadrienbertrand")
+
+    state = gr.State()
+    agent_state = gr.State()
+
+    submit.click(chat, inputs=[message, state], outputs=[chatbot, state])
+    message.submit(chat, inputs=[message, state], outputs=[chatbot, state])
+
+
+block.launch(debug=True, share=False)
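The `chat` callback above is plain Python, so the pipeline can also be smoke-tested without the UI. A minimal sketch, assuming `documents.db` was already generated as in the quickstart and an OpenAI key is configured:

```
import cfg

from buster.busterbot import Buster
from buster.documents.utils import get_documents_manager_from_extension

# Same setup as gradio_app.py, minus the UI.
documents = get_documents_manager_from_extension(cfg.documents_filepath)(cfg.documents_filepath)
buster = Buster(cfg=cfg.buster_cfg, documents=documents)

# One-off question, printed to stdout instead of rendered in a chatbox.
print(buster.process_input("How can I perform backpropagation?"))
```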
buster/examples/stackoverflow.csv ADDED
@@ -0,0 +1,52 @@
+,source,title,content,url
+0,stackoverflow,stackoverflow question #1,"""Backprop"" is the same as ""backpropagation"": it's just a shorter way to say it. It is sometimes abbreviated as ""BP"".
+",https://ai.stackexchange.com/questions/1
+1,stackoverflow,stackoverflow question #2,"Noise in the data, to a reasonable amount, may help the network to generalize better. Sometimes, it has the opposite effect. It partly depends on the kind of noise (""true"" vs. artificial).
+The AI FAQ on ANN gives a good overview. Excerpt:
+
+Noise in the actual data is never a good thing, since it limits the accuracy of generalization that can be achieved no matter how extensive the training set is. On the other hand, injecting artificial noise (jitter) into the inputs during training is one of several ways to improve generalization for smooth functions when you have a small training set.
+
+In some field, such as computer vision, it's common to increase the size of the training set by copying some samples and adding some noises or other transformation.
+",https://ai.stackexchange.com/questions/2
+2,stackoverflow,stackoverflow question #4,"There is no direct way to find the optimal number of them: people empirically try and see (e.g., using cross-validation). The most common search techniques are random, manual, and grid searches.
+There exist more advanced techniques such as Gaussian processes, e.g. Optimizing Neural Network Hyperparameters with Gaussian Processes for Dialog Act Classification, IEEE SLT 2016.
+",https://ai.stackexchange.com/questions/4
+3,stackoverflow,stackoverflow question #6,"It rather depends on how one defines several of the terms used. For example:
+
+Whether the term ""expected"" is interpreted in a formal (i.e.
+statistical) sense.
+Whether it's assumed that humans have any kind of utilitarian
+""performance measure"".
+
+The motivation for this description of ""agent"" arose from a desire to have a quantitative model - it's not clear that such a model is a good fit for human cognition.
+However, there are alternative definitions of agents, for example the BDI model, which are rather more open-ended and hence more obviously applicable to humans.
+",https://ai.stackexchange.com/questions/6
+4,stackoverflow,stackoverflow question #7,"
+To put it simply in layman terms, what are the possible threats from AI?
+
+Currently, there are no threat.
+The threat comes if humans create a so-called ultraintelligent machine, a machine that can surpass all intellectual activities by any human. This would be the last invention man would need to do, since this machine is better in inventing machines than humans are (since that is an intellectual activity). However, this could cause the machine to invent machines that can destruct humans, and we can't stop them because they are so much smarter than we are.
+This is all hypothetical, no one has even a clue of what an ultraintelligent machine looks like.
+
+If we know that AI is so dangerous why are we still promoting it? Why is it not banned?
+
+As I said before, the existence of a ultraintelligent machine is hypothetical. Artificial Intelligence has lots of useful applications (more than this answer can contain), and if we develop it, we get even more useful applications. We just have to be careful that the machines won't overtake us.
+",https://ai.stackexchange.com/questions/7
+5,stackoverflow,stackoverflow question #10,"It's analogous to analogue versus digital, or the many shades of gray in between black and white: when evaluating the truthiness of a result, in binary boolean it's either true or false (0 or 1), but when utilizing fuzzy logic, it's an estimated probability between 0 and 1 (such as 0.75 being mostly probably true). It's useful for making calculated decisions when all information needed isn't necessarily available.
+Wikipedia has a fantastic page for this.
+",https://ai.stackexchange.com/questions/10
+6,stackoverflow,stackoverflow question #15,"The problem of the Turing Test is that it tests the machines ability to resemble humans. Not necessarily every form of AI has to resemble humans. This makes the Turing Test less reliable. However, it is still useful since it is an actual test. It is also noteworthy that there is a prize for passing or coming closest to passing the Turing Test, the Loebner Prize.
+The intelligent agent definition of intelligence states that an agent is intelligent if it acts so to maximize the expected value of a performance measure based on past experience and knowledge. (paraphrased from Wikipedia). This definition is used more often and does not depend on the ability to resemble humans. However, it is harder to test this.
+",https://ai.stackexchange.com/questions/15
+7,stackoverflow,stackoverflow question #17,"The concept of ""the singularity"" is when machines outsmart the humans. Although Stephen Hawking opinion is that this situation is inevitable, but I think it'll be very difficult to reach that point, because every A.I. algorithm needs to be programmed by humans, therefore it would be always more limited than its creator.
+We would probably know when that point when humanity will lose control over Artificial Intelligence where super-smart AI would be in competition with humans and maybe creating more sophisticated intelligent beings occurred, but currently, it's more like science fiction (aka Terminator's Skynet).
+The risk could involve killing people (like self-flying war drones making their own decision), destroying countries or even the whole planet (like A.I. connected to the nuclear weapons (aka WarGames movie), but it doesn't prove the point that the machines would be smarter than humans.
+",https://ai.stackexchange.com/questions/17
+8,stackoverflow,stackoverflow question #26,"I think your question fits nowadays more in the field of Human-Robot Interaction, which relies largely on vision for recognition of gestures and follow movements, as well as soft, natural movements as a response. Note that the movements of the face and hands belong to the most complex tasks, involving many muscles at a time.
+I strongly recommend the film Plug & Pray to have an idea of what people are researching in this area.
+You may also find Eliza (which you can try here) interesting. It is classical in the history of AI and pretends to mimic an analyst (psychology). (I am thinking of Eliza not because of its emotional intelligence, but because it was apparently taken seriously by a couple of humans. Could this be taken as a sort of (approved) Turing test? What does it say about the humans it met?)
+On the purely human end of the scale, I sometimes wonder about our (my) emotional intelligence myself. Would I want to implement such an intelligence in an artificial agent at all?
+",https://ai.stackexchange.com/questions/26
+9,stackoverflow,stackoverflow question #28,"This is probably more a question of philosophy than anything. In terms of how things are commonly defined, I'll say ""yes, genetic algorithms are part of AI"". If you pick up a comprehensive book on artificial intelligence, there will probably be a chapter on genetic algorithms (or more broadly, evolutionary algorithms).
+One area that has been extensively studied in the past is the idea of using genetic algorithms to train neural networks. I don't know if people are still actively researching this topic or not, but it at least illustrates that GA's are part of the overall rubric of AI in one regard.
+",https://ai.stackexchange.com/questions/28
pyproject.toml CHANGED
@@ -10,6 +10,9 @@ readme = "README.md"
 requires-python = ">=3.10"
 dynamic = ["dependencies"]
 
+[project.scripts]
+buster_csv_parser = "buster.docparser:main"
+
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
 
@@ -22,3 +25,4 @@ line-length = 120
 [tool.pytest.ini_options]
 log_cli = true
 log_cli_level = "INFO"
+
requirements.txt CHANGED
@@ -8,6 +8,7 @@ tiktoken
 promptlayer
 pytest
 openai
+click
 
 # all openai[embeddings] deps, their list breaks our CI, see: https://github.com/openai/openai-python/issues/210
 
tests/test_chatbot.py CHANGED
@@ -4,7 +4,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 
-from buster.buster import Buster, BusterConfig
+from buster.busterbot import Buster, BusterConfig
 from buster.completers.base import Completer
 from buster.documents import DocumentsManager, get_documents_manager_from_extension
 from buster.formatter.base import Response
@@ -60,7 +60,7 @@ logging.basicConfig(level=logging.INFO)
 def test_chatbot_mock_data(tmp_path, monkeypatch):
     gpt_expected_answer = "this is GPT answer"
     monkeypatch.setattr(Buster, "get_embedding", lambda self, prompt, engine: get_fake_embedding())
-    monkeypatch.setattr("buster.buster.get_completer", lambda x: MockCompleter(expected_answer=gpt_expected_answer))
+    monkeypatch.setattr("buster.busterbot.get_completer", lambda x: MockCompleter(expected_answer=gpt_expected_answer))
 
     hf_transformers_cfg = BusterConfig(
         unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
tests/test_docparser.py CHANGED
@@ -7,7 +7,9 @@ from buster.documents import get_documents_manager_from_extension
 
 def test_generate_embeddings(tmp_path, monkeypatch):
     # Create fake data
-    data = pd.DataFrame.from_dict({"title": ["test"], "url": ["http://url.com"], "content": ["cool text"]})
+    data = pd.DataFrame.from_dict(
+        {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
+    )
 
     # Patch the get_embedding function to return a fixed embedding
     monkeypatch.setattr("buster.docparser.get_embedding", lambda x, engine: [-0.005, 0.0018])
@@ -15,10 +17,10 @@ def test_generate_embeddings(tmp_path, monkeypatch):
 
     # Generate embeddings, store in a file
     output_file = tmp_path / "test_document_embeddings.tar.gz"
-    df = generate_embeddings(tmp_path, output_file, source="mila")
+    df = generate_embeddings(data, output_file)
 
     # Read the embeddings from the file
-    read_df = get_documents_manager_from_extension(output_file)(output_file).get_documents("mila")
+    read_df = get_documents_manager_from_extension(output_file)(output_file).get_documents("my_source")
 
     # Check all the values are correct across the files
     assert df["title"].iloc[0] == data["title"].iloc[0] == read_df["title"].iloc[0]