Spaces:

jerpint
/

buster-dev

Runtime error

App Files Files Community

hbertrand committed on Mar 29, 2023

Commit

06bca0c

•

1 Parent(s): 44ee439

PR: retriever interface (#77)

Browse files

* retriever interface

* black + isort

* documents -> retriever

* PR

* black

Files changed (17) hide show

buster/apps/gradio_app.py +4 -4
buster/busterbot.py +4 -4
buster/docparser.py +1 -1
buster/documents/__init__.py +1 -2
buster/documents/base.py +0 -19
buster/documents/pickle.py +0 -16
buster/documents/sqlite/documents.py +0 -11
buster/examples/gradio_app.py +4 -4
buster/parser.py +0 -1
buster/retriever/__init__.py +5 -0
buster/retriever/base.py +26 -0
buster/retriever/pickle.py +26 -0
buster/retriever/sqlite.py +46 -0
buster/{documents/utils.py → utils.py} +13 -3
tests/test_chatbot.py +11 -13
tests/test_docparser.py +6 -4
tests/test_documents.py +13 -6

buster/apps/gradio_app.py CHANGED Viewed

@@ -5,19 +5,19 @@ import gradio as gr
 from buster.apps.bot_configs import available_configs
 from buster.busterbot import Buster, BusterConfig
-from buster.documents.base import DocumentsManager
-from buster.documents.utils import download_db, get_documents_manager_from_extension
 DEFAULT_CONFIG = "huggingface"
 DB_URL = "https://huggingface.co/datasets/jerpint/buster-data/resolve/main/documents.db"
 # Download the db...
 documents_filepath = download_db(db_url=DB_URL, output_dir="./data")
-documents: DocumentsManager = get_documents_manager_from_extension(documents_filepath)(documents_filepath)
 # initialize buster with the default config...
 default_cfg: BusterConfig = available_configs.get(DEFAULT_CONFIG)
-buster = Buster(cfg=default_cfg, documents=documents)
 def chat(question, history, bot_source):

 from buster.apps.bot_configs import available_configs
 from buster.busterbot import Buster, BusterConfig
+from buster.retriever import Retriever
+from buster.utils import download_db, get_retriever_from_extension
 DEFAULT_CONFIG = "huggingface"
 DB_URL = "https://huggingface.co/datasets/jerpint/buster-data/resolve/main/documents.db"
 # Download the db...
 documents_filepath = download_db(db_url=DB_URL, output_dir="./data")
+retriever: Retriever = get_retriever_from_extension(documents_filepath)(documents_filepath)
 # initialize buster with the default config...
 default_cfg: BusterConfig = available_configs.get(DEFAULT_CONFIG)
+buster = Buster(cfg=default_cfg, retriever=retriever)
 def chat(question, history, bot_source):

buster/busterbot.py CHANGED Viewed

@@ -64,16 +64,16 @@ class BusterConfig:
     source: str = ""
-from buster.documents.base import DocumentsManager
 class Buster:
-    def __init__(self, cfg: BusterConfig, documents: DocumentsManager):
         self._unk_embedding = None
         self.cfg = cfg
         self.update_cfg(cfg)
-        self.documents = documents
     @property
     def unk_embedding(self):
@@ -117,7 +117,7 @@ class Buster:
             query,
             engine=engine,
         )
-        matched_documents = self.documents.retrieve(query_embedding, top_k=top_k, source=source)
         # log matched_documents to the console
         logger.info(f"matched documents before thresh: {matched_documents}")

     source: str = ""
+from buster.retriever import Retriever
 class Buster:
+    def __init__(self, cfg: BusterConfig, retriever: Retriever):
         self._unk_embedding = None
         self.cfg = cfg
         self.update_cfg(cfg)
+        self.retriever = retriever
     @property
     def unk_embedding(self):
             query,
             engine=engine,
         )
+        matched_documents = self.retriever.retrieve(query_embedding, top_k=top_k, source=source)
         # log matched_documents to the console
         logger.info(f"matched documents before thresh: {matched_documents}")

buster/docparser.py CHANGED Viewed

@@ -10,8 +10,8 @@ import tiktoken
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
-from buster.documents import get_documents_manager_from_extension
 from buster.parser import HuggingfaceParser, Parser, SphinxParser
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
 from buster.parser import HuggingfaceParser, Parser, SphinxParser
+from buster.utils import get_documents_manager_from_extension
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

buster/documents/__init__.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from .base import DocumentsManager
 from .pickle import DocumentsPickle
 from .sqlite import DocumentsDB
-from .utils import get_documents_manager_from_extension
-__all__ = [DocumentsManager, DocumentsPickle, DocumentsDB, get_documents_manager_from_extension]

 from .base import DocumentsManager
 from .pickle import DocumentsPickle
 from .sqlite import DocumentsDB
+__all__ = [DocumentsManager, DocumentsPickle, DocumentsDB]

buster/documents/base.py CHANGED Viewed

@@ -2,7 +2,6 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass
 import pandas as pd
-from openai.embeddings_utils import cosine_similarity
 @dataclass
@@ -10,21 +9,3 @@ class DocumentsManager(ABC):
     @abstractmethod
     def add(self, source: str, df: pd.DataFrame):
         ...
-    @abstractmethod
-    def get_documents(self, source: str) -> pd.DataFrame:
-        ...
-    def retrieve(self, query_embedding: list[float], top_k: int, source: str = None) -> pd.DataFrame:
-        documents = self.get_documents(source)
-        documents["similarity"] = documents.embedding.apply(lambda x: cosine_similarity(x, query_embedding))
-        # sort the matched_documents by score
-        matched_documents = documents.sort_values("similarity", ascending=False)
-        # limit search to top_k matched_documents.
-        top_k = len(matched_documents) if top_k == -1 else top_k
-        matched_documents = matched_documents.head(top_k)
-        return matched_documents

 from dataclasses import dataclass
 import pandas as pd
 @dataclass
     @abstractmethod
     def add(self, source: str, df: pd.DataFrame):
         ...

buster/documents/pickle.py CHANGED Viewed

@@ -27,19 +27,3 @@ class DocumentsPickle(DocumentsManager):
             self.documents = df
         self.documents.to_pickle(self.filepath)
-    def get_documents(self, source: str) -> pd.DataFrame:
-        if self.documents is None:
-            raise FileNotFoundError(f"No documents found at {self.filepath}. Are you sure this is the correct path?")
-        documents = self.documents.copy()
-        if "current" in documents.columns:
-            documents = documents[documents.current == 1]
-            # Drop the `current` column
-            documents.drop(columns=["current"], inplace=True)
-        if source is not None and "source" in documents.columns:
-            documents = documents[documents.source == source]
-        return documents


27	self.documents = df
28
29	self.documents.to_pickle(self.filepath)

buster/documents/sqlite/documents.py CHANGED Viewed

@@ -33,7 +33,6 @@ class DocumentsDB(DocumentsManager):
     Example:
         >>> db = DocumentsDB("/path/to/the/db.db")
         >>> db.add("source", df)  # df is a DataFrame containing the documents from a given source, obtained e.g. by using buster.docparser.generate_embeddings
-        >>> df = db.get_documents("source")
     """
     def __init__(self, db_path: sqlite3.Connection | str):
@@ -142,13 +141,3 @@ class DocumentsDB(DocumentsManager):
         sid, vid = self.add_parse(source, (section for section, _ in sections))
         self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
         self.conn.commit()
-    def get_documents(self, source: str) -> pd.DataFrame:
-        """Get all current documents from a given source."""
-        # Execute the SQL statement and fetch the results
-        results = self.conn.execute("SELECT * FROM documents WHERE source = ?", (source,))
-        rows = results.fetchall()
-        # Convert the results to a pandas DataFrame
-        df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
-        return df

     Example:
         >>> db = DocumentsDB("/path/to/the/db.db")
         >>> db.add("source", df)  # df is a DataFrame containing the documents from a given source, obtained e.g. by using buster.docparser.generate_embeddings
     """
     def __init__(self, db_path: sqlite3.Connection | str):
         sid, vid = self.add_parse(source, (section for section, _ in sections))
         self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
         self.conn.commit()

buster/examples/gradio_app.py CHANGED Viewed

@@ -2,12 +2,12 @@ import cfg
 import gradio as gr
 from buster.busterbot import Buster
-from buster.documents.base import DocumentsManager
-from buster.documents.utils import get_documents_manager_from_extension
 # initialize buster with the config in config.py (adapt to your needs) ...
-documents: DocumentsManager = get_documents_manager_from_extension(cfg.documents_filepath)(cfg.documents_filepath)
-buster: Buster = Buster(cfg=cfg.buster_cfg, documents=documents)
 def chat(question, history):

 import gradio as gr
 from buster.busterbot import Buster
+from buster.retriever import Retriever
+from buster.utils import get_retriever_from_extension
 # initialize buster with the config in config.py (adapt to your needs) ...
+retriever: Retriever = get_retriever_from_extension(cfg.documents_filepath)(cfg.documents_filepath)
+buster: Buster = Buster(cfg=cfg.buster_cfg, retriever=retriever)
 def chat(question, history):

buster/parser.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import math
 import os
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field

 import os
 from abc import ABC, abstractmethod
 from dataclasses import InitVar, dataclass, field

buster/retriever/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .base import Retriever
+from .pickle import PickleRetriever
+from .sqlite import SQLiteRetriever
+__all__ = [Retriever, PickleRetriever, SQLiteRetriever]

buster/retriever/base.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import pandas as pd
+from openai.embeddings_utils import cosine_similarity
+@dataclass
+class Retriever(ABC):
+    """Abstract base for objects that fetch documents and rank them against a query embedding.
+    Subclasses implement `get_documents`; `retrieve` scores and truncates whatever it returns.
+    """
+    @abstractmethod
+    def get_documents(self, source: str) -> pd.DataFrame:
+        """Return the candidate documents for `source` as a DataFrame.
+        `retrieve` reads an `embedding` column from the returned frame.
+        """
+        ...
+    def retrieve(self, query_embedding: list[float], top_k: int, source: str | None = None) -> pd.DataFrame:
+        """Return the documents most similar to `query_embedding`, best match first.
+        Args:
+            query_embedding: Embedding compared against each row's `embedding` via cosine similarity.
+            top_k: Maximum number of rows to return; -1 returns every row.
+            source: Forwarded unchanged to `get_documents`.
+        Returns:
+            A new DataFrame with an added `similarity` column, sorted in descending similarity.
+        """
+        documents = self.get_documents(source)
+        # assign() builds a new frame, so a DataFrame that a subclass returns by reference
+        # (rather than as a copy) does not get a `similarity` column written into it.
+        scored = documents.assign(
+            similarity=documents.embedding.apply(lambda x: cosine_similarity(x, query_embedding))
+        )
+        # sort the matched_documents by score
+        matched_documents = scored.sort_values("similarity", ascending=False)
+        # limit search to top_k matched_documents.
+        top_k = len(matched_documents) if top_k == -1 else top_k
+        return matched_documents.head(top_k)

buster/retriever/pickle.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import pandas as pd
+from buster.retriever.base import Retriever
+class PickleRetriever(Retriever):
+    """Retriever that reads its documents from a pickled pandas DataFrame."""
+    def __init__(self, filepath: str):
+        """Load the pickled DataFrame stored at `filepath`."""
+        self.filepath = filepath
+        # NOTE(security): unpickling can execute arbitrary code; only load files from a trusted origin.
+        self.documents = pd.read_pickle(filepath)
+    def get_documents(self, source: str) -> pd.DataFrame:
+        """Return a filtered copy of the loaded documents.
+        Args:
+            source: Keep only rows whose `source` column equals this value. Ignored when it is
+                None or when the frame has no `source` column.
+        Returns:
+            A DataFrame independent of the one held on the instance.
+        Raises:
+            FileNotFoundError: If no documents were loaded.
+        """
+        if self.documents is None:
+            raise FileNotFoundError(f"No documents found at {self.filepath}. Are you sure this is the correct path?")
+        documents = self.documents.copy()
+        # The `current` column exists when multiple versions of a document exist
+        if "current" in documents.columns:
+            # Keep the rows flagged as current, then drop the flag column. A non-inplace drop
+            # avoids mutating a boolean-indexed slice, which can trigger pandas' chained-assignment warning.
+            documents = documents[documents.current == 1].drop(columns=["current"])
+        if source is not None and "source" in documents.columns:
+            documents = documents[documents.source == source]
+        return documents

buster/retriever/sqlite.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import sqlite3
+from pathlib import Path
+import pandas as pd
+import buster.documents.sqlite.schema as schema
+from buster.retriever.base import Retriever
+class SQLiteRetriever(Retriever):
+    """Simple SQLite database for retrieval of documents.
+    The database is just a file on disk. It can store documents from different sources, and it
+    can store multiple versions of the same document (e.g. if the document is updated).
+    Example:
+        >>> db = SQLiteRetriever("/path/to/the/db.db")
+        >>> df = db.get_documents("source")
+    """
+    def __init__(self, db_path: sqlite3.Connection | str | Path):
+        """Open the database, or adopt an existing connection.
+        Args:
+            db_path: Path to the database file, or an already-open `sqlite3.Connection`.
+                A connection passed in by the caller is never closed by this object.
+        """
+        if isinstance(db_path, (str, Path)):
+            self.db_path = db_path
+            # PARSE_DECLTYPES applies registered converters based on declared column types;
+            # check_same_thread=False lets the connection be used from threads other than its creator.
+            self.conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
+        else:
+            # db_path is None marks the connection as caller-owned (see __del__).
+            self.db_path = None
+            self.conn = db_path
+        # NOTE(review): presumably these create the tables/views if missing - confirm in buster.documents.sqlite.schema.
+        schema.initialize_db(self.conn)
+        schema.setup_db(self.conn)
+    def __del__(self):
+        # Close only connections this object opened. getattr guards against a partially
+        # initialised instance, e.g. when sqlite3.connect raised inside __init__.
+        if getattr(self, "db_path", None) is not None:
+            conn = getattr(self, "conn", None)
+            if conn is not None:
+                conn.close()
+    def get_documents(self, source: str) -> pd.DataFrame:
+        """Get all current documents from a given source.
+        Args:
+            source: Source name to filter on. An empty string or None selects every source.
+        Returns:
+            A DataFrame with one row per matching entry of `documents`, with columns named
+            after the query's result columns.
+        """
+        # Execute the SQL statement and fetch the results.
+        # Compare by truthiness, not `is ""`: identity against a literal depends on string
+        # interning, and None would otherwise become `source = NULL`, which matches no row.
+        if not source:
+            results = self.conn.execute("SELECT * FROM documents")
+        else:
+            results = self.conn.execute("SELECT * FROM documents WHERE source = ?", (source,))
+        rows = results.fetchall()
+        # Convert the results to a pandas DataFrame
+        df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
+        return df

buster/{documents/utils.py → utils.py} RENAMED Viewed

@@ -2,9 +2,8 @@ import os
 import urllib.request
 from typing import Type
-from buster.documents.base import DocumentsManager
-from buster.documents.pickle import DocumentsPickle
-from buster.documents.sqlite import DocumentsDB
 PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]
@@ -34,3 +33,14 @@ def get_documents_manager_from_extension(filepath: str) -> Type[DocumentsManager
         return DocumentsDB
     else:
         raise ValueError(f"Unsupported format: {ext}.")

 import urllib.request
 from typing import Type
+from buster.documents import DocumentsDB, DocumentsManager, DocumentsPickle
+from buster.retriever import PickleRetriever, Retriever, SQLiteRetriever
 PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]
         return DocumentsDB
     else:
         raise ValueError(f"Unsupported format: {ext}.")
+def get_retriever_from_extension(filepath: str) -> Type[Retriever]:
+    """Return the Retriever class that matches the extension of `filepath`.
+    The extension is whatever `get_file_extension` reports: values listed in
+    `PICKLE_EXTENSIONS` map to `PickleRetriever`, and ".db" maps to `SQLiteRetriever`.
+    Raises:
+        ValueError: If the extension is neither of the above.
+    """
+    suffix = get_file_extension(filepath)
+    if suffix in PICKLE_EXTENSIONS:
+        return PickleRetriever
+    if suffix == ".db":
+        return SQLiteRetriever
+    # Nothing matched: report the offending extension to the caller.
+    raise ValueError(f"Unsupported format: {suffix}.")

tests/test_chatbot.py CHANGED Viewed

@@ -6,8 +6,9 @@ import pandas as pd
 from buster.busterbot import Buster, BusterConfig
 from buster.completers.base import Completer
-from buster.documents import DocumentsManager, get_documents_manager_from_extension
 from buster.formatter.base import Response
 TEST_DATA_DIR = Path(__file__).resolve().parent / "data"
 DOCUMENTS_FILE = os.path.join(str(TEST_DATA_DIR), "document_embeddings_huggingface_subset.tar.gz")
@@ -29,7 +30,7 @@ class MockCompleter(Completer):
         return Response(self.expected_answer)
-class DocumentsMock(DocumentsManager):
     def __init__(self, filepath):
         self.filepath = filepath
@@ -45,9 +46,6 @@ class DocumentsMock(DocumentsManager):
             }
         )
-    def add(self, documents):
-        pass
     def get_documents(self, source):
         return self.documents
@@ -90,8 +88,8 @@ def test_chatbot_mock_data(tmp_path, monkeypatch):
         },
     )
     filepath = tmp_path / "not_a_real_file.tar.gz"
-    documents = DocumentsMock(filepath)
-    buster = Buster(cfg=hf_transformers_cfg, documents=documents)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
     assert answer.startswith(gpt_expected_answer)
@@ -119,8 +117,8 @@ def test_chatbot_real_data__chatGPT():
             },
         },
     )
-    documents = get_documents_manager_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
-    buster = Buster(cfg=hf_transformers_cfg, documents=documents)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
@@ -153,8 +151,8 @@ def test_chatbot_real_data__chatGPT_OOD():
         },
         response_format="gradio",
     )
-    documents = get_documents_manager_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
-    buster = Buster(cfg=buster_cfg, documents=documents)
     answer = buster.process_input("What is a good recipe for brocolli soup?")
     assert isinstance(answer, str)
     assert buster_cfg.unknown_prompt in answer
@@ -187,7 +185,7 @@ def test_chatbot_real_data__GPT():
             },
         },
     )
-    documents = get_documents_manager_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
-    buster = Buster(cfg=hf_transformers_cfg, documents=documents)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)

 from buster.busterbot import Buster, BusterConfig
 from buster.completers.base import Completer
 from buster.formatter.base import Response
+from buster.retriever import Retriever
+from buster.utils import get_retriever_from_extension
 TEST_DATA_DIR = Path(__file__).resolve().parent / "data"
 DOCUMENTS_FILE = os.path.join(str(TEST_DATA_DIR), "document_embeddings_huggingface_subset.tar.gz")
         return Response(self.expected_answer)
+class MockRetriever(Retriever):
     def __init__(self, filepath):
         self.filepath = filepath
             }
         )
     def get_documents(self, source):
         return self.documents
         },
     )
     filepath = tmp_path / "not_a_real_file.tar.gz"
+    retriever = MockRetriever(filepath)
+    buster = Buster(cfg=hf_transformers_cfg, retriever=retriever)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
     assert answer.startswith(gpt_expected_answer)
             },
         },
     )
+    retriever = get_retriever_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
+    buster = Buster(cfg=hf_transformers_cfg, retriever=retriever)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)
         },
         response_format="gradio",
     )
+    retriever = get_retriever_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
+    buster = Buster(cfg=buster_cfg, retriever=retriever)
     answer = buster.process_input("What is a good recipe for brocolli soup?")
     assert isinstance(answer, str)
     assert buster_cfg.unknown_prompt in answer
             },
         },
     )
+    retriever = get_retriever_from_extension(DOCUMENTS_FILE)(DOCUMENTS_FILE)
+    buster = Buster(cfg=hf_transformers_cfg, retriever=retriever)
     answer = buster.process_input("What is a transformer?")
     assert isinstance(answer, str)

tests/test_docparser.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import numpy as np
 import pandas as pd
 from buster.docparser import generate_embeddings
-from buster.documents import get_documents_manager_from_extension
-def test_generate_embeddings(tmp_path, monkeypatch):
     # Create fake data
     data = pd.DataFrame.from_dict(
         {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
@@ -16,11 +18,11 @@ def test_generate_embeddings(tmp_path, monkeypatch):
     monkeypatch.setattr("buster.docparser.get_all_documents", lambda a, b, c: data)
     # Generate embeddings, store in a file
-    output_file = tmp_path / "test_document_embeddings.tar.gz"
     df = generate_embeddings(data, output_file)
     # Read the embeddings from the file
-    read_df = get_documents_manager_from_extension(output_file)(output_file).get_documents("my_source")
     # Check all the values are correct across the files
     assert df["title"].iloc[0] == data["title"].iloc[0] == read_df["title"].iloc[0]

 import numpy as np
 import pandas as pd
+import pytest
 from buster.docparser import generate_embeddings
+from buster.utils import get_retriever_from_extension
+@pytest.mark.parametrize("extension", ["db", "tar.gz"])
+def test_generate_embeddings(tmp_path, monkeypatch, extension):
     # Create fake data
     data = pd.DataFrame.from_dict(
         {"title": ["test"], "url": ["http://url.com"], "content": ["cool text"], "source": ["my_source"]}
     monkeypatch.setattr("buster.docparser.get_all_documents", lambda a, b, c: data)
     # Generate embeddings, store in a file
+    output_file = tmp_path / f"test_document_embeddings.{extension}"
     df = generate_embeddings(data, output_file)
     # Read the embeddings from the file
+    read_df = get_retriever_from_extension(output_file)(output_file).get_documents("my_source")
     # Check all the values are correct across the files
     assert df["title"].iloc[0] == data["title"].iloc[0] == read_df["title"].iloc[0]

tests/test_documents.py CHANGED Viewed

@@ -3,10 +3,14 @@ import pandas as pd
 import pytest
 from buster.documents import DocumentsDB, DocumentsPickle
-@pytest.mark.parametrize("documents_manager, extension", [(DocumentsDB, "db"), (DocumentsPickle, "tar.gz")])
-def test_write_read(tmp_path, documents_manager, extension):
     db = documents_manager(tmp_path / f"test.{extension}")
     data = pd.DataFrame.from_dict(
@@ -20,7 +24,7 @@ def test_write_read(tmp_path, documents_manager, extension):
     )
     db.add(source="test", df=data)
-    db_data = db.get_documents("test")
     assert db_data["title"].iloc[0] == data["title"].iloc[0]
     assert db_data["url"].iloc[0] == data["url"].iloc[0]
@@ -29,8 +33,11 @@ def test_write_read(tmp_path, documents_manager, extension):
     assert db_data["n_tokens"].iloc[0] == data["n_tokens"].iloc[0]
-@pytest.mark.parametrize("documents_manager, extension", [(DocumentsDB, "db"), (DocumentsPickle, "tar.gz")])
-def test_write_write_read(tmp_path, documents_manager, extension):
     db = documents_manager(tmp_path / f"test.{extension}")
     data_1 = pd.DataFrame.from_dict(
@@ -55,7 +62,7 @@ def test_write_write_read(tmp_path, documents_manager, extension):
     )
     db.add(source="test", df=data_2)
-    db_data = db.get_documents("test")
     assert len(db_data) == len(data_2)
     assert db_data["title"].iloc[0] == data_2["title"].iloc[0]

 import pytest
 from buster.documents import DocumentsDB, DocumentsPickle
+from buster.retriever import PickleRetriever, SQLiteRetriever
+@pytest.mark.parametrize(
+    "documents_manager, retriever, extension",
+    [(DocumentsDB, SQLiteRetriever, "db"), (DocumentsPickle, PickleRetriever, "tar.gz")],
+)
+def test_write_read(tmp_path, documents_manager, retriever, extension):
     db = documents_manager(tmp_path / f"test.{extension}")
     data = pd.DataFrame.from_dict(
     )
     db.add(source="test", df=data)
+    db_data = retriever(tmp_path / f"test.{extension}").get_documents("test")
     assert db_data["title"].iloc[0] == data["title"].iloc[0]
     assert db_data["url"].iloc[0] == data["url"].iloc[0]
     assert db_data["n_tokens"].iloc[0] == data["n_tokens"].iloc[0]
+@pytest.mark.parametrize(
+    "documents_manager, retriever, extension",
+    [(DocumentsDB, SQLiteRetriever, "db"), (DocumentsPickle, PickleRetriever, "tar.gz")],
+)
+def test_write_write_read(tmp_path, documents_manager, retriever, extension):
     db = documents_manager(tmp_path / f"test.{extension}")
     data_1 = pd.DataFrame.from_dict(
     )
     db.add(source="test", df=data_2)
+    db_data = retriever(tmp_path / f"test.{extension}").get_documents("test")
     assert len(db_data) == len(data_2)
     assert db_data["title"].iloc[0] == data_2["title"].iloc[0]