Spaces:

jerpint
/

buster-dev

Runtime error

App Files Files Community

hbertrand committed on Apr 5, 2023

Commit

6aad21a

•

1 Parent(s): ebace01

PR: source display name (#80)

Browse files

* source display name

* tests

* black

* CR

* isort

Files changed (10) hide show

buster/docparser.py +5 -0
buster/documents/base.py +6 -0
buster/documents/pickle.py +5 -0
buster/documents/sqlite/documents.py +10 -0
buster/documents/sqlite/schema.py +1 -0
buster/retriever/base.py +8 -0
buster/retriever/pickle.py +9 -1
buster/retriever/sqlite.py +13 -1
tests/test_chatbot.py +3 -0
tests/test_documents.py +11 -0

buster/docparser.py CHANGED Viewed

@@ -157,6 +157,11 @@ def documents_to_db(documents: pd.DataFrame, output_filepath: str):
     logger.info(f"Documents saved to: {output_filepath}")
 def generate_embeddings(
     documents: pd.DataFrame,
     output_filepath: str = "documents.db",

     logger.info(f"Documents saved to: {output_filepath}")
+def update_source(source: str, output_filepath: str, display_name: str = None, note: str = None):
+    documents_manager = get_documents_manager_from_extension(output_filepath)(output_filepath)
+    documents_manager.update_source(source, display_name, note)
 def generate_embeddings(
     documents: pd.DataFrame,
     output_filepath: str = "documents.db",

buster/documents/base.py CHANGED Viewed

@@ -8,4 +8,10 @@ import pandas as pd
 class DocumentsManager(ABC):
     @abstractmethod
     def add(self, source: str, df: pd.DataFrame):
         ...

 class DocumentsManager(ABC):
     @abstractmethod
     def add(self, source: str, df: pd.DataFrame):
+        """Write all documents from the dataframe into the db as a new version."""
+        ...
+    @abstractmethod
+    def update_source(self, source: str, display_name: str = None, note: str = None):
+        """Update the display name and/or note of a source. Also create the source if it does not exist."""
         ...

buster/documents/pickle.py CHANGED Viewed

@@ -15,6 +15,7 @@ class DocumentsPickle(DocumentsManager):
             self.documents = None
     def add(self, source: str, df: pd.DataFrame):
         if source is not None:
             df["source"] = source
@@ -27,3 +28,7 @@ class DocumentsPickle(DocumentsManager):
             self.documents = df
         self.documents.to_pickle(self.filepath)

             self.documents = None
     def add(self, source: str, df: pd.DataFrame):
+        """Write all documents from the dataframe into the db as a new version."""
         if source is not None:
             df["source"] = source
             self.documents = df
         self.documents.to_pickle(self.filepath)
+    def update_source(self, source: str, display_name: str = None, note: str = None):
+        """Not supported by the pickle backend: only prints a notice and changes nothing.
+
+        The arguments are accepted to satisfy the `DocumentsManager` interface but are ignored;
+        no source is created or updated.
+        """
+        print("If you need this function, please switch your backend to DocumentsDB.")

buster/documents/sqlite/documents.py CHANGED Viewed

@@ -141,3 +141,13 @@ class DocumentsDB(DocumentsManager):
         sid, vid = self.add_parse(source, (section for section, _ in sections))
         self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
         self.conn.commit()

         sid, vid = self.add_parse(source, (section for section, _ in sections))
         self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
         self.conn.commit()
+    def update_source(self, source: str, display_name: str = None, note: str = None):
+        """Update the display name and/or note of a source. Also create the source if it does not exist."""
+        sid = self.get_source(source)
+        if display_name is not None:
+            self.conn.execute("UPDATE sources SET display_name = ? WHERE id = ?", (display_name, sid))
+        if note is not None:
+            self.conn.execute("UPDATE sources SET note = ? WHERE id = ?", (note, sid))
+        self.conn.commit()

buster/documents/sqlite/schema.py CHANGED Viewed

@@ -6,6 +6,7 @@ import numpy as np
 SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     name TEXT NOT NULL,
     note TEXT,
     UNIQUE(name)
 )"""

 SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     name TEXT NOT NULL,
+    display_name TEXT,
     note TEXT,
     UNIQUE(name)
 )"""

buster/retriever/base.py CHANGED Viewed

@@ -4,11 +4,19 @@ from dataclasses import dataclass
 import pandas as pd
 from openai.embeddings_utils import cosine_similarity
 @dataclass
 class Retriever(ABC):
     @abstractmethod
     def get_documents(self, source: str) -> pd.DataFrame:
         ...
     def retrieve(self, query_embedding: list[float], top_k: int, source: str = None) -> pd.DataFrame:

 import pandas as pd
 from openai.embeddings_utils import cosine_similarity
+ALL_SOURCES = "All"
 @dataclass
 class Retriever(ABC):
     @abstractmethod
     def get_documents(self, source: str) -> pd.DataFrame:
+        """Get all current documents from a given source."""
+        ...
+    @abstractmethod
+    def get_source_display_name(self, source: str) -> str:
+        """Get the display name of a source."""
         ...
     def retrieve(self, query_embedding: list[float], top_k: int, source: str = None) -> pd.DataFrame:

buster/retriever/pickle.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import pandas as pd
-from buster.retriever.base import Retriever
 class PickleRetriever(Retriever):
@@ -9,6 +9,7 @@ class PickleRetriever(Retriever):
         self.documents = pd.read_pickle(filepath)
     def get_documents(self, source: str) -> pd.DataFrame:
         if self.documents is None:
             raise FileNotFoundError(f"No documents found at {self.filepath}. Are you sure this is the correct path?")
@@ -24,3 +25,10 @@ class PickleRetriever(Retriever):
             documents = documents[documents.source == source]
         return documents

 import pandas as pd
+from buster.retriever.base import ALL_SOURCES, Retriever
 class PickleRetriever(Retriever):
         self.documents = pd.read_pickle(filepath)
     def get_documents(self, source: str) -> pd.DataFrame:
+        """Get all current documents from a given source."""
         if self.documents is None:
             raise FileNotFoundError(f"No documents found at {self.filepath}. Are you sure this is the correct path?")
             documents = documents[documents.source == source]
         return documents
+    def get_source_display_name(self, source: str) -> str:
+        """Get the display name of a source."""
+        if source is None:
+            return ALL_SOURCES
+        else:
+            return source

buster/retriever/sqlite.py CHANGED Viewed

@@ -4,7 +4,7 @@ from pathlib import Path
 import pandas as pd
 import buster.documents.sqlite.schema as schema
-from buster.retriever.base import Retriever
 class SQLiteRetriever(Retriever):
@@ -44,3 +44,15 @@ class SQLiteRetriever(Retriever):
         # Convert the results to a pandas DataFrame
         df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
         return df

 import pandas as pd
 import buster.documents.sqlite.schema as schema
+from buster.retriever.base import ALL_SOURCES, Retriever
 class SQLiteRetriever(Retriever):
         # Convert the results to a pandas DataFrame
         df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
         return df
+    def get_source_display_name(self, source: str) -> str:
+        """Get the display name of a source."""
+        if source is "":
+            return ALL_SOURCES
+        else:
+            cur = self.conn.execute("SELECT display_name FROM sources WHERE name = ?", (source,))
+            row = cur.fetchone()
+            if row is None:
+                raise KeyError(f'"{source}" is not a known source')
+            (display_name,) = row
+            return display_name

tests/test_chatbot.py CHANGED Viewed

@@ -49,6 +49,9 @@ class MockRetriever(Retriever):
     def get_documents(self, source):
         return self.documents
 import logging

     def get_documents(self, source):
         return self.documents
+    def get_source_display_name(self, source):
+        """Return the source name unchanged as its display name."""
+        return source
 import logging

tests/test_documents.py CHANGED Viewed

@@ -70,3 +70,14 @@ def test_write_write_read(tmp_path, documents_manager, retriever, extension):
     assert db_data["content"].iloc[0] == data_2["content"].iloc[0]
     assert np.allclose(db_data["embedding"].iloc[0], data_2["embedding"].iloc[0])
     assert db_data["n_tokens"].iloc[0] == data_2["n_tokens"].iloc[0]

     assert db_data["content"].iloc[0] == data_2["content"].iloc[0]
     assert np.allclose(db_data["embedding"].iloc[0], data_2["embedding"].iloc[0])
     assert db_data["n_tokens"].iloc[0] == data_2["n_tokens"].iloc[0]
+def test_update_source(tmp_path):
+    display_name = "Super Test"
+    db = DocumentsDB(tmp_path / "test.db")
+    db.update_source(source="test", display_name=display_name)
+    returned_display_name = SQLiteRetriever(tmp_path / "test.db").get_source_display_name("test")
+    assert display_name == returned_display_name