Spaces:

edugp
/

perplexity-lenses

Runtime error

App Files Community

edugp committed on Nov 14, 2021

Commit

7b62017

1 Parent(s): 0def03f

Run tokenizer before computing perplexity and format

Browse files

Files changed (9) hide show

README.md +3 -3
app.py +39 -9
cli.py +64 -21
perplexity_lenses/data.py +30 -5
perplexity_lenses/engine.py +10 -2
perplexity_lenses/perplexity.py +29 -4
perplexity_lenses/visualization.py +27 -5
requirements.txt +7 -7
tests/test_data.py +3 -1

README.md CHANGED Viewed

@@ -15,13 +15,13 @@ pip install -r requirements.txt
 ```
 # Web App:
-The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
 ```
 python -m streamlit run app.py
 ```
 # CLI:
-The CLI with no arguments defaults to running mc4 in Spanish.
 For full usage:
 ```
 python cli.py --help
@@ -43,4 +43,4 @@ python cli.py \
 # Tests:
 ```
 python -m unittest discover -s ./tests/ -p "test_*.py"
-```

 ```
 # Web App:
+The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses). To run it locally:
 ```
 python -m streamlit run app.py
 ```
 # CLI:
+The CLI with no arguments defaults to running mc4 in Spanish.
 For full usage:
 ```
 python cli.py --help
 # Tests:
 ```
 python -m unittest discover -s ./tests/ -p "test_*.py"
+```

app.py CHANGED Viewed

@@ -3,11 +3,15 @@ from functools import partial
 import streamlit as st
 from embedding_lenses.data import uploaded_file_to_dataframe
-from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
 from embedding_lenses.embedding import load_model
-from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
-from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
 from perplexity_lenses.perplexity import KenlmModel
 logging.basicConfig(level=logging.INFO)
@@ -17,7 +21,9 @@ logger = logging.getLogger(__name__)
 st.title("Perplexity Lenses")
 st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
 uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
-st.write("Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)")
 col1, col2, col3 = st.columns(3)
 with col1:
     hub_dataset = st.text_input("Dataset name", "mc4")
@@ -38,13 +44,17 @@ with col6:
 with col7:
     sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
-dimensionality_reduction = st.selectbox("Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0)
 model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
 with st.spinner(text="Loading embedding model..."):
     model = load_model(model_name)
 dimensionality_reduction_function = (
-    partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction == "UMAP" else partial(get_tsne_embeddings, random_state=SEED)
 )
 with st.spinner(text="Loading KenLM model..."):
@@ -58,12 +68,32 @@ if uploaded_file or hub_dataset:
                 df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
             df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
         else:
-            df = hub_dataset_to_dataframe(hub_dataset, hub_dataset_config, hub_dataset_split, sample, text_column, kenlm_model, seed=SEED, doc_type=doc_type)
     # Round perplexity
     df["perplexity"] = df["perplexity"].round().astype(int)
-    logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
-    plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED, context_logger=st.spinner)
     logger.info("Displaying plot")
     st.bokeh_chart(plot)
     logger.info("Done")

 import streamlit as st
 from embedding_lenses.data import uploaded_file_to_dataframe
+from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
+                                                       get_umap_embeddings)
 from embedding_lenses.embedding import load_model
+from perplexity_lenses.data import (documents_df_to_sentences_df,
+                                    hub_dataset_to_dataframe)
+from perplexity_lenses.engine import (DIMENSIONALITY_REDUCTION_ALGORITHMS,
+                                      DOCUMENT_TYPES, EMBEDDING_MODELS,
+                                      LANGUAGES, SEED, generate_plot)
 from perplexity_lenses.perplexity import KenlmModel
 logging.basicConfig(level=logging.INFO)
 st.title("Perplexity Lenses")
 st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
 uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
+st.write(
+    "Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)"
+)
 col1, col2, col3 = st.columns(3)
 with col1:
     hub_dataset = st.text_input("Dataset name", "mc4")
 with col7:
     sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
+dimensionality_reduction = st.selectbox(
+    "Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0
+)
 model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
 with st.spinner(text="Loading embedding model..."):
     model = load_model(model_name)
 dimensionality_reduction_function = (
+    partial(get_umap_embeddings, random_state=SEED)
+    if dimensionality_reduction == "UMAP"
+    else partial(get_tsne_embeddings, random_state=SEED)
 )
 with st.spinner(text="Loading KenLM model..."):
                 df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
             df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
         else:
+            df = hub_dataset_to_dataframe(
+                hub_dataset,
+                hub_dataset_config,
+                hub_dataset_split,
+                sample,
+                text_column,
+                kenlm_model,
+                seed=SEED,
+                doc_type=doc_type,
+            )
     # Round perplexity
     df["perplexity"] = df["perplexity"].round().astype(int)
+    logger.info(
+        f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}"
+    )
+    plot = generate_plot(
+        df,
+        text_column,
+        "perplexity",
+        None,
+        dimensionality_reduction_function,
+        model,
+        seed=SEED,
+        context_logger=st.spinner,
+    )
     logger.info("Displaying plot")
     st.bokeh_chart(plot)
     logger.info("Done")

cli.py CHANGED Viewed

@@ -2,15 +2,20 @@ import logging
 from functools import partial
 from typing import Optional
 import typer
 from bokeh.plotting import output_file as bokeh_output_file
 from bokeh.plotting import save
 from embedding_lenses.data import uploaded_file_to_dataframe
-from embedding_lenses.dimensionality_reduction import get_tsne_embeddings, get_umap_embeddings
 from embedding_lenses.embedding import load_model
-from perplexity_lenses.data import documents_df_to_sentences_df, hub_dataset_to_dataframe
-from perplexity_lenses.engine import DIMENSIONALITY_REDUCTION_ALGORITHMS, DOCUMENT_TYPES, EMBEDDING_MODELS, LANGUAGES, SEED, generate_plot
 from perplexity_lenses.perplexity import KenlmModel
 logging.basicConfig(level=logging.INFO)
@@ -22,19 +27,36 @@ app = typer.Typer()
 @app.command()
 def main(
-    dataset: str = typer.Option("mc4", help="The name of the hub dataset or local csv/tsv file."),
-    dataset_config: Optional[str] = typer.Option("es", help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files."),
-    dataset_split: Optional[str] = typer.Option("train", help="The dataset split. Does not apply to local csv/tsv files."),
     text_column: str = typer.Option("text", help="The text field name."),
-    language: str = typer.Option("es", help=f"The language of the text. Options: {LANGUAGES}"),
-    doc_type: str = typer.Option("sentence", help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}."),
     sample: int = typer.Option(1000, help="Maximum number of examples to use."),
     dimensionality_reduction: str = typer.Option(
         DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
         help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
     ),
-    model_name: str = typer.Option(EMBEDDING_MODELS[0], help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}"),
-    output_file: str = typer.Option("perplexity.html", help="The name of the output visualization HTML file."),
 ):
     """
     Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
@@ -42,26 +64,47 @@ def main(
     logger.info("Loading embedding model...")
     model = load_model(model_name)
     dimensionality_reduction_function = (
-        partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction.lower() == "umap" else partial(get_tsne_embeddings, random_state=SEED)
     )
     logger.info("Loading KenLM model...")
     kenlm_model = KenlmModel.from_pretrained(language)
     logger.info("Loading dataset...")
     if dataset.endswith(".csv") or dataset.endswith(".tsv"):
-        df = uploaded_file_to_dataframe(dataset)
         if doc_type.lower() == "sentence":
             df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
         df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
     else:
-        df = hub_dataset_to_dataframe(dataset, dataset_config, dataset_split, sample, text_column, kenlm_model, seed=SEED, doc_type=doc_type)
-        # Round perplexity
-        df["perplexity"] = df["perplexity"].round().astype(int)
-        logger.info(f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}")
-        plot = generate_plot(df, text_column, "perplexity", None, dimensionality_reduction_function, model, seed=SEED)
-        logger.info("Saving plot")
-        bokeh_output_file(output_file)
-        save(plot)
-        logger.info("Done")
 if __name__ == "__main__":

 from functools import partial
 from typing import Optional
+import pandas as pd
 import typer
 from bokeh.plotting import output_file as bokeh_output_file
 from bokeh.plotting import save
 from embedding_lenses.data import uploaded_file_to_dataframe
+from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
+                                                       get_umap_embeddings)
 from embedding_lenses.embedding import load_model
+from perplexity_lenses.data import (documents_df_to_sentences_df,
+                                    hub_dataset_to_dataframe)
+from perplexity_lenses.engine import (DIMENSIONALITY_REDUCTION_ALGORITHMS,
+                                      DOCUMENT_TYPES, EMBEDDING_MODELS,
+                                      LANGUAGES, SEED, generate_plot)
 from perplexity_lenses.perplexity import KenlmModel
 logging.basicConfig(level=logging.INFO)
 @app.command()
 def main(
+    dataset: str = typer.Option(
+        "mc4", help="The name of the hub dataset or local csv/tsv file."
+    ),
+    dataset_config: Optional[str] = typer.Option(
+        "es",
+        help="The configuration of the hub dataset, if any. Does not apply to local csv/tsv files.",
+    ),
+    dataset_split: Optional[str] = typer.Option(
+        "train", help="The dataset split. Does not apply to local csv/tsv files."
+    ),
     text_column: str = typer.Option("text", help="The text field name."),
+    language: str = typer.Option(
+        "es", help=f"The language of the text. Options: {LANGUAGES}"
+    ),
+    doc_type: str = typer.Option(
+        "sentence",
+        help=f"Whether to embed at the sentence or document level. Options: {DOCUMENT_TYPES}.",
+    ),
     sample: int = typer.Option(1000, help="Maximum number of examples to use."),
     dimensionality_reduction: str = typer.Option(
         DIMENSIONALITY_REDUCTION_ALGORITHMS[0],
         help=f"Whether to use UMAP or t-SNE for dimensionality reduction. Options: {DIMENSIONALITY_REDUCTION_ALGORITHMS}.",
     ),
+    model_name: str = typer.Option(
+        EMBEDDING_MODELS[0],
+        help=f"The sentence embedding model to use. Options: {EMBEDDING_MODELS}",
+    ),
+    output_file: str = typer.Option(
+        "perplexity.html", help="The name of the output visualization HTML file."
+    ),
 ):
     """
     Perplexity Lenses: Visualize text embeddings in 2D using colors to represent perplexity values.
     logger.info("Loading embedding model...")
     model = load_model(model_name)
     dimensionality_reduction_function = (
+        partial(get_umap_embeddings, random_state=SEED)
+        if dimensionality_reduction.lower() == "umap"
+        else partial(get_tsne_embeddings, random_state=SEED)
     )
     logger.info("Loading KenLM model...")
     kenlm_model = KenlmModel.from_pretrained(language)
     logger.info("Loading dataset...")
     if dataset.endswith(".csv") or dataset.endswith(".tsv"):
+        df = pd.read_csv(dataset, sep="\t" if dataset.endswith(".tsv") else ",")
         if doc_type.lower() == "sentence":
             df = documents_df_to_sentences_df(df, text_column, sample, seed=SEED)
         df["perplexity"] = df[text_column].map(kenlm_model.get_perplexity)
     else:
+        df = hub_dataset_to_dataframe(
+            dataset,
+            dataset_config,
+            dataset_split,
+            sample,
+            text_column,
+            kenlm_model,
+            seed=SEED,
+            doc_type=doc_type,
+        )
+    # Round perplexity
+    df["perplexity"] = df["perplexity"].round().astype(int)
+    logger.info(
+        f"Perplexity range: {df['perplexity'].min()} - {df['perplexity'].max()}"
+    )
+    plot = generate_plot(
+        df,
+        text_column,
+        "perplexity",
+        None,
+        dimensionality_reduction_function,
+        model,
+        seed=SEED,
+    )
+    logger.info("Saving plot")
+    bokeh_output_file(output_file)
+    save(plot)
+    logger.info("Done")
 if __name__ == "__main__":

perplexity_lenses/data.py CHANGED Viewed

@@ -9,7 +9,14 @@ from perplexity_lenses.perplexity import KenlmModel
 def hub_dataset_to_dataframe(
-    path: str, name: str, split: str, sample: int, text_column: str, model: KenlmModel, seed: int = 0, doc_type: str = "Whole document"
 ) -> pd.DataFrame:
     load_dataset_fn = partial(load_dataset, path=path)
     if name:
@@ -18,9 +25,19 @@ def hub_dataset_to_dataframe(
         load_dataset_fn = partial(load_dataset_fn, split=split)
     dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
     if doc_type.lower() == "sentence":
-        dataset = dataset.map(lambda x: [{text_column: sentence, "perplexity": model.get_perplexity(sentence)} for sentence in x[text_column].split("\n")])
     else:
-        dataset = dataset.map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
     instances = []
     count = 0
     for instance in tqdm(dataset, total=sample):
@@ -38,6 +55,14 @@ def hub_dataset_to_dataframe(
     return pd.DataFrame(instances)
-def documents_df_to_sentences_df(df: pd.DataFrame, text_column: str, sample: int, seed: int = 0):
-    df_sentences = pd.DataFrame({text_column: np.array(df[text_column].map(lambda x: x.split("\n")).values.tolist()).flatten()})
     return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)

 def hub_dataset_to_dataframe(
+    path: str,
+    name: str,
+    split: str,
+    sample: int,
+    text_column: str,
+    model: KenlmModel,
+    seed: int = 0,
+    doc_type: str = "Whole document",
 ) -> pd.DataFrame:
     load_dataset_fn = partial(load_dataset, path=path)
     if name:
         load_dataset_fn = partial(load_dataset_fn, split=split)
     dataset = load_dataset_fn(streaming=True).shuffle(buffer_size=10000, seed=seed)
     if doc_type.lower() == "sentence":
+        dataset = dataset.map(
+            lambda x: [
+                {text_column: sentence, "perplexity": model.get_perplexity(sentence)}
+                for sentence in x[text_column].split("\n")
+            ]
+        )
     else:
+        dataset = dataset.map(
+            lambda x: {
+                text_column: x[text_column],
+                "perplexity": model.get_perplexity(x[text_column]),
+            }
+        )
     instances = []
     count = 0
     for instance in tqdm(dataset, total=sample):
     return pd.DataFrame(instances)
+def documents_df_to_sentences_df(
+    df: pd.DataFrame, text_column: str, sample: int, seed: int = 0
+):
+    df_sentences = pd.DataFrame(
+        {
+            text_column: np.array(
+                df[text_column].map(lambda x: x.split("\n")).values.tolist()
+            ).flatten()
+        }
+    )
     return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)

perplexity_lenses/engine.py CHANGED Viewed

@@ -96,7 +96,9 @@ def generate_plot(
     context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
 ) -> Figure:
     if text_column not in df.columns:
-        raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
     if label_column not in df.columns:
         df[label_column] = 0
     df = df.dropna(subset=[text_column, label_column])
@@ -110,6 +112,12 @@ def generate_plot(
         embeddings_2d = dimensionality_reduction_function(embeddings)
     logger.info("Generating figure")
     plot = draw_interactive_scatter_plot(
-        df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
     )
     return plot

     context_logger: Union[st.spinner, ContextLogger] = ContextLogger,
 ) -> Figure:
     if text_column not in df.columns:
+        raise ValueError(
+            f"The specified column name doesn't exist. Columns available: {df.columns.values}"
+        )
     if label_column not in df.columns:
         df[label_column] = 0
     df = df.dropna(subset=[text_column, label_column])
         embeddings_2d = dimensionality_reduction_function(embeddings)
     logger.info("Generating figure")
     plot = draw_interactive_scatter_plot(
+        df[text_column].values,
+        embeddings_2d[:, 0],
+        embeddings_2d[:, 1],
+        encoded_labels.values,
+        df[label_column].values,
+        text_column,
+        label_column,
     )
     return plot

perplexity_lenses/perplexity.py CHANGED Viewed

@@ -5,6 +5,21 @@ import urllib.request
 from typing import Dict
 import kenlm
 class KenlmModel:
@@ -46,32 +61,42 @@ class KenlmModel:
         "►": "-",
     }
     unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
-    non_printing_chars_re = re.compile(f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]")
     def __init__(self, language):
         download_kenlm_model(language)
         try:
             self.model = kenlm.Model(f"{language}.arpa.bin")
         except OSError:
             os.remove(f"{language}.arpa.bin")
             if os.path.exists(f"{language}.sp.model"):
                 os.remove(f"{language}.sp.model")
-            raise OSError("File was corrupt and should have been removed. Please, retry.")
     @classmethod
     def from_pretrained(cls, language: str):
         return cls(language)
     def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
         if normalize_cc_net:
             doc = self.normalize(doc)
         doc_log_score, doc_length = 0, 0
         for line in doc.split("\n"):
             log_score = self.model.score(line)
             length = len(line.split()) + 1
             doc_log_score += log_score
             doc_length += length
-        return 10.0 ** (-doc_log_score / doc_length)
     def normalize(
         self,
@@ -106,7 +131,7 @@ class KenlmModel:
         return "".join(output)
     def replace_unicode_punct(self, text: str) -> str:
-        return "".join((self.unicode_punct.get(c, c) for c in text))
     def remove_unicode_punct(self, text: str) -> str:
         """More aggressive version of replace_unicode_punct but also faster."""

 from typing import Dict
 import kenlm
+import sentencepiece
+class SentencePiece:
+    def __init__(
+        self,
+        model: str,
+    ):
+        super().__init__()
+        self.sp = sentencepiece.SentencePieceProcessor()
+        self.sp.load(str(model))
+    def do(self, text: dict) -> dict:
+        tokenized = self.sp.encode_as_pieces(text)
+        return " ".join(tokenized)
 class KenlmModel:
         "►": "-",
     }
     unicode_punct_re = re.compile(f"[{''.join(unicode_punct.keys())}]")
+    non_printing_chars_re = re.compile(
+        f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
+    )
     def __init__(self, language):
         download_kenlm_model(language)
         try:
             self.model = kenlm.Model(f"{language}.arpa.bin")
+            self.tokenizer = SentencePiece(f"{language}.sp.model")
         except OSError:
             os.remove(f"{language}.arpa.bin")
             if os.path.exists(f"{language}.sp.model"):
                 os.remove(f"{language}.sp.model")
+            raise OSError(
+                "File was corrupt and should have been removed. Please, retry."
+            )
     @classmethod
     def from_pretrained(cls, language: str):
         return cls(language)
+    def pp(self, log_score, length):
+        return 10.0 ** (-log_score / length)
     def get_perplexity(self, doc: str, normalize_cc_net: bool = True):
         if normalize_cc_net:
             doc = self.normalize(doc)
+        # Tokenize (after normalizing): See https://github.com/facebookresearch/cc_net/blob/bda555bd1cf1ee2e0b925363e62a61cd46c8b60d/cc_net/mine.py#L352 for full pipeline
+        doc = self.tokenizer.do(doc)
         doc_log_score, doc_length = 0, 0
         for line in doc.split("\n"):
             log_score = self.model.score(line)
             length = len(line.split()) + 1
             doc_log_score += log_score
             doc_length += length
+        return round(self.pp(doc_log_score, doc_length), 1)
     def normalize(
         self,
         return "".join(output)
     def replace_unicode_punct(self, text: str) -> str:
+        return "".join(self.unicode_punct.get(c, c) for c in text)
     def remove_unicode_punct(self, text: str) -> str:
         """More aggressive version of replace_unicode_punct but also faster."""

perplexity_lenses/visualization.py CHANGED Viewed

@@ -6,7 +6,13 @@ from bokeh.transform import factor_cmap
 def draw_interactive_scatter_plot(
-    texts: np.ndarray, xs: np.ndarray, ys: np.ndarray, values: np.ndarray, labels: np.ndarray, text_column: str, label_column: str
 ) -> Figure:
     # Smooth down values for coloring, by taking the entropy = log10(perplexity) and multiply it by 10000
     values = ((np.log10(values)) * 10000).round().astype(int)
@@ -16,17 +22,33 @@ def draw_interactive_scatter_plot(
     if max_value - min_value == 0:
         values_color = np.ones(len(values))
     else:
-        values_color = ((values - min_value) / (max_value - min_value) * 255).round().astype(int)
     values_color_sorted = sorted(values_color)
     values_list = values.astype(str).tolist()
     values_sorted = sorted(values_list)
     labels_list = labels.astype(str).tolist()
-    source = ColumnDataSource(data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels_list))
-    hover = HoverTool(tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")])
     p = figure(plot_width=800, plot_height=800, tools=[hover])
-    p.circle("x", "y", size=10, source=source, fill_color=factor_cmap("label", palette=[Pallete[id_] for id_ in values_color_sorted], factors=values_sorted))
     p.axis.visible = False
     p.xgrid.grid_line_color = None

 def draw_interactive_scatter_plot(
+    texts: np.ndarray,
+    xs: np.ndarray,
+    ys: np.ndarray,
+    values: np.ndarray,
+    labels: np.ndarray,
+    text_column: str,
+    label_column: str,
 ) -> Figure:
     # Smooth down values for coloring, by taking the entropy = log10(perplexity) and multiply it by 10000
     values = ((np.log10(values)) * 10000).round().astype(int)
     if max_value - min_value == 0:
         values_color = np.ones(len(values))
     else:
+        values_color = (
+            ((values - min_value) / (max_value - min_value) * 255).round().astype(int)
+        )
     values_color_sorted = sorted(values_color)
     values_list = values.astype(str).tolist()
     values_sorted = sorted(values_list)
     labels_list = labels.astype(str).tolist()
+    source = ColumnDataSource(
+        data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels_list)
+    )
+    hover = HoverTool(
+        tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")]
+    )
     p = figure(plot_width=800, plot_height=800, tools=[hover])
+    p.circle(
+        "x",
+        "y",
+        size=10,
+        source=source,
+        fill_color=factor_cmap(
+            "label",
+            palette=[Pallete[id_] for id_ in values_color_sorted],
+            factors=values_sorted,
+        ),
+    )
     p.axis.visible = False
     p.xgrid.grid_line_color = None

requirements.txt CHANGED Viewed

@@ -1,11 +1,11 @@
 huggingface-hub==0.0.19
 streamlit==1.1.0
 transformers==4.11.3
-watchdog==2.1.3
-sentence-transformers==2.0.0
-bokeh==2.2.2
 umap-learn==0.5.2
-numpy==1.20.0
-https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
-https://github.com/kpu/kenlm/archive/master.zip
-typer==0.4.0

+bokeh==2.2.2
+https://files.pythonhosted.org/packages/2f/58/e00d2495b54f4ba97ca31a11aa7e636f80183ccf9b616f7eaa5518d050bb/embedding_lenses-0.5.0-py3-none-any.whl
+https://github.com/kpu/kenlm/archive/master.zip
 huggingface-hub==0.0.19
+numpy==1.20.0
+sentence-transformers==2.0.0
 streamlit==1.1.0
 transformers==4.11.3
+typer==0.4.0
 umap-learn==0.5.2
+watchdog==2.1.3

tests/test_data.py CHANGED Viewed

@@ -10,4 +10,6 @@ class TestData(unittest.TestCase):
         input_df = pd.DataFrame({"text": ["foo\nbar"]})
         expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
         output_df = documents_df_to_sentences_df(input_df, "text", 100)
-        pd.testing.assert_frame_equal(output_df, expected_output_df, check_like=True, check_exact=True)

         input_df = pd.DataFrame({"text": ["foo\nbar"]})
         expected_output_df = pd.DataFrame({"text": ["foo", "bar"]})
         output_df = documents_df_to_sentences_df(input_df, "text", 100)
+        pd.testing.assert_frame_equal(
+            output_df, expected_output_df, check_like=True, check_exact=True
+        )