Spaces:

edugp
/

perplexity-lenses

Runtime error

App Files Files Community

edugp commited on Nov 3, 2021

Commit

1f30dbc

1 Parent(s): 77d22a6

Inital commit for perplexity lenses

Browse files

Files changed (4) hide show

app.py +141 -0
data.py +28 -0
perplexity.py +37 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import logging
+from functools import partial
+from typing import Callable, Optional
+import pandas as pd
+import streamlit as st
+from bokeh.plotting import Figure
+from embedding_lenses.data import uploaded_file_to_dataframe
+from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
+                                                       get_umap_embeddings)
+from embedding_lenses.embedding import embed_text, load_model
+from embedding_lenses.utils import encode_labels
+from embedding_lenses.visualization import draw_interactive_scatter_plot
+from sentence_transformers import SentenceTransformer
+from data import hub_dataset_to_dataframe
+from perplexity import KenlmModel
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+EMBEDDING_MODELS = ["distiluse-base-multilingual-cased-v1", "all-mpnet-base-v2", "flax-sentence-embeddings/all_datasets_v3_mpnet-base"]
+DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]
+LANGUAGES = [
+    "af",
+    "ar",
+    "az",
+    "be",
+    "bg",
+    "bn",
+    "ca",
+    "cs",
+    "da",
+    "de",
+    "el",
+    "en",
+    "es",
+    "et",
+    "fa",
+    "fi",
+    "fr",
+    "gu",
+    "he",
+    "hi",
+    "hr",
+    "hu",
+    "hy",
+    "id",
+    "is",
+    "it",
+    "ja",
+    "ka",
+    "kk",
+    "km",
+    "kn",
+    "ko",
+    "lt",
+    "lv",
+    "mk",
+    "ml",
+    "mn",
+    "mr",
+    "my",
+    "ne",
+    "nl",
+    "no",
+    "pl",
+    "pt",
+    "ro",
+    "ru",
+    "uk",
+    "zh",
+]
+SEED = 0
+def generate_plot(
+    df: pd.DataFrame,
+    text_column: str,
+    label_column: str,
+    sample: Optional[int],
+    dimensionality_reduction_function: Callable,
+    model: SentenceTransformer,
+) -> Figure:
+    if text_column not in df.columns:
+        raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
+    if label_column not in df.columns:
+        df[label_column] = 0
+    df = df.dropna(subset=[text_column, label_column])
+    if sample:
+        df = df.sample(min(sample, df.shape[0]), random_state=SEED)
+    with st.spinner(text="Embedding text..."):
+        embeddings = embed_text(df[text_column].values.tolist(), model)
+    logger.info("Encoding labels")
+    encoded_labels = encode_labels(df[label_column])
+    with st.spinner("Reducing dimensionality..."):
+        embeddings_2d = dimensionality_reduction_function(embeddings)
+    logger.info("Generating figure")
+    plot = draw_interactive_scatter_plot(
+        df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
+    )
+    return plot
+st.title("Perplexity Lenses")
+st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
+uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
+st.write("Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)")
+col1, col2, col3 = st.columns(3)
+with col1:
+    hub_dataset = st.text_input("Dataset name", "mc4")
+with col2:
+    hub_dataset_config = st.text_input("Dataset configuration", "es")
+with col3:
+    hub_dataset_split = st.text_input("Dataset split", "train")
+text_column = st.text_input("Text column name", "text")
+language = st.selectbox("Language", LANGUAGES, 12)
+sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
+dimensionality_reduction = st.selectbox("Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0)
+model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
+with st.spinner(text="Loading embedding model..."):
+    model = load_model(model_name)
+dimensionality_reduction_function = (
+    partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction == "UMAP" else partial(get_tsne_embeddings, random_state=SEED)
+)
+with st.spinner(text="Loading KenLM model..."):
+    kenlm_model = KenlmModel.from_pretrained(language)
+if uploaded_file or hub_dataset:
+    with st.spinner("Loading dataset..."):
+        if uploaded_file:
+            df = uploaded_file_to_dataframe(uploaded_file)
+            df["perplexity"] = df[text_column].map(lambda x: model.get_perplexity(x[text_column]))
+        else:
+            df = hub_dataset_to_dataframe(hub_dataset, hub_dataset_config, hub_dataset_split, sample, text_column, kenlm_model, seed=SEED)
+    plot = generate_plot(df, text_column, "perplexity", sample, dimensionality_reduction_function, model)
+    logger.info("Displaying plot")
+    st.bokeh_chart(plot)
+    logger.info("Done")

data.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from functools import partial
+import pandas as pd
+from datasets import load_dataset
+from tqdm import tqdm
+from perplexity import KenlmModel
+def hub_dataset_to_dataframe(path: str, name: str, split: str, sample: int, text_column: str, model: KenlmModel, seed: int = 0) -> pd.DataFrame:
+    load_dataset_fn = partial(load_dataset, path=path)
+    if name:
+        load_dataset_fn = partial(load_dataset_fn, name=name)
+    if split:
+        load_dataset_fn = partial(load_dataset_fn, split=split)
+    dataset = (
+        load_dataset_fn(streaming=True)
+        .shuffle(buffer_size=10000, seed=seed)
+        .map(lambda x: {text_column: x[text_column], "perplexity": model.get_perplexity(x[text_column])})
+    )
+    instances = []
+    count = 0
+    for instance in tqdm(dataset, total=sample):
+        instances.append(instance)
+        count += 1
+        if count == sample:
+            break
+    return pd.DataFrame(instances)

perplexity.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import os
+import urllib.request
+import kenlm
+class KenlmModel:
+    def __init__(self, language):
+        download_kenlm_model(language)
+        self.model = kenlm.Model(f"{language}.arpa.bin")
+    @classmethod
+    def from_pretrained(cls, language: str):
+        return cls(language)
+    def get_perplexity(self, doc: str):
+        doc_log_score, doc_length = 0, 0
+        for line in doc.split("\n"):
+            log_score = self.model.score(line)
+            length = len(line.split()) + 1
+            doc_log_score += log_score
+            doc_length += length
+        return 10.0 ** (-doc_log_score / doc_length)
+def download_kenlm_model(language: str):
+    root_url = "http://dl.fbaipublicfiles.com/cc_net/lm"
+    bin_name = f"{language}.arpa.bin"
+    model_name = f"{language}.sp.model"
+    bin_url = f"{root_url}/{bin_name}"
+    model_url = f"{root_url}/{model_name}"
+    if not os.path.isfile(bin_name):
+        urllib.request.urlretrieve(bin_url, bin_name)
+    if not os.path.isfile(model_name):
+        urllib.request.urlretrieve(model_url, model_name)

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+huggingface-hub==0.0.17
+streamlit==0.84.1
+transformers==4.11.3
+watchdog==2.1.3
+sentence-transformers==2.0.0
+bokeh==2.2.2
+umap-learn==0.5.1
+numpy==1.20.0
+embedding-lenses==0.2.0
+git+git://github.com/kpu/kenlm/archive/master.zip