edugp committed on
Commit
1f30dbc
1 Parent(s): 77d22a6

Initial commit for perplexity lenses

Browse files
Files changed (4) hide show
  1. app.py +141 -0
  2. data.py +28 -0
  3. perplexity.py +37 -0
  4. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from functools import partial
3
+ from typing import Callable, Optional
4
+
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from bokeh.plotting import Figure
8
+ from embedding_lenses.data import uploaded_file_to_dataframe
9
+ from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
10
+ get_umap_embeddings)
11
+ from embedding_lenses.embedding import embed_text, load_model
12
+ from embedding_lenses.utils import encode_labels
13
+ from embedding_lenses.visualization import draw_interactive_scatter_plot
14
+ from sentence_transformers import SentenceTransformer
15
+
16
+ from data import hub_dataset_to_dataframe
17
+ from perplexity import KenlmModel
18
+
19
# Emit INFO-and-above records so the progress messages logged below are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Sentence-embedding checkpoints offered in the model selector.
EMBEDDING_MODELS = [
    "distiluse-base-multilingual-cased-v1",
    "all-mpnet-base-v2",
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base",
]

# Projection algorithms offered in the dimensionality-reduction selector.
DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]

# Language codes offered in the language selector. Order is significant:
# the selector's default entry is chosen by position in this list.
LANGUAGES = (
    "af ar az be bg bn ca cs da de el en "
    "es et fa fi fr gu he hi hr hu hy id "
    "is it ja ka kk km kn ko lt lv mk ml "
    "mn mr my ne nl no pl pt ro ru uk zh"
).split()

# Single seed reused for row sampling, dataset shuffling and the 2D projection.
SEED = 0
74
+
75
+
76
def generate_plot(
    df: pd.DataFrame,
    text_column: str,
    label_column: str,
    sample: Optional[int],
    dimensionality_reduction_function: Callable,
    model: SentenceTransformer,
) -> Figure:
    """Embed the texts in ``df``, project them to 2D and draw an interactive scatter plot.

    The caller's DataFrame is never modified: every transformation below
    produces a new frame.

    Args:
        df: Table holding the documents to plot.
        text_column: Name of the column containing the text to embed.
        label_column: Name of the column used to colour the points. When the
            column is absent, every row gets the constant label ``0``.
        sample: Maximum number of rows to plot. A falsy value (``None``/``0``)
            disables sampling.
        dimensionality_reduction_function: Callable applied to the embeddings;
            its result is indexed as ``[:, 0]`` and ``[:, 1]`` for the x/y
            coordinates.
        model: Sentence-embedding model passed to ``embed_text``.

    Returns:
        The figure produced by ``draw_interactive_scatter_plot``.

    Raises:
        ValueError: If ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
    if label_column not in df.columns:
        # assign() returns a new frame, so the DataFrame passed in by the
        # caller does not silently gain an extra column.
        df = df.assign(**{label_column: 0})
    df = df.dropna(subset=[text_column, label_column])
    if sample:
        # Never request more rows than are available; seeded for reproducibility.
        df = df.sample(min(sample, df.shape[0]), random_state=SEED)
    with st.spinner(text="Embedding text..."):
        embeddings = embed_text(df[text_column].values.tolist(), model)
    logger.info("Encoding labels")
    encoded_labels = encode_labels(df[label_column])
    with st.spinner("Reducing dimensionality..."):
        embeddings_2d = dimensionality_reduction_function(embeddings)
    logger.info("Generating figure")
    plot = draw_interactive_scatter_plot(
        df[text_column].values,
        embeddings_2d[:, 0],
        embeddings_2d[:, 1],
        encoded_labels.values,
        df[label_column].values,
        text_column,
        label_column,
    )
    return plot
102
+
103
+
104
# --- Streamlit page: collect inputs, load models, build and show the plot ---
st.title("Perplexity Lenses")
st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
st.write("Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)")
col1, col2, col3 = st.columns(3)
with col1:
    hub_dataset = st.text_input("Dataset name", "mc4")
with col2:
    hub_dataset_config = st.text_input("Dataset configuration", "es")
with col3:
    hub_dataset_split = st.text_input("Dataset split", "train")

text_column = st.text_input("Text column name", "text")
# Look the default language up by value so reordering LANGUAGES cannot
# silently change which entry is preselected.
language = st.selectbox("Language", LANGUAGES, LANGUAGES.index("es"))
sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
dimensionality_reduction = st.selectbox("Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0)
model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)

with st.spinner(text="Loading embedding model..."):
    model = load_model(model_name)
dimensionality_reduction_function = (
    partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction == "UMAP" else partial(get_tsne_embeddings, random_state=SEED)
)

with st.spinner(text="Loading KenLM model..."):
    kenlm_model = KenlmModel.from_pretrained(language)

if uploaded_file or hub_dataset:
    with st.spinner("Loading dataset..."):
        if uploaded_file:
            df = uploaded_file_to_dataframe(uploaded_file)
            # A missing text column is reported by generate_plot with the list
            # of available columns, so only score rows when the column exists.
            if text_column in df.columns:
                # Rows without text cannot be scored; generate_plot would drop them anyway.
                df = df.dropna(subset=[text_column])
                # Perplexity comes from the KenLM model (not the sentence
                # embedding model), applied directly to each text value.
                df = df.assign(perplexity=df[text_column].map(kenlm_model.get_perplexity))
        else:
            df = hub_dataset_to_dataframe(hub_dataset, hub_dataset_config, hub_dataset_split, sample, text_column, kenlm_model, seed=SEED)
    plot = generate_plot(df, text_column, "perplexity", sample, dimensionality_reduction_function, model)
    logger.info("Displaying plot")
    st.bokeh_chart(plot)
    logger.info("Done")
data.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+
3
+ import pandas as pd
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
+ from perplexity import KenlmModel
8
+
9
+
10
def hub_dataset_to_dataframe(path: str, name: str, split: str, sample: int, text_column: str, model: KenlmModel, seed: int = 0) -> pd.DataFrame:
    """Stream a dataset, score each document's perplexity and collect rows in a DataFrame.

    The dataset is opened in streaming mode, shuffled with a 10000-element
    buffer, and each example gets a ``"perplexity"`` field computed by
    ``model.get_perplexity`` on its ``text_column`` value.

    Args:
        path: Dataset path/identifier forwarded to ``load_dataset``.
        name: Dataset configuration name; omitted from the call when empty.
        split: Dataset split; omitted from the call when empty.
        sample: Maximum number of examples to collect. ``None`` means no
            limit; zero or negative values yield an empty DataFrame instead
            of streaming the entire dataset.
        text_column: Key of the text field inside each example.
        model: Object exposing ``get_perplexity(text)``.
        seed: Seed for the streaming shuffle.

    Returns:
        A DataFrame with one row per collected example.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    load_kwargs = {"path": path, "streaming": True}
    if name:
        load_kwargs["name"] = name
    if split:
        load_kwargs["split"] = split

    def add_perplexity(example):
        text = example[text_column]
        return {text_column: text, "perplexity": model.get_perplexity(text)}

    dataset = load_dataset(**load_kwargs).shuffle(buffer_size=10000, seed=seed).map(add_perplexity)

    # islice needs a non-negative int (or None); UI widgets may hand over
    # other numeric types, so normalise before slicing.
    limit = None if sample is None else max(int(sample), 0)
    instances = list(tqdm(islice(dataset, limit), total=limit))
    return pd.DataFrame(instances)
perplexity.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import urllib.request
3
+
4
+ import kenlm
5
+
6
+
7
class KenlmModel:
    """Thin wrapper around a KenLM model that scores documents by perplexity."""

    def __init__(self, language):
        """Ensure the model files for ``language`` are present, then load the binary."""
        download_kenlm_model(language)
        self.model = kenlm.Model(f"{language}.arpa.bin")

    @classmethod
    def from_pretrained(cls, language: str):
        """Alternate constructor; equivalent to calling the class with ``language``."""
        return cls(language)

    def get_perplexity(self, doc: str):
        """Return the perplexity of ``doc``, scored one newline-separated line at a time.

        Line scores are summed and treated as base-10 log scores; each line
        contributes its whitespace-separated token count plus one to the
        normaliser.
        """
        # NOTE(review): the text is scored as-is; no SentencePiece tokenisation
        # is applied in this class -- confirm that this is intended.
        total_log_score = 0
        total_tokens = 0
        for sentence in doc.split("\n"):
            total_log_score = total_log_score + self.model.score(sentence)
            # The extra 1 per line presumably accounts for an end-of-sentence token -- TODO confirm.
            total_tokens = total_tokens + len(sentence.split()) + 1
        exponent = -total_log_score / total_tokens
        return 10.0 ** exponent
24
+
25
+
26
def download_kenlm_model(language: str):
    """Download the KenLM binary and SentencePiece model for ``language`` if missing.

    Files are stored in the current working directory as
    ``{language}.arpa.bin`` and ``{language}.sp.model``. A file that already
    exists is not downloaded again.

    Each download goes to a temporary ``.part`` name and is renamed only
    after it finishes, so an interrupted transfer can never leave a truncated
    file that a later run would mistake for a complete model.

    Args:
        language: Language code used to build the remote and local file names.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on network failure.
    """
    # HTTPS so the model binaries cannot be tampered with in transit.
    root_url = "https://dl.fbaipublicfiles.com/cc_net/lm"
    for file_name in (f"{language}.arpa.bin", f"{language}.sp.model"):
        if os.path.isfile(file_name):
            continue
        partial_name = f"{file_name}.part"
        try:
            urllib.request.urlretrieve(f"{root_url}/{file_name}", partial_name)
            # Atomic rename: the final name only ever refers to a complete file.
            os.replace(partial_name, file_name)
        finally:
            # After a successful rename the temp file is gone; this only
            # cleans up after a failed or interrupted download.
            if os.path.exists(partial_name):
                os.remove(partial_name)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ huggingface-hub==0.0.17
2
+ streamlit==0.84.1
3
+ transformers==4.11.3
4
+ watchdog==2.1.3
5
+ sentence-transformers==2.0.0
6
+ bokeh==2.2.2
7
+ umap-learn==0.5.1
8
+ numpy==1.20.0
9
+ embedding-lenses==0.2.0
10
+ https://github.com/kpu/kenlm/archive/master.zip