edugp committed on
Commit
e2feb58
1 Parent(s): 960abcf

Use default app in embedding-lenses 0.11.0

Files changed (2)
  1. app.py +2 -151
  2. requirements.txt +1 -9
app.py CHANGED
@@ -1,153 +1,4 @@
-import logging
-from functools import partial
-from typing import Callable, List, Optional
-
-import numpy as np
-import pandas as pd
-import streamlit as st
-import umap
-from bokeh.models import ColumnDataSource, HoverTool
-from bokeh.palettes import Cividis256 as Pallete
-from bokeh.plotting import Figure, figure
-from bokeh.transform import factor_cmap
-from datasets import load_dataset
-from sentence_transformers import SentenceTransformer
-from sklearn.manifold import TSNE
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-EMBEDDING_MODELS = ["distiluse-base-multilingual-cased-v1", "all-mpnet-base-v2", "flax-sentence-embeddings/all_datasets_v3_mpnet-base"]
-DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]
-SEED = 0
-
-
-@st.cache(show_spinner=False, allow_output_mutation=True)
-def load_model(model_name: str) -> SentenceTransformer:
-    embedder = model_name
-    return SentenceTransformer(embedder)
-
-
-def embed_text(text: List[str], model: SentenceTransformer) -> np.ndarray:
-    return model.encode(text)
-
-
-def encode_labels(labels: pd.Series) -> pd.Series:
-    if pd.api.types.is_numeric_dtype(labels):
-        return labels
-    return labels.astype("category").cat.codes
-
-
-def get_tsne_embeddings(
-    embeddings: np.ndarray, perplexity: int = 30, n_components: int = 2, init: str = "pca", n_iter: int = 5000, random_state: int = SEED
-) -> np.ndarray:
-    tsne = TSNE(perplexity=perplexity, n_components=n_components, init=init, n_iter=n_iter, random_state=random_state)
-    return tsne.fit_transform(embeddings)
-
-
-def get_umap_embeddings(embeddings: np.ndarray) -> np.ndarray:
-    umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=SEED)
-    return umap_model.fit_transform(embeddings)
-
-
-def draw_interactive_scatter_plot(
-    texts: np.ndarray, xs: np.ndarray, ys: np.ndarray, values: np.ndarray, labels: np.ndarray, text_column: str, label_column: str
-) -> Figure:
-    # Normalize values to range between 0-255, to assign a color for each value
-    max_value = values.max()
-    min_value = values.min()
-    if max_value - min_value == 0:
-        values_color = np.ones(len(values))
-    else:
-        values_color = ((values - min_value) / (max_value - min_value) * 255).round().astype(int).astype(str)
-    values_color_set = sorted(values_color)
-
-    values_list = values.astype(str).tolist()
-    values_set = sorted(values_list)
-    labels_list = labels.astype(str).tolist()
-
-    source = ColumnDataSource(data=dict(x=xs, y=ys, text=texts, label=values_list, original_label=labels_list))
-    hover = HoverTool(tooltips=[(text_column, "@text{safe}"), (label_column, "@original_label")])
-    p = figure(plot_width=800, plot_height=800, tools=[hover])
-    p.circle("x", "y", size=10, source=source, fill_color=factor_cmap("label", palette=[Pallete[int(id_)] for id_ in values_color_set], factors=values_set))
-
-    p.axis.visible = False
-    p.xgrid.grid_line_color = None
-    p.ygrid.grid_line_color = None
-    p.toolbar.logo = None
-    return p
-
-
-def uploaded_file_to_dataframe(uploaded_file: st.uploaded_file_manager.UploadedFile) -> pd.DataFrame:
-    extension = uploaded_file.name.split(".")[-1]
-    return pd.read_csv(uploaded_file, sep="\t" if extension == "tsv" else ",")
-
-
-def hub_dataset_to_dataframe(path: str, name: str, split: str, sample: int) -> pd.DataFrame:
-    load_dataset_fn = partial(load_dataset, path=path)
-    if name:
-        load_dataset_fn = partial(load_dataset_fn, name=name)
-    if split:
-        load_dataset_fn = partial(load_dataset_fn, split=split)
-    dataset = load_dataset_fn().shuffle(seed=SEED)[:sample]
-    return pd.DataFrame(dataset)
-
-
-def generate_plot(
-    df: pd.DataFrame,
-    text_column: str,
-    label_column: str,
-    sample: Optional[int],
-    dimensionality_reduction_function: Callable,
-    model: SentenceTransformer,
-) -> Figure:
-    if text_column not in df.columns:
-        raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
-    if label_column not in df.columns:
-        df[label_column] = 0
-    df = df.dropna(subset=[text_column, label_column])
-    if sample:
-        df = df.sample(min(sample, df.shape[0]), random_state=SEED)
-    with st.spinner(text="Embedding text..."):
-        embeddings = embed_text(df[text_column].values.tolist(), model)
-    logger.info("Encoding labels")
-    encoded_labels = encode_labels(df[label_column])
-    with st.spinner("Reducing dimensionality..."):
-        embeddings_2d = dimensionality_reduction_function(embeddings)
-    logger.info("Generating figure")
-    plot = draw_interactive_scatter_plot(
-        df[text_column].values, embeddings_2d[:, 0], embeddings_2d[:, 1], encoded_labels.values, df[label_column].values, text_column, label_column
-    )
-    return plot
-
-
-st.title("Embedding Lenses")
-st.write("Visualize text embeddings in 2D using colors for continuous or categorical labels.")
-uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
-st.write("Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)")
-col1, col2, col3 = st.columns(3)
-with col1:
-    hub_dataset = st.text_input("Dataset name", "ag_news")
-with col2:
-    hub_dataset_config = st.text_input("Dataset configuration", "")
-with col3:
-    hub_dataset_split = st.text_input("Dataset split", "train")
-
-text_column = st.text_input("Text column name", "text")
-label_column = st.text_input("Numerical/categorical column name (ignore if not applicable)", "label")
-sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
-dimensionality_reduction = st.selectbox("Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0)
-model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)
-with st.spinner(text="Loading model..."):
-    model = load_model(model_name)
-dimensionality_reduction_function = get_umap_embeddings if dimensionality_reduction == "UMAP" else get_tsne_embeddings
-
-if uploaded_file or hub_dataset:
-    with st.spinner("Loading dataset..."):
-        if uploaded_file:
-            df = uploaded_file_to_dataframe(uploaded_file)
-        else:
-            df = hub_dataset_to_dataframe(hub_dataset, hub_dataset_config, hub_dataset_split, sample)
-    plot = generate_plot(df, text_column, label_column, sample, dimensionality_reduction_function, model)
-    logger.info("Displaying plot")
-    st.bokeh_chart(plot)
-    logger.info("Done")
+from embedding_lenses.app import app
+
+
+app()
requirements.txt CHANGED
@@ -1,9 +1 @@
-huggingface-hub==0.0.17
-streamlit==1.8.1
-transformers==4.11.3
-watchdog==2.1.3
-sentence-transformers==2.0.0
-bokeh==2.4.1
-umap-learn==0.5.1
-numpy==1.20.0
-Jinja2==3.0.3
+embedding-lenses==0.11.0
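
After this commit, app.py is just a thin wrapper around the default Streamlit app shipped with the embedding-lenses package. A minimal sketch of the resulting file, using only the embedding_lenses.app.app entry point that appears in the diff above (everything about the package's internals is assumed rather than shown here):

# app.py after this commit: delegate the whole UI to the packaged default app.
from embedding_lenses.app import app

# Renders the Streamlit interface; the model loading, UMAP/t-SNE reduction and
# Bokeh plotting previously defined inline are expected to live inside the package now.
app()

It is launched the same way as before, e.g. streamlit run app.py, with embedding-lenses==0.11.0 pulling in its own dependencies in place of the pinned list removed from requirements.txt.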