Add application
- .gitignore +2 -0
- app.py +177 -0
- requirements.txt +5 -0
- utils.py +7 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__
.vscode
app.py
ADDED
@@ -0,0 +1,177 @@
import streamlit as st
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import torch
from spectral_metric.estimator import CumulativeGradientEstimator
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from spectral_metric.visualize import make_graph
from scipy.stats import entropy
import pandas as pd

from utils import show_most_confused


AVAILABLE_DATASETS = [
    ("clinc_oos", "small"),
    ("clinc_oos", "imbalanced"),
    ("banking77",),
    ("tweet_eval", "emoji"),
    ("tweet_eval", "stance_climate"),
]

label_column_mapping = {
    "clinc_oos": "intent",
    "banking77": "label",
    "tweet_eval": "label",
}

st.title("Perform a data-driven analysis using `spectral-metric`")
st.markdown(
    """Today, I would like to analyze this dataset and perform a
data-driven analysis, using `sentence-transformers` to extract features
and `spectral_metric` to perform a spectral analysis of the dataset.

For support, please submit an issue on [our repo](https://github.com/Dref360/spectral-metric) or [contact me directly](https://github.com/Dref360).
"""
)

st.markdown(
    """
Let's load your dataset; we will run our analysis on the train set.
"""
)

dataset_name = st.selectbox("Select your dataset", AVAILABLE_DATASETS)
if st.button("Start the analysis"):
    label_column = label_column_mapping[dataset_name[0]]

    # We perform the analysis on the train set.
    ds = load_dataset(*dataset_name)["train"]
    class_names = ds.features[label_column].names
    ds  # Streamlit magic renders this bare expression as a preview of the dataset.

    # I use all-MiniLM-L12-v2 as it is a good compromise between speed and performance.
    embedder = SentenceTransformer("all-MiniLM-L12-v2")
    # We will get **normalized** features for the dataset using our embedder.
    with st.spinner(text="Computing embeddings..."):
        features = embedder.encode(
            ds["text"],
            device="cuda" if torch.cuda.is_available() else "cpu",
            normalize_embeddings=True,
        )

    st.markdown(
        """
### Running the spectral analysis

Now that we have our embeddings extracted by our sentence embedder, we can make an in-depth analysis of these features.

To do so, we will use CSG (Branchaud-Charron et al., 2019), a technique that combines Probability Product Kernels (Jebara et al., 2004) and spectral clustering to analyze a dataset without training a model.

In this notebook, we won't use the actual CSG metrics, but we will use the $W$ matrix.
This matrix is computed as follows:
* Run a probabilistic k-NN on the dataset (optionally done via Monte Carlo sampling).
* Compute the average prediction per class (this yields the $S$ matrix).
* Symmetrize this matrix using the Bray-Curtis distance, a metric designed to compare samplings from a distribution.

These steps are all done by `spectral_metric.estimator.CumulativeGradientEstimator`.
"""
    )
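    # Rough sketch of what the estimator below computes (for intuition only, not the library's exact code):
    #   S[i, j] ~ average probabilistic k-NN prediction of class j for the samples of class i
    #   W[i, j] ~ 1 - Bray-Curtis distance between rows S[i] and S[j]  (symmetric by construction)
    # With scipy, something along these lines:
    #   from scipy.spatial.distance import braycurtis
    #   W = np.array([[1 - braycurtis(S[i], S[j]) for j in range(len(S))] for i in range(len(S))])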
    X, y = features, np.array(ds[label_column])  # Your dataset with shape [N, ?], [N]
    estimator = CumulativeGradientEstimator(M_sample=250, k_nearest=9, distance="cosine")
    estimator.fit(data=X, target=y)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.heatmap(estimator.W, ax=ax, cmap="rocket_r")
    ax.set_title(f"Similarity between classes in {dataset_name[0]}")
    st.pyplot(fig)

    st.markdown(
        """
This figure will be hard to read on most datasets, so we need to go deeper.
Let's do the following analysis:
1. Find the class with the highest entropy, i.e. the class that is most confused with the others.
2. Find the 5 pairs of classes that are the most confused.
3. Find the items in these pairs that contribute most to the confusion.
"""
    )
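    # Step 1: normalize each row of W into a probability distribution over classes and take its
    # Shannon entropy. A high entropy means the class's similarity mass is spread over many other
    # classes (easily confused); a low entropy means the class is well separated.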
    entropy_per_class = entropy(estimator.W / estimator.W.sum(-1)[:, None], axis=-1)
    st.markdown(
        f"Most confused class (highest entropy): {class_names[np.argmax(entropy_per_class)]}",
    )
    st.markdown(
        f"Least confused class (lowest entropy): {class_names[np.argmin(entropy_per_class)]}",
    )

    # Sort all (i, j) index pairs by similarity, most similar first, and drop the diagonal.
    pairs = list(zip(*np.unravel_index(np.argsort(estimator.W, axis=None), estimator.W.shape)))[::-1]
    pairs = [(i, j) for i, j in pairs if i != j]

    # W is symmetric, so each pair appears twice; keep every other entry to deduplicate.
    lst = []
    for idx, (i, j) in enumerate(pairs[::2][:10]):
        lst.append({"Intent A": class_names[i], "Intent B": class_names[j], "Similarity": estimator.W[i, j]})

    st.title("Most similar pairs")
    st.dataframe(pd.DataFrame(lst).sort_values("Similarity", ascending=False))

    st.markdown("""
## Analysis
By looking at the top-10 most similar pairs, we get some good insights into the dataset.
While this does not guarantee that a classifier trained downstream will have issues with these pairs,
we know that these intents are similar.
Consequently, the classifier might not be able to separate them easily.

Let's now look at which utterances contribute the most to the confusion.
""")

    first_pair = pairs[0]
    second_pair = pairs[2]
    st.dataframe(pd.DataFrame({**show_most_confused(ds, first_pair[0], first_pair[1], estimator, class_names),
                               **show_most_confused(ds, first_pair[1], first_pair[0], estimator, class_names)}),
                 width=1000)

    st.markdown("### We can do the same for the second pair")

    st.dataframe(pd.DataFrame({**show_most_confused(ds, second_pair[0], second_pair[1], estimator, class_names),
                               **show_most_confused(ds, second_pair[1], second_pair[0], estimator, class_names)}),
                 width=1000)

    st.markdown(f"""
From the top-5 most confused examples per pair, we can see that the sentences are quite similar.
While a human could easily separate the two intents, the sentences are made of the same words, which might confuse the classifier.

Some sentences could be seen as mislabelled.
Of course, these features come from a model that was not trained to separate these classes;
they come from a general-purpose language model.
The goal of this analysis is to give the data scientist insights before they train an expensive model.
If we were to train a model on this dataset, it could probably handle the confusion between `{class_names[first_pair[0]]}`
and `{class_names[first_pair[1]]}`, but maybe not easily.


## Conclusion

In this tutorial, we covered how to conduct a data-driven analysis on a text classification dataset.
Using sentence embeddings and the `spectral_metric` library, we found the intents that are the most likely to be confused and the utterances that cause this confusion.

Following our analysis, we could take the following actions:
1. Upweight the confused classes during training so that the model better learns to separate them.
2. Merge similar classes together.
3. Analyze the confusing sentences to find mislabelled ones.

If you have any questions, suggestions or ideas for this library, please reach out:

1. frederic.branchaud.charron@gmail.com
2. [@Dref360 on Github](https://github.com/Dref360)


If you have a dataset that you think would be a good fit for this analysis, let me know too!
""")
requirements.txt
ADDED
@@ -0,0 +1,5 @@
streamlit==0.84.0
spectral-metric==0.5.0
datasets==1.18.2
sentence-transformers==2.1.0
seaborn  # imported by app.py for the heatmap
pandas
utils.py
ADDED
@@ -0,0 +1,7 @@
import numpy as np


def show_most_confused(ds, source_intent, target_intent, estimator, class_names):
    """Return the top-10 utterances of `source_intent` that the estimator assigns most strongly to `target_intent`."""
    pair_name = f"{class_names[source_intent]} <> {class_names[target_intent]}"
    # Sort the source class's samples by their normalized probability of the target class, descending.
    closest_to_second = np.argsort([sample.sample_probability_norm[target_intent]
                                    for sample in estimator.similarity_arrays[source_intent].values()])[::-1][:10]
    # Map back to indices into the full dataset.
    dataset_indices = estimator.class_indices[source_intent][closest_to_second]
    return {pair_name: [ds[int(di)]["text"] for di in dataset_indices]}
|