Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
•
77405f7
1
Parent(s):
d1e0fdf
Add tSNE projection
Browse files
- clarin_datasets/abusive_clauses_dataset.py +35 -0
- clarin_datasets/aspectemo_dataset.py +42 -0
- clarin_datasets/cst_wikinews_dataset.py +30 -1
- clarin_datasets/kpwr_ner_datasets.py +46 -0
- clarin_datasets/nkjp_pos_dataset.py +47 -0
- clarin_datasets/polemo_dataset.py +31 -0
- clarin_datasets/punctuation_restoration_dataset.py +41 -0
- clarin_datasets/utils.py +54 -0
- requirements.txt +8 -2
clarin_datasets/abusive_clauses_dataset.py
CHANGED
@@ -1,13 +1,18 @@
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import plotly.figure_factory as ff
|
3 |
import plotly.graph_objects as go
|
4 |
import streamlit as st
|
|
|
5 |
|
6 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
7 |
from clarin_datasets.utils import (
|
8 |
count_num_of_characters,
|
9 |
count_num_of_words,
|
10 |
)
|
|
|
11 |
|
12 |
|
13 |
class AbusiveClausesDataset(DatasetToShow):
|
@@ -53,6 +58,7 @@ class AbusiveClausesDataset(DatasetToShow):
|
|
53 |
dataframe_head = st.container()
|
54 |
word_searching = st.container()
|
55 |
dataset_statistics = st.container()
|
|
|
56 |
|
57 |
with header:
|
58 |
st.title(self.dataset_name)
|
@@ -188,3 +194,32 @@ class AbusiveClausesDataset(DatasetToShow):
|
|
188 |
xaxis_title="Number of characters",
|
189 |
)
|
190 |
st.plotly_chart(fig_num_chars, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import seaborn as sns
|
4 |
import pandas as pd
|
5 |
import plotly.figure_factory as ff
|
6 |
import plotly.graph_objects as go
|
7 |
import streamlit as st
|
8 |
+
from sklearn.manifold import TSNE
|
9 |
|
10 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
11 |
from clarin_datasets.utils import (
|
12 |
count_num_of_characters,
|
13 |
count_num_of_words,
|
14 |
)
|
15 |
+
from clarin_datasets.utils import embed_sentence
|
16 |
|
17 |
|
18 |
class AbusiveClausesDataset(DatasetToShow):
|
|
|
58 |
dataframe_head = st.container()
|
59 |
word_searching = st.container()
|
60 |
dataset_statistics = st.container()
|
61 |
+
tsne_projection = st.container()
|
62 |
|
63 |
with header:
|
64 |
st.title(self.dataset_name)
|
|
|
194 |
xaxis_title="Number of characters",
|
195 |
)
|
196 |
st.plotly_chart(fig_num_chars, use_container_width=True)
|
197 |
+
|
198 |
+
with tsne_projection:
|
199 |
+
st.header("t-SNE projection of the dataset")
|
200 |
+
subset_to_project = st.selectbox(
|
201 |
+
label="Select subset to project", options=self.subsets
|
202 |
+
)
|
203 |
+
sentences = self.data_dict[subset_to_project]["text"].values
|
204 |
+
reducer = TSNE(
|
205 |
+
n_components=2
|
206 |
+
)
|
207 |
+
embedded_sentences = np.array(
|
208 |
+
[embed_sentence(text) for text in sentences]
|
209 |
+
)
|
210 |
+
transformed_embeddings = reducer.fit_transform(embedded_sentences)
|
211 |
+
fig, ax = plt.subplots()
|
212 |
+
ax.scatter(
|
213 |
+
x=transformed_embeddings[:, 0],
|
214 |
+
y=transformed_embeddings[:, 1],
|
215 |
+
c=[
|
216 |
+
sns.color_palette()[x]
|
217 |
+
for x in self.data_dict[subset_to_project]["target"].map(
|
218 |
+
{
|
219 |
+
"BEZPIECZNE_POSTANOWIENIE_UMOWNE": 0,
|
220 |
+
"KLAUZULA_ABUZYWNA": 1
|
221 |
+
}
|
222 |
+
).values
|
223 |
+
],
|
224 |
+
)
|
225 |
+
st.pyplot(fig)
|
clarin_datasets/aspectemo_dataset.py
CHANGED
@@ -1,8 +1,13 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
|
|
2 |
from datasets import load_dataset
|
|
|
3 |
import streamlit as st
|
4 |
|
5 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
|
|
6 |
|
7 |
|
8 |
class AspectEmoDataset(DatasetToShow):
|
@@ -54,6 +59,7 @@ class AspectEmoDataset(DatasetToShow):
|
|
54 |
dataframe_head = st.container()
|
55 |
class_distribution = st.container()
|
56 |
most_common_tokens = st.container()
|
|
|
57 |
|
58 |
with header:
|
59 |
st.title(self.dataset_name)
|
@@ -131,3 +137,39 @@ class AspectEmoDataset(DatasetToShow):
|
|
131 |
)
|
132 |
st.dataframe(df_to_show)
|
133 |
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
import pandas as pd
|
4 |
+
import seaborn as sns
|
5 |
from datasets import load_dataset
|
6 |
+
from sklearn.manifold import TSNE
|
7 |
import streamlit as st
|
8 |
|
9 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
10 |
+
from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
|
11 |
|
12 |
|
13 |
class AspectEmoDataset(DatasetToShow):
|
|
|
59 |
dataframe_head = st.container()
|
60 |
class_distribution = st.container()
|
61 |
most_common_tokens = st.container()
|
62 |
+
tsne_projection = st.container()
|
63 |
|
64 |
with header:
|
65 |
st.title(self.dataset_name)
|
|
|
137 |
)
|
138 |
st.dataframe(df_to_show)
|
139 |
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
|
140 |
+
|
141 |
+
with tsne_projection:
|
142 |
+
st.header("t-SNE projection of the dataset")
|
143 |
+
subset_to_project = st.selectbox(
|
144 |
+
label="Select subset to project", options=self.subsets
|
145 |
+
)
|
146 |
+
tokens_unzipped = self.data_dict[subset_to_project]["tokens"].tolist()
|
147 |
+
tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
|
148 |
+
labels_unzipped = self.data_dict[subset_to_project]["labels"].tolist()
|
149 |
+
labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
|
150 |
+
df_unzipped = pd.DataFrame(
|
151 |
+
{
|
152 |
+
"tokens": tokens_unzipped,
|
153 |
+
"labels": labels_unzipped,
|
154 |
+
}
|
155 |
+
)
|
156 |
+
df_unzipped = df_unzipped.loc[df_unzipped["labels"] != 0]
|
157 |
+
tokens_unzipped = df_unzipped["tokens"].values
|
158 |
+
labels_unzipped = df_unzipped["labels"].values
|
159 |
+
embedded_tokens = np.array(
|
160 |
+
[embed_sentence(x) for x in tokens_unzipped]
|
161 |
+
)
|
162 |
+
reducer = TSNE(
|
163 |
+
n_components=2
|
164 |
+
)
|
165 |
+
transformed_embeddings = reducer.fit_transform(embedded_tokens)
|
166 |
+
fig, ax = plt.subplots()
|
167 |
+
ax.scatter(
|
168 |
+
x=transformed_embeddings[:, 0],
|
169 |
+
y=transformed_embeddings[:, 1],
|
170 |
+
c=[
|
171 |
+
PLOT_COLOR_PALETTE[x]
|
172 |
+
for x in labels_unzipped
|
173 |
+
],
|
174 |
+
)
|
175 |
+
st.pyplot(fig)
|
clarin_datasets/cst_wikinews_dataset.py
CHANGED
@@ -1,8 +1,12 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
from datasets import load_dataset
|
|
|
3 |
import streamlit as st
|
4 |
|
5 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
|
|
6 |
|
7 |
|
8 |
class CSTWikinewsDataset(DatasetToShow):
|
@@ -23,7 +27,7 @@ class CSTWikinewsDataset(DatasetToShow):
|
|
23 |
header = st.container()
|
24 |
dataframe_head = st.container()
|
25 |
class_distribution = st.container()
|
26 |
-
|
27 |
with header:
|
28 |
st.title(self.dataset_name)
|
29 |
|
@@ -54,3 +58,28 @@ class CSTWikinewsDataset(DatasetToShow):
|
|
54 |
|
55 |
with class_distribution:
|
56 |
st.dataframe(class_distribution_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
import pandas as pd
|
4 |
from datasets import load_dataset
|
5 |
+
from sklearn.manifold import TSNE
|
6 |
import streamlit as st
|
7 |
|
8 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
9 |
+
from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
|
10 |
|
11 |
|
12 |
class CSTWikinewsDataset(DatasetToShow):
|
|
|
27 |
header = st.container()
|
28 |
dataframe_head = st.container()
|
29 |
class_distribution = st.container()
|
30 |
+
tsne_projection = st.container()
|
31 |
with header:
|
32 |
st.title(self.dataset_name)
|
33 |
|
|
|
58 |
|
59 |
with class_distribution:
|
60 |
st.dataframe(class_distribution_df)
|
61 |
+
|
62 |
+
with tsne_projection:
|
63 |
+
st.header("t-SNE projection of the dataset")
|
64 |
+
subset_to_project = st.selectbox(
|
65 |
+
label="Select subset to project", options=self.subsets
|
66 |
+
)
|
67 |
+
first_sentences = self.data_dict[subset_to_project]["sentence_1"].values
|
68 |
+
second_sentences = self.data_dict[subset_to_project]["sentence_2"].values
|
69 |
+
labels = self.data_dict[subset_to_project]["label"].values
|
70 |
+
first_sentences_embedded = np.array([embed_sentence(x) for x in first_sentences])
|
71 |
+
second_sentences_embedded = np.array([embed_sentence(x) for x in second_sentences])
|
72 |
+
mean_embeddings = (first_sentences_embedded + second_sentences_embedded) / 2
|
73 |
+
reducer = TSNE(
|
74 |
+
n_components=2
|
75 |
+
)
|
76 |
+
transformed_embeddings = reducer.fit_transform(mean_embeddings)
|
77 |
+
fig, ax = plt.subplots()
|
78 |
+
ax.scatter(
|
79 |
+
x=transformed_embeddings[:, 0],
|
80 |
+
y=transformed_embeddings[:, 1],
|
81 |
+
c=[
|
82 |
+
PLOT_COLOR_PALETTE[i] for i in labels
|
83 |
+
]
|
84 |
+
)
|
85 |
+
st.pyplot(fig)
|
clarin_datasets/kpwr_ner_datasets.py
CHANGED
@@ -1,8 +1,12 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
from datasets import load_dataset
|
|
|
3 |
import streamlit as st
|
4 |
|
5 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
|
|
6 |
|
7 |
|
8 |
class KpwrNerDataset(DatasetToShow):
|
@@ -66,6 +70,7 @@ class KpwrNerDataset(DatasetToShow):
|
|
66 |
dataframe_head = st.container()
|
67 |
class_distribution = st.container()
|
68 |
most_common_tokens = st.container()
|
|
|
69 |
|
70 |
with header:
|
71 |
st.title(self.dataset_name)
|
@@ -153,3 +158,44 @@ class KpwrNerDataset(DatasetToShow):
|
|
153 |
)
|
154 |
st.dataframe(df_to_show)
|
155 |
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
import pandas as pd
|
4 |
from datasets import load_dataset
|
5 |
+
from sklearn.manifold import TSNE
|
6 |
import streamlit as st
|
7 |
|
8 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
9 |
+
from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
|
10 |
|
11 |
|
12 |
class KpwrNerDataset(DatasetToShow):
|
|
|
70 |
dataframe_head = st.container()
|
71 |
class_distribution = st.container()
|
72 |
most_common_tokens = st.container()
|
73 |
+
tsne_projection = st.container()
|
74 |
|
75 |
with header:
|
76 |
st.title(self.dataset_name)
|
|
|
158 |
)
|
159 |
st.dataframe(df_to_show)
|
160 |
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
|
161 |
+
SHOW_TSNE_PROJECTION = False
|
162 |
+
if SHOW_TSNE_PROJECTION:
|
163 |
+
with tsne_projection:
|
164 |
+
st.header("t-SNE projection of the dataset")
|
165 |
+
subset_to_project = st.selectbox(
|
166 |
+
label="Select subset to project", options=self.subsets
|
167 |
+
)
|
168 |
+
tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
|
169 |
+
tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
|
170 |
+
labels_unzipped = self.data_dict_named[subset_to_project]["ner"].tolist()
|
171 |
+
labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
|
172 |
+
df_unzipped = pd.DataFrame(
|
173 |
+
{
|
174 |
+
"tokens": tokens_unzipped,
|
175 |
+
"ner": labels_unzipped,
|
176 |
+
}
|
177 |
+
)
|
178 |
+
df_unzipped = df_unzipped.loc[
|
179 |
+
(df_unzipped["ner"] != "O")
|
180 |
+
& ~(df_unzipped["ner"].str.startswith("I-"))
|
181 |
+
]
|
182 |
+
tokens_unzipped = df_unzipped["tokens"].values
|
183 |
+
labels_unzipped = df_unzipped["ner"].values
|
184 |
+
mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
|
185 |
+
labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
|
186 |
+
embedded_tokens = np.array(
|
187 |
+
[embed_sentence(x) for x in tokens_unzipped]
|
188 |
+
)
|
189 |
+
reducer = TSNE(
|
190 |
+
n_components=2
|
191 |
+
)
|
192 |
+
transformed_embeddings = reducer.fit_transform(embedded_tokens)
|
193 |
+
fig, ax = plt.subplots()
|
194 |
+
ax.scatter(
|
195 |
+
x=transformed_embeddings[:, 0],
|
196 |
+
y=transformed_embeddings[:, 1],
|
197 |
+
c=[
|
198 |
+
PLOT_COLOR_PALETTE[i] for i in labels_as_ints
|
199 |
+
]
|
200 |
+
)
|
201 |
+
st.pyplot(fig)
|
clarin_datasets/nkjp_pos_dataset.py
CHANGED
@@ -1,8 +1,16 @@
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
from datasets import load_dataset
|
|
|
3 |
import streamlit as st
|
4 |
|
5 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
|
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
class NkjpPosDataset(DatasetToShow):
|
@@ -65,6 +73,7 @@ class NkjpPosDataset(DatasetToShow):
|
|
65 |
description = st.container()
|
66 |
dataframe_head = st.container()
|
67 |
class_distribution = st.container()
|
|
|
68 |
|
69 |
with header:
|
70 |
st.title(self.dataset_name)
|
@@ -112,3 +121,41 @@ class NkjpPosDataset(DatasetToShow):
|
|
112 |
st.text_area(
|
113 |
label="LaTeX code", value=class_distribution_df.style.to_latex()
|
114 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import seaborn as sns
|
4 |
import pandas as pd
|
5 |
from datasets import load_dataset
|
6 |
+
from sklearn.manifold import TSNE
|
7 |
import streamlit as st
|
8 |
|
9 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
10 |
+
from clarin_datasets.utils import (
|
11 |
+
PLOT_COLOR_PALETTE,
|
12 |
+
embed_sentence
|
13 |
+
)
|
14 |
|
15 |
|
16 |
class NkjpPosDataset(DatasetToShow):
|
|
|
73 |
description = st.container()
|
74 |
dataframe_head = st.container()
|
75 |
class_distribution = st.container()
|
76 |
+
tsne_projection = st.container()
|
77 |
|
78 |
with header:
|
79 |
st.title(self.dataset_name)
|
|
|
121 |
st.text_area(
|
122 |
label="LaTeX code", value=class_distribution_df.style.to_latex()
|
123 |
)
|
124 |
+
SHOW_TSNE_PROJECTION = False
|
125 |
+
if SHOW_TSNE_PROJECTION:
|
126 |
+
with tsne_projection:
|
127 |
+
st.header("t-SNE projection of the dataset")
|
128 |
+
subset_to_project = st.selectbox(
|
129 |
+
label="Select subset to project", options=self.subsets
|
130 |
+
)
|
131 |
+
tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
|
132 |
+
tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
|
133 |
+
labels_unzipped = self.data_dict_named[subset_to_project]["tags"].tolist()
|
134 |
+
labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
|
135 |
+
df_unzipped = pd.DataFrame(
|
136 |
+
{
|
137 |
+
"tokens": tokens_unzipped,
|
138 |
+
"tags": labels_unzipped,
|
139 |
+
}
|
140 |
+
)
|
141 |
+
tokens_unzipped = df_unzipped["tokens"].values
|
142 |
+
labels_unzipped = df_unzipped["tags"].values
|
143 |
+
mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
|
144 |
+
labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
|
145 |
+
embedded_tokens = np.array(
|
146 |
+
[embed_sentence(x) for x in tokens_unzipped]
|
147 |
+
)
|
148 |
+
reducer = TSNE(
|
149 |
+
n_components=2
|
150 |
+
)
|
151 |
+
transformed_embeddings = reducer.fit_transform(embedded_tokens)
|
152 |
+
fig, ax = plt.subplots()
|
153 |
+
ax.scatter(
|
154 |
+
x=transformed_embeddings[:, 0],
|
155 |
+
y=transformed_embeddings[:, 1],
|
156 |
+
c=[
|
157 |
+
PLOT_COLOR_PALETTE[i]
|
158 |
+
for i in labels_as_ints
|
159 |
+
],
|
160 |
+
)
|
161 |
+
st.pyplot(fig)
|
clarin_datasets/polemo_dataset.py
CHANGED
@@ -1,13 +1,19 @@
|
|
|
|
|
|
|
|
1 |
from datasets import load_dataset
|
2 |
import pandas as pd
|
3 |
import plotly.figure_factory as ff
|
4 |
import plotly.graph_objects as go
|
|
|
5 |
import streamlit as st
|
6 |
|
7 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
8 |
from clarin_datasets.utils import (
|
9 |
count_num_of_characters,
|
10 |
count_num_of_words,
|
|
|
|
|
11 |
)
|
12 |
|
13 |
|
@@ -38,6 +44,7 @@ class PolemoDataset(DatasetToShow):
|
|
38 |
dataframe_head = st.container()
|
39 |
word_searching = st.container()
|
40 |
dataset_statistics = st.container()
|
|
|
41 |
|
42 |
with header:
|
43 |
st.title(self.dataset_name)
|
@@ -201,3 +208,27 @@ class PolemoDataset(DatasetToShow):
|
|
201 |
xaxis_title="Number of characters",
|
202 |
)
|
203 |
st.plotly_chart(fig_num_chars, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import seaborn as sns
|
4 |
from datasets import load_dataset
|
5 |
import pandas as pd
|
6 |
import plotly.figure_factory as ff
|
7 |
import plotly.graph_objects as go
|
8 |
+
from sklearn.manifold import TSNE
|
9 |
import streamlit as st
|
10 |
|
11 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
12 |
from clarin_datasets.utils import (
|
13 |
count_num_of_characters,
|
14 |
count_num_of_words,
|
15 |
+
embed_sentence,
|
16 |
+
PLOT_COLOR_PALETTE
|
17 |
)
|
18 |
|
19 |
|
|
|
44 |
dataframe_head = st.container()
|
45 |
word_searching = st.container()
|
46 |
dataset_statistics = st.container()
|
47 |
+
tsne_projection = st.container()
|
48 |
|
49 |
with header:
|
50 |
st.title(self.dataset_name)
|
|
|
208 |
xaxis_title="Number of characters",
|
209 |
)
|
210 |
st.plotly_chart(fig_num_chars, use_container_width=True)
|
211 |
+
|
212 |
+
with tsne_projection:
|
213 |
+
st.header("t-SNE projection of the dataset")
|
214 |
+
subset_to_project = st.selectbox(
|
215 |
+
label="Select subset to project", options=self.subsets
|
216 |
+
)
|
217 |
+
sentences = self.data_dict[subset_to_project]["text"].values
|
218 |
+
reducer = TSNE(
|
219 |
+
n_components=2
|
220 |
+
)
|
221 |
+
embedded_sentences = np.array(
|
222 |
+
[embed_sentence(text) for text in sentences]
|
223 |
+
)
|
224 |
+
transformed_embeddings = reducer.fit_transform(embedded_sentences)
|
225 |
+
fig, ax = plt.subplots()
|
226 |
+
ax.scatter(
|
227 |
+
x=transformed_embeddings[:, 0],
|
228 |
+
y=transformed_embeddings[:, 1],
|
229 |
+
c=[
|
230 |
+
PLOT_COLOR_PALETTE[x]
|
231 |
+
for x in self.data_dict[subset_to_project]["target"].values
|
232 |
+
],
|
233 |
+
)
|
234 |
+
st.pyplot(fig)
|
clarin_datasets/punctuation_restoration_dataset.py
CHANGED
@@ -1,8 +1,12 @@
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
from datasets import load_dataset
|
|
|
3 |
import streamlit as st
|
4 |
|
5 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
|
|
6 |
|
7 |
|
8 |
class PunctuationRestorationDataset(DatasetToShow):
|
@@ -68,6 +72,7 @@ class PunctuationRestorationDataset(DatasetToShow):
|
|
68 |
description = st.container()
|
69 |
dataframe_head = st.container()
|
70 |
class_distribution = st.container()
|
|
|
71 |
|
72 |
with header:
|
73 |
st.title(self.dataset_name)
|
@@ -116,3 +121,39 @@ class PunctuationRestorationDataset(DatasetToShow):
|
|
116 |
st.text_area(
|
117 |
label="LaTeX code", value=class_distribution_df.style.to_latex()
|
118 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
import pandas as pd
|
4 |
from datasets import load_dataset
|
5 |
+
from sklearn.manifold import TSNE
|
6 |
import streamlit as st
|
7 |
|
8 |
from clarin_datasets.dataset_to_show import DatasetToShow
|
9 |
+
from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
|
10 |
|
11 |
|
12 |
class PunctuationRestorationDataset(DatasetToShow):
|
|
|
72 |
description = st.container()
|
73 |
dataframe_head = st.container()
|
74 |
class_distribution = st.container()
|
75 |
+
tsne_projection = st.container()
|
76 |
|
77 |
with header:
|
78 |
st.title(self.dataset_name)
|
|
|
121 |
st.text_area(
|
122 |
label="LaTeX code", value=class_distribution_df.style.to_latex()
|
123 |
)
|
124 |
+
with tsne_projection:
|
125 |
+
st.header("t-SNE projection of the dataset")
|
126 |
+
subset_to_project = st.selectbox(
|
127 |
+
label="Select subset to project", options=self.subsets
|
128 |
+
)
|
129 |
+
tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
|
130 |
+
tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
|
131 |
+
labels_unzipped = self.data_dict_named[subset_to_project]["tags"].tolist()
|
132 |
+
labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
|
133 |
+
df_unzipped = pd.DataFrame(
|
134 |
+
{
|
135 |
+
"tokens": tokens_unzipped,
|
136 |
+
"tags": labels_unzipped,
|
137 |
+
}
|
138 |
+
)
|
139 |
+
df_unzipped = df_unzipped.loc[df_unzipped["tags"] != "O"]
|
140 |
+
tokens_unzipped = df_unzipped["tokens"].values
|
141 |
+
labels_unzipped = df_unzipped["tags"].values
|
142 |
+
mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
|
143 |
+
labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
|
144 |
+
embedded_tokens = np.array(
|
145 |
+
[embed_sentence(x) for x in tokens_unzipped]
|
146 |
+
)
|
147 |
+
reducer = TSNE(
|
148 |
+
n_components=2
|
149 |
+
)
|
150 |
+
transformed_embeddings = reducer.fit_transform(embedded_tokens)
|
151 |
+
fig, ax = plt.subplots()
|
152 |
+
ax.scatter(
|
153 |
+
x=transformed_embeddings[:, 0],
|
154 |
+
y=transformed_embeddings[:, 1],
|
155 |
+
c=[
|
156 |
+
PLOT_COLOR_PALETTE[i] for i in labels_as_ints
|
157 |
+
]
|
158 |
+
)
|
159 |
+
st.pyplot(fig)
|
clarin_datasets/utils.py
CHANGED
@@ -1,8 +1,56 @@
|
|
1 |
import re
|
2 |
from typing import List
|
3 |
|
|
|
|
|
|
|
4 |
from unidecode import unidecode
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
def flatten_list(main_list: List[List]) -> List:
    """Concatenate the sub-lists of ``main_list`` into a single flat list.

    Exactly one level of nesting is removed and the original element order
    is kept.
    """
    flattened: List = []
    for inner in main_list:
        flattened.extend(inner)
    return flattened
|
@@ -14,3 +62,9 @@ def count_num_of_characters(text: str) -> int:
|
|
14 |
|
15 |
def count_num_of_words(text: str) -> int:
    """Count the words in ``text``.

    The text is transliterated with ``unidecode``, every character that is
    not an ASCII letter or a space is removed, and the remaining
    space-separated tokens are counted.

    Args:
        text: Text to analyse.

    Returns:
        Number of non-empty tokens; ``0`` when nothing but removed characters
        or spaces is left.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", "", unidecode(text))
    # split() without an argument collapses runs of spaces and returns [] for
    # an empty string. split(" ") produced empty fragments in those cases
    # (e.g. after a standalone number or dash was stripped) and they were
    # counted as words.
    return len(letters_only.split())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
from typing import List
|
3 |
|
4 |
+
from embeddings.embedding.auto_flair import AutoFlairDocumentEmbedding
|
5 |
+
from flair.data import Sentence
|
6 |
+
from numpy import typing as nt
|
7 |
from unidecode import unidecode
|
8 |
|
9 |
+
# Document-embedding model used by embed_sentence(); it is created once, when
# this module is executed.
# NOTE(review): from_hub presumably downloads/loads "clarin-pl/word2vec-kgr10",
# which would make importing this module slow and network-dependent - confirm.
embedding = AutoFlairDocumentEmbedding.from_hub("clarin-pl/word2vec-kgr10")
|
10 |
+
|
11 |
+
# Colours for the t-SNE scatter plots, indexed by integer label id
# (``PLOT_COLOR_PALETTE[label]``). Every entry must be unique, otherwise two
# different labels are drawn in the same colour and cannot be told apart.
# The list has 40 entries, so label ids must stay below 40.
PLOT_COLOR_PALETTE = [
    "#FAEBD7",
    "#00FFFF",
    "#7FFFD4",
    "#000000",
    "#0000FF",
    "#8A2BE2",
    "#A52A2A",
    "#DEB887",
    "#5F9EA0",
    "#7FFF00",
    "#D2691E",
    "#FF7F50",
    "#6495ED",
    "#FFF8DC",
    "#DC143C",
    # Was a second "#00FFFF" (identical to index 1); replaced with a colour
    # that is not used anywhere else in the palette.
    "#FF1493",
    "#00008B",
    "#008B8B",
    "#B8860B",
    "#A9A9A9",
    "#006400",
    "#BDB76B",
    "#8B008B",
    "#556B2F",
    "#FF8C00",
    "#9932CC",
    "#8B0000",
    "#E9967A",
    "#8FBC8F",
    "#2F4F4F",
    "#00CED1",
    "#FFD700",
    "#DAA520",
    "#808080",
    "#FF69B4",
    "#4B0082",
    "#CD5C5C",
    "#7CFC00",
    "#F08080",
    "#66CDAA",
]
|
53 |
+
|
54 |
|
55 |
def flatten_list(main_list: List[List]) -> List:
    """Concatenate the sub-lists of ``main_list`` into a single flat list.

    Exactly one level of nesting is removed and the original element order
    is kept.
    """
    flattened: List = []
    for inner in main_list:
        flattened.extend(inner)
    return flattened
|
|
|
62 |
|
63 |
def count_num_of_words(text: str) -> int:
    """Count the words in ``text``.

    The text is transliterated with ``unidecode``, every character that is
    not an ASCII letter or a space is removed, and the remaining
    space-separated tokens are counted.

    Args:
        text: Text to analyse.

    Returns:
        Number of non-empty tokens; ``0`` when nothing but removed characters
        or spaces is left.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", "", unidecode(text))
    # split() without an argument collapses runs of spaces and returns [] for
    # an empty string. split(" ") produced empty fragments in those cases
    # (e.g. after a standalone number or dash was stripped) and they were
    # counted as words.
    return len(letters_only.split())
|
65 |
+
|
66 |
+
|
67 |
+
def embed_sentence(sentence: str) -> nt.NDArray:
    """Embed ``sentence`` with the module-level ``embedding`` model.

    Args:
        sentence: Raw text to embed.

    Returns:
        The embedding attached to the sentence, converted to a NumPy array.
    """
    # Keep the raw text and the flair object under separate names instead of
    # rebinding the ``str`` parameter to a ``Sentence``.
    flair_sentence = Sentence(sentence)
    # embed() works in place: the result is read back from the Sentence below.
    embedding.embed([flair_sentence])
    # The embedding is exposed as a torch tensor; Tensor.numpy() raises when
    # the tensor lives on a GPU or tracks gradients, so detach it and move it
    # to host memory before converting.
    return flair_sentence.embedding.detach().cpu().numpy()
|
requirements.txt
CHANGED
@@ -4,6 +4,12 @@ pyperclip==1.8.2
|
|
4 |
streamlit==1.11.0
|
5 |
Unidecode==1.3.4
|
6 |
scipy
|
7 |
-
datasets
|
8 |
umap-learn
|
9 |
-
clarinpl-embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
streamlit==1.11.0
|
5 |
Unidecode==1.3.4
|
6 |
scipy
|
7 |
+
datasets~=2.4.0
|
8 |
umap-learn
|
9 |
+
clarinpl-embeddings
|
10 |
+
pynndescent
|
11 |
+
flair~=0.11.3
|
12 |
+
numpy~=1.23.3
|
13 |
+
matplotlib~=3.5.3
|
14 |
+
seaborn~=0.12.0
|
15 |
+
scikit-learn~=1.1.2
|