Mariusz Kossakowski committed on
Commit
77405f7
1 Parent(s): d1e0fdf

Add tSNE projection

Browse files
clarin_datasets/abusive_clauses_dataset.py CHANGED
@@ -1,13 +1,18 @@
 
 
 
1
  import pandas as pd
2
  import plotly.figure_factory as ff
3
  import plotly.graph_objects as go
4
  import streamlit as st
 
5
 
6
  from clarin_datasets.dataset_to_show import DatasetToShow
7
  from clarin_datasets.utils import (
8
  count_num_of_characters,
9
  count_num_of_words,
10
  )
 
11
 
12
 
13
  class AbusiveClausesDataset(DatasetToShow):
@@ -53,6 +58,7 @@ class AbusiveClausesDataset(DatasetToShow):
53
  dataframe_head = st.container()
54
  word_searching = st.container()
55
  dataset_statistics = st.container()
 
56
 
57
  with header:
58
  st.title(self.dataset_name)
@@ -188,3 +194,32 @@ class AbusiveClausesDataset(DatasetToShow):
188
  xaxis_title="Number of characters",
189
  )
190
  st.plotly_chart(fig_num_chars, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
  import pandas as pd
5
  import plotly.figure_factory as ff
6
  import plotly.graph_objects as go
7
  import streamlit as st
8
+ from sklearn.manifold import TSNE
9
 
10
  from clarin_datasets.dataset_to_show import DatasetToShow
11
  from clarin_datasets.utils import (
12
  count_num_of_characters,
13
  count_num_of_words,
14
  )
15
+ from clarin_datasets.utils import embed_sentence
16
 
17
 
18
  class AbusiveClausesDataset(DatasetToShow):
58
  dataframe_head = st.container()
59
  word_searching = st.container()
60
  dataset_statistics = st.container()
61
+ tsne_projection = st.container()
62
 
63
  with header:
64
  st.title(self.dataset_name)
194
  xaxis_title="Number of characters",
195
  )
196
  st.plotly_chart(fig_num_chars, use_container_width=True)
197
+
198
+ with tsne_projection:
199
+ st.header("t-SNE projection of the dataset")
200
+ subset_to_project = st.selectbox(
201
+ label="Select subset to project", options=self.subsets
202
+ )
203
+ sentences = self.data_dict[subset_to_project]["text"].values
204
+ reducer = TSNE(
205
+ n_components=2
206
+ )
207
+ embedded_sentences = np.array(
208
+ [embed_sentence(text) for text in sentences]
209
+ )
210
+ transformed_embeddings = reducer.fit_transform(embedded_sentences)
211
+ fig, ax = plt.subplots()
212
+ ax.scatter(
213
+ x=transformed_embeddings[:, 0],
214
+ y=transformed_embeddings[:, 1],
215
+ c=[
216
+ sns.color_palette()[x]
217
+ for x in self.data_dict[subset_to_project]["target"].map(
218
+ {
219
+ "BEZPIECZNE_POSTANOWIENIE_UMOWNE": 0,
220
+ "KLAUZULA_ABUZYWNA": 1
221
+ }
222
+ ).values
223
+ ],
224
+ )
225
+ st.pyplot(fig)
clarin_datasets/aspectemo_dataset.py CHANGED
@@ -1,8 +1,13 @@
 
 
1
  import pandas as pd
 
2
  from datasets import load_dataset
 
3
  import streamlit as st
4
 
5
  from clarin_datasets.dataset_to_show import DatasetToShow
 
6
 
7
 
8
  class AspectEmoDataset(DatasetToShow):
@@ -54,6 +59,7 @@ class AspectEmoDataset(DatasetToShow):
54
  dataframe_head = st.container()
55
  class_distribution = st.container()
56
  most_common_tokens = st.container()
 
57
 
58
  with header:
59
  st.title(self.dataset_name)
@@ -131,3 +137,39 @@ class AspectEmoDataset(DatasetToShow):
131
  )
132
  st.dataframe(df_to_show)
133
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
  import pandas as pd
4
+ import seaborn as sns
5
  from datasets import load_dataset
6
+ from sklearn.manifold import TSNE
7
  import streamlit as st
8
 
9
  from clarin_datasets.dataset_to_show import DatasetToShow
10
+ from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
11
 
12
 
13
  class AspectEmoDataset(DatasetToShow):
59
  dataframe_head = st.container()
60
  class_distribution = st.container()
61
  most_common_tokens = st.container()
62
+ tsne_projection = st.container()
63
 
64
  with header:
65
  st.title(self.dataset_name)
137
  )
138
  st.dataframe(df_to_show)
139
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
140
+
141
+ with tsne_projection:
142
+ st.header("t-SNE projection of the dataset")
143
+ subset_to_project = st.selectbox(
144
+ label="Select subset to project", options=self.subsets
145
+ )
146
+ tokens_unzipped = self.data_dict[subset_to_project]["tokens"].tolist()
147
+ tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
148
+ labels_unzipped = self.data_dict[subset_to_project]["labels"].tolist()
149
+ labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
150
+ df_unzipped = pd.DataFrame(
151
+ {
152
+ "tokens": tokens_unzipped,
153
+ "labels": labels_unzipped,
154
+ }
155
+ )
156
+ df_unzipped = df_unzipped.loc[df_unzipped["labels"] != 0]
157
+ tokens_unzipped = df_unzipped["tokens"].values
158
+ labels_unzipped = df_unzipped["labels"].values
159
+ embedded_tokens = np.array(
160
+ [embed_sentence(x) for x in tokens_unzipped]
161
+ )
162
+ reducer = TSNE(
163
+ n_components=2
164
+ )
165
+ transformed_embeddings = reducer.fit_transform(embedded_tokens)
166
+ fig, ax = plt.subplots()
167
+ ax.scatter(
168
+ x=transformed_embeddings[:, 0],
169
+ y=transformed_embeddings[:, 1],
170
+ c=[
171
+ PLOT_COLOR_PALETTE[x]
172
+ for x in labels_unzipped
173
+ ],
174
+ )
175
+ st.pyplot(fig)
clarin_datasets/cst_wikinews_dataset.py CHANGED
@@ -1,8 +1,12 @@
 
 
1
  import pandas as pd
2
  from datasets import load_dataset
 
3
  import streamlit as st
4
 
5
  from clarin_datasets.dataset_to_show import DatasetToShow
 
6
 
7
 
8
  class CSTWikinewsDataset(DatasetToShow):
@@ -23,7 +27,7 @@ class CSTWikinewsDataset(DatasetToShow):
23
  header = st.container()
24
  dataframe_head = st.container()
25
  class_distribution = st.container()
26
-
27
  with header:
28
  st.title(self.dataset_name)
29
 
@@ -54,3 +58,28 @@ class CSTWikinewsDataset(DatasetToShow):
54
 
55
  with class_distribution:
56
  st.dataframe(class_distribution_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
  import pandas as pd
4
  from datasets import load_dataset
5
+ from sklearn.manifold import TSNE
6
  import streamlit as st
7
 
8
  from clarin_datasets.dataset_to_show import DatasetToShow
9
+ from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
10
 
11
 
12
  class CSTWikinewsDataset(DatasetToShow):
27
  header = st.container()
28
  dataframe_head = st.container()
29
  class_distribution = st.container()
30
+ tsne_projection = st.container()
31
  with header:
32
  st.title(self.dataset_name)
33
 
58
 
59
  with class_distribution:
60
  st.dataframe(class_distribution_df)
61
+
62
+ with tsne_projection:
63
+ st.header("t-SNE projection of the dataset")
64
+ subset_to_project = st.selectbox(
65
+ label="Select subset to project", options=self.subsets
66
+ )
67
+ first_sentences = self.data_dict[subset_to_project]["sentence_1"].values
68
+ second_sentences = self.data_dict[subset_to_project]["sentence_2"].values
69
+ labels = self.data_dict[subset_to_project]["label"].values
70
+ first_sentences_embedded = np.array([embed_sentence(x) for x in first_sentences])
71
+ second_sentences_embedded = np.array([embed_sentence(x) for x in second_sentences])
72
+ mean_embeddings = (first_sentences_embedded + second_sentences_embedded) / 2
73
+ reducer = TSNE(
74
+ n_components=2
75
+ )
76
+ transformed_embeddings = reducer.fit_transform(mean_embeddings)
77
+ fig, ax = plt.subplots()
78
+ ax.scatter(
79
+ x=transformed_embeddings[:, 0],
80
+ y=transformed_embeddings[:, 1],
81
+ c=[
82
+ PLOT_COLOR_PALETTE[i] for i in labels
83
+ ]
84
+ )
85
+ st.pyplot(fig)
clarin_datasets/kpwr_ner_datasets.py CHANGED
@@ -1,8 +1,12 @@
 
 
1
  import pandas as pd
2
  from datasets import load_dataset
 
3
  import streamlit as st
4
 
5
  from clarin_datasets.dataset_to_show import DatasetToShow
 
6
 
7
 
8
  class KpwrNerDataset(DatasetToShow):
@@ -66,6 +70,7 @@ class KpwrNerDataset(DatasetToShow):
66
  dataframe_head = st.container()
67
  class_distribution = st.container()
68
  most_common_tokens = st.container()
 
69
 
70
  with header:
71
  st.title(self.dataset_name)
@@ -153,3 +158,44 @@ class KpwrNerDataset(DatasetToShow):
153
  )
154
  st.dataframe(df_to_show)
155
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
  import pandas as pd
4
  from datasets import load_dataset
5
+ from sklearn.manifold import TSNE
6
  import streamlit as st
7
 
8
  from clarin_datasets.dataset_to_show import DatasetToShow
9
+ from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
10
 
11
 
12
  class KpwrNerDataset(DatasetToShow):
70
  dataframe_head = st.container()
71
  class_distribution = st.container()
72
  most_common_tokens = st.container()
73
+ tsne_projection = st.container()
74
 
75
  with header:
76
  st.title(self.dataset_name)
158
  )
159
  st.dataframe(df_to_show)
160
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
161
+ SHOW_TSNE_PROJECTION = False
162
+ if SHOW_TSNE_PROJECTION:
163
+ with tsne_projection:
164
+ st.header("t-SNE projection of the dataset")
165
+ subset_to_project = st.selectbox(
166
+ label="Select subset to project", options=self.subsets
167
+ )
168
+ tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
169
+ tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
170
+ labels_unzipped = self.data_dict_named[subset_to_project]["ner"].tolist()
171
+ labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
172
+ df_unzipped = pd.DataFrame(
173
+ {
174
+ "tokens": tokens_unzipped,
175
+ "ner": labels_unzipped,
176
+ }
177
+ )
178
+ df_unzipped = df_unzipped.loc[
179
+ (df_unzipped["ner"] != "O")
180
+ & ~(df_unzipped["ner"].str.startswith("I-"))
181
+ ]
182
+ tokens_unzipped = df_unzipped["tokens"].values
183
+ labels_unzipped = df_unzipped["ner"].values
184
+ mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
185
+ labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
186
+ embedded_tokens = np.array(
187
+ [embed_sentence(x) for x in tokens_unzipped]
188
+ )
189
+ reducer = TSNE(
190
+ n_components=2
191
+ )
192
+ transformed_embeddings = reducer.fit_transform(embedded_tokens)
193
+ fig, ax = plt.subplots()
194
+ ax.scatter(
195
+ x=transformed_embeddings[:, 0],
196
+ y=transformed_embeddings[:, 1],
197
+ c=[
198
+ PLOT_COLOR_PALETTE[i] for i in labels_as_ints
199
+ ]
200
+ )
201
+ st.pyplot(fig)
clarin_datasets/nkjp_pos_dataset.py CHANGED
@@ -1,8 +1,16 @@
 
 
 
1
  import pandas as pd
2
  from datasets import load_dataset
 
3
  import streamlit as st
4
 
5
  from clarin_datasets.dataset_to_show import DatasetToShow
 
 
 
 
6
 
7
 
8
  class NkjpPosDataset(DatasetToShow):
@@ -65,6 +73,7 @@ class NkjpPosDataset(DatasetToShow):
65
  description = st.container()
66
  dataframe_head = st.container()
67
  class_distribution = st.container()
 
68
 
69
  with header:
70
  st.title(self.dataset_name)
@@ -112,3 +121,41 @@ class NkjpPosDataset(DatasetToShow):
112
  st.text_area(
113
  label="LaTeX code", value=class_distribution_df.style.to_latex()
114
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
  import pandas as pd
5
  from datasets import load_dataset
6
+ from sklearn.manifold import TSNE
7
  import streamlit as st
8
 
9
  from clarin_datasets.dataset_to_show import DatasetToShow
10
+ from clarin_datasets.utils import (
11
+ PLOT_COLOR_PALETTE,
12
+ embed_sentence
13
+ )
14
 
15
 
16
  class NkjpPosDataset(DatasetToShow):
73
  description = st.container()
74
  dataframe_head = st.container()
75
  class_distribution = st.container()
76
+ tsne_projection = st.container()
77
 
78
  with header:
79
  st.title(self.dataset_name)
121
  st.text_area(
122
  label="LaTeX code", value=class_distribution_df.style.to_latex()
123
  )
124
+ SHOW_TSNE_PROJECTION = False
125
+ if SHOW_TSNE_PROJECTION:
126
+ with tsne_projection:
127
+ st.header("t-SNE projection of the dataset")
128
+ subset_to_project = st.selectbox(
129
+ label="Select subset to project", options=self.subsets
130
+ )
131
+ tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
132
+ tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
133
+ labels_unzipped = self.data_dict_named[subset_to_project]["tags"].tolist()
134
+ labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
135
+ df_unzipped = pd.DataFrame(
136
+ {
137
+ "tokens": tokens_unzipped,
138
+ "tags": labels_unzipped,
139
+ }
140
+ )
141
+ tokens_unzipped = df_unzipped["tokens"].values
142
+ labels_unzipped = df_unzipped["tags"].values
143
+ mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
144
+ labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
145
+ embedded_tokens = np.array(
146
+ [embed_sentence(x) for x in tokens_unzipped]
147
+ )
148
+ reducer = TSNE(
149
+ n_components=2
150
+ )
151
+ transformed_embeddings = reducer.fit_transform(embedded_tokens)
152
+ fig, ax = plt.subplots()
153
+ ax.scatter(
154
+ x=transformed_embeddings[:, 0],
155
+ y=transformed_embeddings[:, 1],
156
+ c=[
157
+ PLOT_COLOR_PALETTE[i]
158
+ for i in labels_as_ints
159
+ ],
160
+ )
161
+ st.pyplot(fig)
clarin_datasets/polemo_dataset.py CHANGED
@@ -1,13 +1,19 @@
 
 
 
1
  from datasets import load_dataset
2
  import pandas as pd
3
  import plotly.figure_factory as ff
4
  import plotly.graph_objects as go
 
5
  import streamlit as st
6
 
7
  from clarin_datasets.dataset_to_show import DatasetToShow
8
  from clarin_datasets.utils import (
9
  count_num_of_characters,
10
  count_num_of_words,
 
 
11
  )
12
 
13
 
@@ -38,6 +44,7 @@ class PolemoDataset(DatasetToShow):
38
  dataframe_head = st.container()
39
  word_searching = st.container()
40
  dataset_statistics = st.container()
 
41
 
42
  with header:
43
  st.title(self.dataset_name)
@@ -201,3 +208,27 @@ class PolemoDataset(DatasetToShow):
201
  xaxis_title="Number of characters",
202
  )
203
  st.plotly_chart(fig_num_chars, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import seaborn as sns
4
  from datasets import load_dataset
5
  import pandas as pd
6
  import plotly.figure_factory as ff
7
  import plotly.graph_objects as go
8
+ from sklearn.manifold import TSNE
9
  import streamlit as st
10
 
11
  from clarin_datasets.dataset_to_show import DatasetToShow
12
  from clarin_datasets.utils import (
13
  count_num_of_characters,
14
  count_num_of_words,
15
+ embed_sentence,
16
+ PLOT_COLOR_PALETTE
17
  )
18
 
19
 
44
  dataframe_head = st.container()
45
  word_searching = st.container()
46
  dataset_statistics = st.container()
47
+ tsne_projection = st.container()
48
 
49
  with header:
50
  st.title(self.dataset_name)
208
  xaxis_title="Number of characters",
209
  )
210
  st.plotly_chart(fig_num_chars, use_container_width=True)
211
+
212
+ with tsne_projection:
213
+ st.header("t-SNE projection of the dataset")
214
+ subset_to_project = st.selectbox(
215
+ label="Select subset to project", options=self.subsets
216
+ )
217
+ sentences = self.data_dict[subset_to_project]["text"].values
218
+ reducer = TSNE(
219
+ n_components=2
220
+ )
221
+ embedded_sentences = np.array(
222
+ [embed_sentence(text) for text in sentences]
223
+ )
224
+ transformed_embeddings = reducer.fit_transform(embedded_sentences)
225
+ fig, ax = plt.subplots()
226
+ ax.scatter(
227
+ x=transformed_embeddings[:, 0],
228
+ y=transformed_embeddings[:, 1],
229
+ c=[
230
+ PLOT_COLOR_PALETTE[x]
231
+ for x in self.data_dict[subset_to_project]["target"].values
232
+ ],
233
+ )
234
+ st.pyplot(fig)
clarin_datasets/punctuation_restoration_dataset.py CHANGED
@@ -1,8 +1,12 @@
 
 
1
  import pandas as pd
2
  from datasets import load_dataset
 
3
  import streamlit as st
4
 
5
  from clarin_datasets.dataset_to_show import DatasetToShow
 
6
 
7
 
8
  class PunctuationRestorationDataset(DatasetToShow):
@@ -68,6 +72,7 @@ class PunctuationRestorationDataset(DatasetToShow):
68
  description = st.container()
69
  dataframe_head = st.container()
70
  class_distribution = st.container()
 
71
 
72
  with header:
73
  st.title(self.dataset_name)
@@ -116,3 +121,39 @@ class PunctuationRestorationDataset(DatasetToShow):
116
  st.text_area(
117
  label="LaTeX code", value=class_distribution_df.style.to_latex()
118
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
  import pandas as pd
4
  from datasets import load_dataset
5
+ from sklearn.manifold import TSNE
6
  import streamlit as st
7
 
8
  from clarin_datasets.dataset_to_show import DatasetToShow
9
+ from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
10
 
11
 
12
  class PunctuationRestorationDataset(DatasetToShow):
72
  description = st.container()
73
  dataframe_head = st.container()
74
  class_distribution = st.container()
75
+ tsne_projection = st.container()
76
 
77
  with header:
78
  st.title(self.dataset_name)
121
  st.text_area(
122
  label="LaTeX code", value=class_distribution_df.style.to_latex()
123
  )
124
+ with tsne_projection:
125
+ st.header("t-SNE projection of the dataset")
126
+ subset_to_project = st.selectbox(
127
+ label="Select subset to project", options=self.subsets
128
+ )
129
+ tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
130
+ tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
131
+ labels_unzipped = self.data_dict_named[subset_to_project]["tags"].tolist()
132
+ labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
133
+ df_unzipped = pd.DataFrame(
134
+ {
135
+ "tokens": tokens_unzipped,
136
+ "tags": labels_unzipped,
137
+ }
138
+ )
139
+ df_unzipped = df_unzipped.loc[df_unzipped["tags"] != "O"]
140
+ tokens_unzipped = df_unzipped["tokens"].values
141
+ labels_unzipped = df_unzipped["tags"].values
142
+ mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
143
+ labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
144
+ embedded_tokens = np.array(
145
+ [embed_sentence(x) for x in tokens_unzipped]
146
+ )
147
+ reducer = TSNE(
148
+ n_components=2
149
+ )
150
+ transformed_embeddings = reducer.fit_transform(embedded_tokens)
151
+ fig, ax = plt.subplots()
152
+ ax.scatter(
153
+ x=transformed_embeddings[:, 0],
154
+ y=transformed_embeddings[:, 1],
155
+ c=[
156
+ PLOT_COLOR_PALETTE[i] for i in labels_as_ints
157
+ ]
158
+ )
159
+ st.pyplot(fig)
clarin_datasets/utils.py CHANGED
@@ -1,8 +1,56 @@
1
  import re
2
  from typing import List
3
 
 
 
 
4
  from unidecode import unidecode
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  def flatten_list(main_list: List[List]) -> List:
8
  return [item for sublist in main_list for item in sublist]
@@ -14,3 +62,9 @@ def count_num_of_characters(text: str) -> int:
14
 
15
  def count_num_of_words(text: str) -> int:
16
  return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
 
 
 
 
 
 
1
  import re
2
  from typing import List
3
 
4
+ from embeddings.embedding.auto_flair import AutoFlairDocumentEmbedding
5
+ from flair.data import Sentence
6
+ from numpy import typing as nt
7
  from unidecode import unidecode
8
 
9
+ embedding = AutoFlairDocumentEmbedding.from_hub("clarin-pl/word2vec-kgr10")
10
+
11
# Hex colours looked up by integer class label when colouring scatter points
# (callers index this list as ``PLOT_COLOR_PALETTE[label]``).
# Every entry must be unique, otherwise two labels become indistinguishable
# in the plot. The list originally held "#00FFFF" twice (index 1 and
# index 15); index 15 now uses a colour that appears nowhere else.
# The length (40) is unchanged, so any label index valid before stays valid.
PLOT_COLOR_PALETTE = [
    "#FAEBD7",
    "#00FFFF",
    "#7FFFD4",
    "#000000",
    "#0000FF",
    "#8A2BE2",
    "#A52A2A",
    "#DEB887",
    "#5F9EA0",
    "#7FFF00",
    "#D2691E",
    "#FF7F50",
    "#6495ED",
    "#FFF8DC",
    "#DC143C",
    "#FF1493",  # was a second "#00FFFF", duplicating index 1
    "#00008B",
    "#008B8B",
    "#B8860B",
    "#A9A9A9",
    "#006400",
    "#BDB76B",
    "#8B008B",
    "#556B2F",
    "#FF8C00",
    "#9932CC",
    "#8B0000",
    "#E9967A",
    "#8FBC8F",
    "#2F4F4F",
    "#00CED1",
    "#FFD700",
    "#DAA520",
    "#808080",
    "#FF69B4",
    "#4B0082",
    "#CD5C5C",
    "#7CFC00",
    "#F08080",
    "#66CDAA",
]
53
+
54
 
55
  def flatten_list(main_list: List[List]) -> List:
56
  return [item for sublist in main_list for item in sublist]
62
 
63
  def count_num_of_words(text: str) -> int:
64
  return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
65
+
66
+
67
def embed_sentence(sentence: str) -> nt.NDArray:
    """Embed a single piece of text with the module-level ``embedding`` model.

    The text is wrapped in a flair ``Sentence``, handed to
    ``embedding.embed`` (which stores the result on the sentence object),
    and the stored vector is returned as a NumPy array.

    Args:
        sentence: Raw text to embed.

    Returns:
        The embedding attached to the flair sentence, converted to NumPy.
        NOTE(review): the vector length depends on the loaded model -- confirm
        before relying on a fixed dimensionality.
    """
    # Use a dedicated name rather than rebinding the ``str`` parameter to a
    # different type.
    flair_sentence = Sentence(sentence)
    embedding.embed([flair_sentence])
    # ``Tensor.numpy()`` raises for tensors that live on a GPU or that track
    # gradients; detaching and moving to CPU first is a no-op otherwise.
    return flair_sentence.embedding.detach().cpu().numpy()
requirements.txt CHANGED
@@ -4,6 +4,12 @@ pyperclip==1.8.2
4
  streamlit==1.11.0
5
  Unidecode==1.3.4
6
  scipy
7
- datasets
8
  umap-learn
9
- clarinpl-embeddings
 
 
 
 
 
 
4
  streamlit==1.11.0
5
  Unidecode==1.3.4
6
  scipy
7
+ datasets~=2.4.0
8
  umap-learn
9
+ clarinpl-embeddings
10
+ pynndescent
11
+ flair~=0.11.3
12
+ numpy~=1.23.3
13
+ matplotlib~=3.5.3
14
+ seaborn~=0.12.0
15
+ scikit-learn~=1.1.2