Spaces:

clarin-pl
/

datasets-explorer

Runtime error

App Files Community

Mariusz Kossakowski committed on Sep 23, 2022

Commit

77405f7

1 Parent(s): d1e0fdf

Add tSNE projection

Browse files

Files changed (9) hide show

clarin_datasets/abusive_clauses_dataset.py +35 -0
clarin_datasets/aspectemo_dataset.py +42 -0
clarin_datasets/cst_wikinews_dataset.py +30 -1
clarin_datasets/kpwr_ner_datasets.py +46 -0
clarin_datasets/nkjp_pos_dataset.py +47 -0
clarin_datasets/polemo_dataset.py +31 -0
clarin_datasets/punctuation_restoration_dataset.py +41 -0
clarin_datasets/utils.py +54 -0
requirements.txt +8 -2

clarin_datasets/abusive_clauses_dataset.py CHANGED Viewed

@@ -1,13 +1,18 @@
 import pandas as pd
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 from clarin_datasets.utils import (
     count_num_of_characters,
     count_num_of_words,
 )
 class AbusiveClausesDataset(DatasetToShow):
@@ -53,6 +58,7 @@ class AbusiveClausesDataset(DatasetToShow):
         dataframe_head = st.container()
         word_searching = st.container()
         dataset_statistics = st.container()
         with header:
             st.title(self.dataset_name)
@@ -188,3 +194,32 @@ class AbusiveClausesDataset(DatasetToShow):
                 xaxis_title="Number of characters",
             )
             st.plotly_chart(fig_num_chars, use_container_width=True)

+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 import pandas as pd
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 import streamlit as st
+from sklearn.manifold import TSNE
 from clarin_datasets.dataset_to_show import DatasetToShow
 from clarin_datasets.utils import (
     count_num_of_characters,
     count_num_of_words,
 )
+from clarin_datasets.utils import embed_sentence
 class AbusiveClausesDataset(DatasetToShow):
         dataframe_head = st.container()
         word_searching = st.container()
         dataset_statistics = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
                 xaxis_title="Number of characters",
             )
             st.plotly_chart(fig_num_chars, use_container_width=True)
+            with tsne_projection:
+                st.header("t-SNE projection of the dataset")
+                subset_to_project = st.selectbox(
+                    label="Select subset to project", options=self.subsets
+                )
+                sentences = self.data_dict[subset_to_project]["text"].values
+                reducer = TSNE(
+                    n_components=2
+                )
+                embedded_sentences = np.array(
+                    [embed_sentence(text) for text in sentences]
+                )
+                transformed_embeddings = reducer.fit_transform(embedded_sentences)
+                fig, ax = plt.subplots()
+                ax.scatter(
+                    x=transformed_embeddings[:, 0],
+                    y=transformed_embeddings[:, 1],
+                    c=[
+                        sns.color_palette()[x]
+                        for x in self.data_dict[subset_to_project]["target"].map(
+                            {
+                                "BEZPIECZNE_POSTANOWIENIE_UMOWNE": 0,
+                                "KLAUZULA_ABUZYWNA": 1
+                            }
+                        ).values
+                    ],
+                )
+                st.pyplot(fig)

clarin_datasets/aspectemo_dataset.py CHANGED Viewed

@@ -1,8 +1,13 @@
 import pandas as pd
 from datasets import load_dataset
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 class AspectEmoDataset(DatasetToShow):
@@ -54,6 +59,7 @@ class AspectEmoDataset(DatasetToShow):
         dataframe_head = st.container()
         class_distribution = st.container()
         most_common_tokens = st.container()
         with header:
             st.title(self.dataset_name)
@@ -131,3 +137,39 @@ class AspectEmoDataset(DatasetToShow):
             )
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

+import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
+import seaborn as sns
 from datasets import load_dataset
+from sklearn.manifold import TSNE
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
+from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
 class AspectEmoDataset(DatasetToShow):
         dataframe_head = st.container()
         class_distribution = st.container()
         most_common_tokens = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
             )
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
+            with tsne_projection:
+                st.header("t-SNE projection of the dataset")
+                subset_to_project = st.selectbox(
+                    label="Select subset to project", options=self.subsets
+                )
+                tokens_unzipped = self.data_dict[subset_to_project]["tokens"].tolist()
+                tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
+                labels_unzipped = self.data_dict[subset_to_project]["labels"].tolist()
+                labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
+                df_unzipped = pd.DataFrame(
+                    {
+                        "tokens": tokens_unzipped,
+                        "labels": labels_unzipped,
+                    }
+                )
+                df_unzipped = df_unzipped.loc[df_unzipped["labels"] != 0]
+                tokens_unzipped = df_unzipped["tokens"].values
+                labels_unzipped = df_unzipped["labels"].values
+                embedded_tokens = np.array(
+                    [embed_sentence(x) for x in tokens_unzipped]
+                )
+                reducer = TSNE(
+                    n_components=2
+                )
+                transformed_embeddings = reducer.fit_transform(embedded_tokens)
+                fig, ax = plt.subplots()
+                ax.scatter(
+                    x=transformed_embeddings[:, 0],
+                    y=transformed_embeddings[:, 1],
+                    c=[
+                        PLOT_COLOR_PALETTE[x]
+                        for x in labels_unzipped
+                    ],
+                )
+                st.pyplot(fig)

clarin_datasets/cst_wikinews_dataset.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import pandas as pd
 from datasets import load_dataset
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 class CSTWikinewsDataset(DatasetToShow):
@@ -23,7 +27,7 @@ class CSTWikinewsDataset(DatasetToShow):
         header = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
         with header:
             st.title(self.dataset_name)
@@ -54,3 +58,28 @@ class CSTWikinewsDataset(DatasetToShow):
         with class_distribution:
             st.dataframe(class_distribution_df)

+import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
 from datasets import load_dataset
+from sklearn.manifold import TSNE
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
+from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
 class CSTWikinewsDataset(DatasetToShow):
         header = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
         with class_distribution:
             st.dataframe(class_distribution_df)
+        with tsne_projection:
+            st.header("t-SNE projection of the dataset")
+            subset_to_project = st.selectbox(
+                label="Select subset to project", options=self.subsets
+            )
+            first_sentences = self.data_dict[subset_to_project]["sentence_1"].values
+            second_sentences = self.data_dict[subset_to_project]["sentence_2"].values
+            labels = self.data_dict[subset_to_project]["label"].values
+            first_sentences_embedded = np.array([embed_sentence(x) for x in first_sentences])
+            second_sentences_embedded = np.array([embed_sentence(x) for x in second_sentences])
+            mean_embeddings = (first_sentences_embedded + second_sentences_embedded) / 2
+            reducer = TSNE(
+                n_components=2
+            )
+            transformed_embeddings = reducer.fit_transform(mean_embeddings)
+            fig, ax = plt.subplots()
+            ax.scatter(
+                x=transformed_embeddings[:, 0],
+                y=transformed_embeddings[:, 1],
+                c=[
+                    PLOT_COLOR_PALETTE[i] for i in labels
+                ]
+            )
+            st.pyplot(fig)

clarin_datasets/kpwr_ner_datasets.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import pandas as pd
 from datasets import load_dataset
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 class KpwrNerDataset(DatasetToShow):
@@ -66,6 +70,7 @@ class KpwrNerDataset(DatasetToShow):
         dataframe_head = st.container()
         class_distribution = st.container()
         most_common_tokens = st.container()
         with header:
             st.title(self.dataset_name)
@@ -153,3 +158,44 @@ class KpwrNerDataset(DatasetToShow):
                 )
                 st.dataframe(df_to_show)
                 st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

+import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
 from datasets import load_dataset
+from sklearn.manifold import TSNE
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
+from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
 class KpwrNerDataset(DatasetToShow):
         dataframe_head = st.container()
         class_distribution = st.container()
         most_common_tokens = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
                 )
                 st.dataframe(df_to_show)
                 st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
+            SHOW_TSNE_PROJECTION = False
+            if SHOW_TSNE_PROJECTION:
+                with tsne_projection:
+                    st.header("t-SNE projection of the dataset")
+                    subset_to_project = st.selectbox(
+                        label="Select subset to project", options=self.subsets
+                    )
+                    tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
+                    tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
+                    labels_unzipped = self.data_dict_named[subset_to_project]["ner"].tolist()
+                    labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
+                    df_unzipped = pd.DataFrame(
+                        {
+                            "tokens": tokens_unzipped,
+                            "ner": labels_unzipped,
+                        }
+                    )
+                    df_unzipped = df_unzipped.loc[
+                        (df_unzipped["ner"] != "O")
+                        & ~(df_unzipped["ner"].str.startswith("I-"))
+                    ]
+                    tokens_unzipped = df_unzipped["tokens"].values
+                    labels_unzipped = df_unzipped["ner"].values
+                    mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
+                    labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
+                    embedded_tokens = np.array(
+                        [embed_sentence(x) for x in tokens_unzipped]
+                    )
+                    reducer = TSNE(
+                        n_components=2
+                    )
+                    transformed_embeddings = reducer.fit_transform(embedded_tokens)
+                    fig, ax = plt.subplots()
+                    ax.scatter(
+                        x=transformed_embeddings[:, 0],
+                        y=transformed_embeddings[:, 1],
+                        c=[
+                            PLOT_COLOR_PALETTE[i] for i in labels_as_ints
+                        ]
+                    )
+                    st.pyplot(fig)

clarin_datasets/nkjp_pos_dataset.py CHANGED Viewed

@@ -1,8 +1,16 @@
 import pandas as pd
 from datasets import load_dataset
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 class NkjpPosDataset(DatasetToShow):
@@ -65,6 +73,7 @@ class NkjpPosDataset(DatasetToShow):
         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
         with header:
             st.title(self.dataset_name)
@@ -112,3 +121,41 @@ class NkjpPosDataset(DatasetToShow):
             st.text_area(
                 label="LaTeX code", value=class_distribution_df.style.to_latex()
             )

+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 import pandas as pd
 from datasets import load_dataset
+from sklearn.manifold import TSNE
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
+from clarin_datasets.utils import (
+    PLOT_COLOR_PALETTE,
+    embed_sentence
+)
 class NkjpPosDataset(DatasetToShow):
         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
             st.text_area(
                 label="LaTeX code", value=class_distribution_df.style.to_latex()
             )
+        SHOW_TSNE_PROJECTION = False
+        if SHOW_TSNE_PROJECTION:
+            with tsne_projection:
+                st.header("t-SNE projection of the dataset")
+                subset_to_project = st.selectbox(
+                    label="Select subset to project", options=self.subsets
+                )
+                tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
+                tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
+                labels_unzipped = self.data_dict_named[subset_to_project]["tags"].tolist()
+                labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
+                df_unzipped = pd.DataFrame(
+                    {
+                        "tokens": tokens_unzipped,
+                        "tags": labels_unzipped,
+                    }
+                )
+                tokens_unzipped = df_unzipped["tokens"].values
+                labels_unzipped = df_unzipped["tags"].values
+                mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
+                labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
+                embedded_tokens = np.array(
+                    [embed_sentence(x) for x in tokens_unzipped]
+                )
+                reducer = TSNE(
+                    n_components=2
+                )
+                transformed_embeddings = reducer.fit_transform(embedded_tokens)
+                fig, ax = plt.subplots()
+                ax.scatter(
+                    x=transformed_embeddings[:, 0],
+                    y=transformed_embeddings[:, 1],
+                    c=[
+                         PLOT_COLOR_PALETTE[i]
+                         for i in labels_as_ints
+                    ],
+                )
+                st.pyplot(fig)

clarin_datasets/polemo_dataset.py CHANGED Viewed

@@ -1,13 +1,19 @@
 from datasets import load_dataset
 import pandas as pd
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 from clarin_datasets.utils import (
     count_num_of_characters,
     count_num_of_words,
 )
@@ -38,6 +44,7 @@ class PolemoDataset(DatasetToShow):
         dataframe_head = st.container()
         word_searching = st.container()
         dataset_statistics = st.container()
         with header:
             st.title(self.dataset_name)
@@ -201,3 +208,27 @@ class PolemoDataset(DatasetToShow):
                 xaxis_title="Number of characters",
             )
             st.plotly_chart(fig_num_chars, use_container_width=True)

+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
 from datasets import load_dataset
 import pandas as pd
 import plotly.figure_factory as ff
 import plotly.graph_objects as go
+from sklearn.manifold import TSNE
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 from clarin_datasets.utils import (
     count_num_of_characters,
     count_num_of_words,
+    embed_sentence,
+    PLOT_COLOR_PALETTE
 )
         dataframe_head = st.container()
         word_searching = st.container()
         dataset_statistics = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
                 xaxis_title="Number of characters",
             )
             st.plotly_chart(fig_num_chars, use_container_width=True)
+            with tsne_projection:
+                st.header("t-SNE projection of the dataset")
+                subset_to_project = st.selectbox(
+                    label="Select subset to project", options=self.subsets
+                )
+                sentences = self.data_dict[subset_to_project]["text"].values
+                reducer = TSNE(
+                    n_components=2
+                )
+                embedded_sentences = np.array(
+                    [embed_sentence(text) for text in sentences]
+                )
+                transformed_embeddings = reducer.fit_transform(embedded_sentences)
+                fig, ax = plt.subplots()
+                ax.scatter(
+                    x=transformed_embeddings[:, 0],
+                    y=transformed_embeddings[:, 1],
+                    c=[
+                        PLOT_COLOR_PALETTE[x]
+                        for x in self.data_dict[subset_to_project]["target"].values
+                    ],
+                )
+                st.pyplot(fig)

clarin_datasets/punctuation_restoration_dataset.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import pandas as pd
 from datasets import load_dataset
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
 class PunctuationRestorationDataset(DatasetToShow):
@@ -68,6 +72,7 @@ class PunctuationRestorationDataset(DatasetToShow):
         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
         with header:
             st.title(self.dataset_name)
@@ -116,3 +121,39 @@ class PunctuationRestorationDataset(DatasetToShow):
             st.text_area(
                 label="LaTeX code", value=class_distribution_df.style.to_latex()
             )

+import numpy as np
+import matplotlib.pyplot as plt
 import pandas as pd
 from datasets import load_dataset
+from sklearn.manifold import TSNE
 import streamlit as st
 from clarin_datasets.dataset_to_show import DatasetToShow
+from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE
 class PunctuationRestorationDataset(DatasetToShow):
         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
+        tsne_projection = st.container()
         with header:
             st.title(self.dataset_name)
             st.text_area(
                 label="LaTeX code", value=class_distribution_df.style.to_latex()
             )
+        with tsne_projection:
+            st.header("t-SNE projection of the dataset")
+            subset_to_project = st.selectbox(
+                label="Select subset to project", options=self.subsets
+            )
+            tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
+            tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
+            labels_unzipped = self.data_dict_named[subset_to_project]["tags"].tolist()
+            labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
+            df_unzipped = pd.DataFrame(
+                {
+                    "tokens": tokens_unzipped,
+                    "tags": labels_unzipped,
+                }
+            )
+            df_unzipped = df_unzipped.loc[df_unzipped["tags"] != "O"]
+            tokens_unzipped = df_unzipped["tokens"].values
+            labels_unzipped = df_unzipped["tags"].values
+            mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
+            labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
+            embedded_tokens = np.array(
+                [embed_sentence(x) for x in tokens_unzipped]
+            )
+            reducer = TSNE(
+                n_components=2
+            )
+            transformed_embeddings = reducer.fit_transform(embedded_tokens)
+            fig, ax = plt.subplots()
+            ax.scatter(
+                x=transformed_embeddings[:, 0],
+                y=transformed_embeddings[:, 1],
+                c=[
+                    PLOT_COLOR_PALETTE[i] for i in labels_as_ints
+                ]
+            )
+            st.pyplot(fig)

clarin_datasets/utils.py CHANGED Viewed

@@ -1,8 +1,56 @@
 import re
 from typing import List
 from unidecode import unidecode
 def flatten_list(main_list: List[List]) -> List:
     """Flatten one level of nesting.

     Args:
         main_list: A list whose elements are themselves lists.

     Returns:
         A new list holding the items of every sublist, in original order.
     """
     flattened: List = []
     for sublist in main_list:
         flattened.extend(sublist)
     return flattened
@@ -14,3 +62,9 @@ def count_num_of_characters(text: str) -> int:
 def count_num_of_words(text: str) -> int:
     """Count the words in ``text``.

     The text is passed through ``unidecode`` and every character that is
     neither an ASCII letter nor whitespace is removed; the remaining
     whitespace-separated tokens are counted.

     Args:
         text: The text to analyse.

     Returns:
         The number of non-empty tokens; ``0`` for empty or letter-free text.
     """
     # Keep all whitespace (not only " ") so words separated by newlines or
     # tabs are not glued together when the other characters are stripped.
     letters_only = re.sub(r"[^a-zA-Z\s]", "", unidecode(text))
     # split() without a separator collapses whitespace runs and never yields
     # empty strings, unlike split(" "), which counted "" as one word.
     return len(letters_only.split())

 import re
 from typing import List
+from embeddings.embedding.auto_flair import AutoFlairDocumentEmbedding
+from flair.data import Sentence
+from numpy import typing as nt
 from unidecode import unidecode
+embedding = AutoFlairDocumentEmbedding.from_hub("clarin-pl/word2vec-kgr10")
+PLOT_COLOR_PALETTE = [
+    "#FAEBD7",
+    "#00FFFF",
+    "#7FFFD4",
+    "#000000",
+    "#0000FF",
+    "#8A2BE2",
+    "#A52A2A",
+    "#DEB887",
+    "#5F9EA0",
+    "#7FFF00",
+    "#D2691E",
+    "#FF7F50",
+    "#6495ED",
+    "#FFF8DC",
+    "#DC143C",
+    "#00FFFF",
+    "#00008B",
+    "#008B8B",
+    "#B8860B",
+    "#A9A9A9",
+    "#006400",
+    "#BDB76B",
+    "#8B008B",
+    "#556B2F",
+    "#FF8C00",
+    "#9932CC",
+    "#8B0000",
+    "#E9967A",
+    "#8FBC8F",
+    "#2F4F4F",
+    "#00CED1",
+    "#FFD700",
+    "#DAA520",
+    "#808080",
+    "#FF69B4",
+    "#4B0082",
+    "#CD5C5C",
+    "#7CFC00",
+    "#F08080",
+    "#66CDAA",
+]
 def flatten_list(main_list: List[List]) -> List:
     """Flatten one level of nesting.

     Args:
         main_list: A list whose elements are themselves lists.

     Returns:
         A new list holding the items of every sublist, in original order.
     """
     flattened: List = []
     for sublist in main_list:
         flattened.extend(sublist)
     return flattened
 def count_num_of_words(text: str) -> int:
     """Count the words in ``text``.

     The text is passed through ``unidecode`` and every character that is
     neither an ASCII letter nor whitespace is removed; the remaining
     whitespace-separated tokens are counted.

     Args:
         text: The text to analyse.

     Returns:
         The number of non-empty tokens; ``0`` for empty or letter-free text.
     """
     # Keep all whitespace (not only " ") so words separated by newlines or
     # tabs are not glued together when the other characters are stripped.
     letters_only = re.sub(r"[^a-zA-Z\s]", "", unidecode(text))
     # split() without a separator collapses whitespace runs and never yields
     # empty strings, unlike split(" "), which counted "" as one word.
     return len(letters_only.split())
+def embed_sentence(sentence: str) -> nt.NDArray:
+    sentence = Sentence(sentence)
+    embedding.embed([sentence])
+    return sentence.embedding.numpy()

requirements.txt CHANGED Viewed

@@ -4,6 +4,12 @@ pyperclip==1.8.2
 streamlit==1.11.0
 Unidecode==1.3.4
 scipy
-datasets
 umap-learn
-clarinpl-embeddings

 streamlit==1.11.0
 Unidecode==1.3.4
 scipy
+datasets~=2.4.0
 umap-learn
+clarinpl-embeddings
+pynndescent
+flair~=0.11.3
+numpy~=1.23.3
+matplotlib~=3.5.3
+seaborn~=0.12.0
+scikit-learn~=1.1.2