Spaces:

clarin-pl
/

datasets-explorer

Runtime error

App Files Files Community

Mariusz Kossakowski committed on Aug 30, 2022

Commit

c76a6b2

1 Parent(s): d405df4

Add stuff in KPWR-NER dataset

Browse files

Files changed (1) hide show

clarin_datasets/kpwr_ner_datasets.py +88 -0

clarin_datasets/kpwr_ner_datasets.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from datasets import load_dataset
 import streamlit as st
@@ -7,6 +8,7 @@ from clarin_datasets.dataset_to_show import DatasetToShow
 class KpwrNerDataset(DatasetToShow):
     def __init__(self):
         DatasetToShow.__init__(self)
         self.dataset_name = "clarin-pl/kpwr-ner"
         self.description = """
         KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
@@ -36,11 +38,29 @@ class KpwrNerDataset(DatasetToShow):
         self.data_dict = {
             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
         }
     def show_dataset(self):
         header = st.container()
         description = st.container()
         dataframe_head = st.container()
         with header:
             st.title(self.dataset_name)
@@ -48,3 +68,71 @@ class KpwrNerDataset(DatasetToShow):
         with description:
             st.header("Dataset description")
             st.write(self.description)

+import pandas as pd
 from datasets import load_dataset
 import streamlit as st
 class KpwrNerDataset(DatasetToShow):
     def __init__(self):
         DatasetToShow.__init__(self)
+        self.data_dict_named = None
         self.dataset_name = "clarin-pl/kpwr-ner"
         self.description = """
         KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
         self.data_dict = {
             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
         }
+        self.data_dict_named = {}
+        for subset in self.subsets:
+            references = raw_dataset[subset]["ner"]
+            references_named = [
+                [
+                    raw_dataset[subset].features["ner"].feature.names[label]
+                    for label in labels
+                ]
+                for labels in references
+            ]
+            self.data_dict_named[subset] = pd.DataFrame(
+                {
+                    "tokens": self.data_dict[subset]["tokens"],
+                    "ner": references_named,
+                }
+            )
     def show_dataset(self):
         """Render the dataset overview page with Streamlit.

         Draws, top to bottom: the title, the description, the first 10 rows
         of the concatenated subsets, the per-subset class distribution and
         the 10 most frequent tokens of a user-selected class. Each table is
         accompanied by a text area holding its LaTeX export.

         Reads ``self.dataset_name``, ``self.description``, ``self.subsets``,
         ``self.data_dict`` and ``self.data_dict_named``. Returns nothing.
         """
         # Containers are created up front so the page layout is fixed
         # regardless of the order in which they are filled below.
         header = st.container()
         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
         most_common_tokens = st.container()

         with header:
             st.title(self.dataset_name)

         with description:
             st.header("Dataset description")
             st.write(self.description)

         full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
         named_dataframe = pd.concat(self.data_dict_named.values(), axis="rows")

         # Flatten the per-sentence sequences into one flat list each.
         # NOTE(review): token/label alignment relies on both dicts holding
         # the subsets in the same order -- confirm against the loader.
         tokens_all = [
             token
             for sentence in full_dataframe["tokens"].tolist()
             for token in sentence
         ]
         labels_all = [
             label
             for sentence in named_dataframe["ner"].tolist()
             for label in sentence
         ]

         with dataframe_head:
             df_to_show = full_dataframe.head(10)
             st.header("First 10 observations of the dataset")
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

         class_distribution_dict = {}
         for subset in self.subsets:
             # Keep only labels other than "O" and "I-*", so that each
             # remaining label is counted once per tagged span.
             subset_labels = pd.Series(
                 [
                     label
                     for sentence in self.data_dict_named[subset]["ner"].tolist()
                     for label in sentence
                     if label != "O" and not label.startswith("I-")
                 ]
             )
             # Name the index and the values explicitly instead of renaming
             # the columns produced by reset_index(): those default names
             # differ between pandas 1.x ("index", 0) and 2.x
             # ("index", "proportion").
             class_distribution_dict[subset] = (
                 subset_labels.value_counts(normalize=True)
                 .sort_index()
                 .rename_axis("class")
                 .rename(subset)
                 .reset_index()
             )

         class_distribution_df = pd.merge(
             class_distribution_dict["train"],
             class_distribution_dict["test"],
             on="class",
         )

         with class_distribution:
             st.header("Class distribution in each subset (without 'O' and 'I-*')")
             st.dataframe(class_distribution_df)
             st.text_area(
                 label="LaTeX code", value=class_distribution_df.style.to_latex()
             )

         # Most common tokens from the selected class (without "O").
         full_df_unzipped = pd.DataFrame(
             {
                 "token": tokens_all,
                 "ner": labels_all,
             }
         )
         # Labels are class-name strings here, so the outside tag is the
         # letter "O"; comparing against the integer 0 would filter nothing.
         full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["ner"] != "O"]
         possible_options = sorted(full_df_unzipped["ner"].unique())

         with most_common_tokens:
             st.header("10 most common tokens from selected class (without 'O')")
             selected_class = st.selectbox(
                 label="Select class to show", options=possible_options
             )
             df_to_show = (
                 full_df_unzipped.loc[full_df_unzipped["ner"] == selected_class]
                 .groupby(["token"])
                 .count()
                 .reset_index()
                 .rename({"ner": "no_of_occurrences"}, axis=1)
                 .sort_values(by="no_of_occurrences", ascending=False)
                 .reset_index(drop=True)
                 .head(10)
             )
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())