import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset
from sklearn.manifold import TSNE
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow
from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE


class KpwrNerDataset(DatasetToShow):
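    """Streamlit presentation of the clarin-pl/kpwr-ner dataset (fine-grained Polish NER)."""
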
    def __init__(self):
        DatasetToShow.__init__(self)
        self.data_dict_named = None
        self.dataset_name = "clarin-pl/kpwr-ner"
        self.description = [
            f"""
            Dataset link: https://huggingface.co/datasets/{self.dataset_name}
            
            KPWR-NER is a part of the Polish Corpus of Wrocław University of Technology (Korpus Języka
            Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained
            categories of entities. It is the ‘n82’ version of the KPWr, meaning that the number of classes is
            restricted to 82 (originally 120). During corpus creation, texts from various sources, covering many
            domains and genres, were annotated by humans.
            """,
            "Tasks (input, output and metrics)",
            """
            Named entity recognition (NER) - tagging entities in text with their corresponding type.
            
            Input ('tokens' column): sequence of tokens
            
            Output ('ner' column): sequence of token classes in BIO notation (82 possible classes, described
            in detail in the annotation guidelines)
            
            example:
            
            [‘Roboty’, ‘mają’, ‘kilkanaście’, ‘lat’, ‘i’, ‘pochodzą’, ‘z’, ‘USA’, ‘,’, ‘Wysokie’, ‘napięcie’, ‘jest’, 
            ‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, 
            ‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, 
            ‘B-nam_loc_gpe_country’, ‘O’]
            """,
        ]
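
        # For reference, the integer ids stored in the raw "ner" column can be
        # mapped back to the BIO tags described above via the `datasets` feature
        # metadata, e.g. (illustrative sketch; load_data below does the same):
        #
        #     ds = load_dataset("clarin-pl/kpwr-ner", split="train")
        #     tag_names = ds.features["ner"].feature.names
        #     tags = [tag_names[i] for i in ds[0]["ner"]]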

    def load_data(self):
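        # Fetch the dataset from the Hugging Face Hub and build two views of it:
        # `data_dict` keeps the raw columns (integer label ids), while
        # `data_dict_named` maps every id in the "ner" column to its string tag.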
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }
        self.data_dict_named = {}
        for subset in self.subsets:
            references = raw_dataset[subset]["ner"]
            references_named = [
                [
                    raw_dataset[subset].features["ner"].feature.names[label]
                    for label in labels
                ]
                for labels in references
            ]
            self.data_dict_named[subset] = pd.DataFrame(
                {
                    "tokens": self.data_dict[subset]["tokens"],
                    "ner": references_named,
                }
            )

    def show_dataset(self):
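        # Lay out the Streamlit page: title, dataset description, a preview of the
        # selected subset, per-class label distribution, the most common tokens per
        # class, and an optional t-SNE projection of token embeddings.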
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
        class_distribution = st.container()
        most_common_tokens = st.container()
        tsne_projection = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description[0])
            st.subheader(self.description[1])
            st.write(self.description[2])

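        # Flatten the token and named-tag columns across all subsets into two
        # parallel lists; they feed the "most common tokens" table further below.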
        full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
        tokens_all = full_dataframe["tokens"].tolist()
        tokens_all = [x for subarray in tokens_all for x in subarray]
        labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[
            "ner"
        ].tolist()
        labels_all = [x for subarray in labels_all for x in subarray]

        with dataframe_head:
            st.header("First 10 observations of the chosen subset")
            selected_subset = st.selectbox(
                label="Select subset to see", options=self.subsets
            )
            df_to_show = self.data_dict[selected_subset].head(10)
            st.dataframe(df_to_show)
            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

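        # Per-subset distribution over entity classes: the "O" tag and "I-*"
        # continuation tags are dropped, so each entity is counted once by its "B-" tag.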
        class_distribution_dict = {}
        for subset in self.subsets:
            all_labels_from_subset = self.data_dict_named[subset]["ner"].tolist()
            all_labels_from_subset = [
                x
                for subarray in all_labels_from_subset
                for x in subarray
                if x != "O" and not x.startswith("I-")
            ]
            all_labels_from_subset = pd.Series(all_labels_from_subset)
            class_distribution_dict[subset] = (
                all_labels_from_subset.value_counts(normalize=True)
                .sort_index()
                .reset_index()
                .rename({"index": "class", 0: subset}, axis="columns")
            )

        class_distribution_df = pd.merge(
            class_distribution_dict["train"],
            class_distribution_dict["test"],
            on="class",
        )
        with class_distribution:
            st.header("Class distribution in each subset (without 'O' and 'I-*')")
            st.dataframe(class_distribution_df)
            st.text_area(
                label="LaTeX code", value=class_distribution_df.style.to_latex()
            )

            # Most common tokens from selected class (without 'O' and 'I-*')
            full_df_unzipped = pd.DataFrame(
                {
                    "token": tokens_all,
                    "ner": labels_all,
                }
            )
            full_df_unzipped = full_df_unzipped.loc[
                (full_df_unzipped["ner"] != "O")
                & ~(full_df_unzipped["ner"].str.startswith("I-"))
            ]
            possible_options = sorted(full_df_unzipped["ner"].unique())
            with most_common_tokens:
                st.header(
                    "10 most common tokens from selected class (without 'O' and 'I-*')"
                )
                selected_class = st.selectbox(
                    label="Select class to show", options=possible_options
                )
                df_to_show = (
                    full_df_unzipped.loc[full_df_unzipped["ner"] == selected_class]
                    .groupby(["token"])
                    .count()
                    .reset_index()
                    .rename({"ner": "no_of_occurrences"}, axis=1)
                    .sort_values(by="no_of_occurrences", ascending=False)
                    .reset_index(drop=True)
                    .head(10)
                )
                st.dataframe(df_to_show)
                st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
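
            # The t-SNE projection of token embeddings below is gated behind a
            # hard-coded flag and currently disabled; set SHOW_TSNE_PROJECTION to
            # True to render it.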
            SHOW_TSNE_PROJECTION = False
            if SHOW_TSNE_PROJECTION:
                with tsne_projection:
                    st.header("t-SNE projection of the dataset")
                    subset_to_project = st.selectbox(
                        label="Select subset to project", options=self.subsets
                    )
                    tokens_unzipped = self.data_dict_named[subset_to_project]["tokens"].tolist()
                    tokens_unzipped = np.array([x for subarray in tokens_unzipped for x in subarray])
                    labels_unzipped = self.data_dict_named[subset_to_project]["ner"].tolist()
                    labels_unzipped = np.array([x for subarray in labels_unzipped for x in subarray])
                    df_unzipped = pd.DataFrame(
                        {
                            "tokens": tokens_unzipped,
                            "ner": labels_unzipped,
                        }
                    )
                    df_unzipped = df_unzipped.loc[
                        (df_unzipped["ner"] != "O")
                        & ~(df_unzipped["ner"].str.startswith("I-"))
                    ]
                    tokens_unzipped = df_unzipped["tokens"].values
                    labels_unzipped = df_unzipped["ner"].values
                    mapping_dict = {name: number for number, name in enumerate(set(labels_unzipped))}
                    labels_as_ints = [mapping_dict[label] for label in labels_unzipped]
                    embedded_tokens = np.array(
                        [embed_sentence(x) for x in tokens_unzipped]
                    )
                    reducer = TSNE(
                        n_components=2
                    )
                    transformed_embeddings = reducer.fit_transform(embedded_tokens)
                    fig, ax = plt.subplots()
                    ax.scatter(
                        x=transformed_embeddings[:, 0],
                        y=transformed_embeddings[:, 1],
                        c=[
                            PLOT_COLOR_PALETTE[i] for i in labels_as_ints
                        ]
                    )
                    st.pyplot(fig)
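

# A minimal usage sketch (an assumption about how the surrounding app drives
# DatasetToShow subclasses; the actual entry point, e.g. app.py, may differ):
#
#     from clarin_datasets.kpwr_ner import KpwrNerDataset  # hypothetical module path
#
#     dataset = KpwrNerDataset()
#     dataset.load_data()      # downloads clarin-pl/kpwr-ner and builds the dataframes
#     dataset.show_dataset()   # renders the Streamlit page
#
# Run inside a Streamlit app, e.g.: streamlit run app.py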