Mariusz Kossakowski committed
Commit c76a6b2 · 1 parent: d405df4

Add named labels, class distribution, and most common tokens to the KPWR-NER dataset view

clarin_datasets/kpwr_ner_datasets.py CHANGED
@@ -1,3 +1,4 @@
+import pandas as pd
from datasets import load_dataset
import streamlit as st

@@ -7,6 +8,7 @@ from clarin_datasets.dataset_to_show import DatasetToShow
class KpwrNerDataset(DatasetToShow):
    def __init__(self):
        DatasetToShow.__init__(self)
+        self.data_dict_named = None
        self.dataset_name = "clarin-pl/kpwr-ner"
        self.description = """
        KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
@@ -36,11 +38,29 @@ class KpwrNerDataset(DatasetToShow):
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }
+        self.data_dict_named = {}
+        for subset in self.subsets:
+            references = raw_dataset[subset]["ner"]
+            references_named = [
+                [
+                    raw_dataset[subset].features["ner"].feature.names[label]
+                    for label in labels
+                ]
+                for labels in references
+            ]
+            self.data_dict_named[subset] = pd.DataFrame(
+                {
+                    "tokens": self.data_dict[subset]["tokens"],
+                    "ner": references_named,
+                }
+            )

    def show_dataset(self):
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
+        class_distribution = st.container()
+        most_common_tokens = st.container()

        with header:
            st.title(self.dataset_name)
@@ -48,3 +68,71 @@ class KpwrNerDataset(DatasetToShow):
        with description:
            st.header("Dataset description")
            st.write(self.description)
+
+        full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
+        tokens_all = full_dataframe["tokens"].tolist()
+        tokens_all = [x for subarray in tokens_all for x in subarray]
+        labels_all = pd.concat(self.data_dict_named.values(), axis="rows")["ner"].tolist()
+        labels_all = [x for subarray in labels_all for x in subarray]
+
+        with dataframe_head:
+            df_to_show = full_dataframe.head(10)
+            st.header("First 10 observations of the dataset")
+            st.dataframe(df_to_show)
+            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
+
+        class_distribution_dict = {}
+        for subset in self.subsets:
+            all_labels_from_subset = self.data_dict_named[subset]["ner"].tolist()
+            all_labels_from_subset = [
+                x
+                for subarray in all_labels_from_subset
+                for x in subarray
+                if x != "O" and not x.startswith("I-")
+            ]
+            all_labels_from_subset = pd.Series(all_labels_from_subset)
+            class_distribution_dict[subset] = (
+                all_labels_from_subset.value_counts(normalize=True)
+                .sort_index()
+                .reset_index()
+                .rename({"index": "class", 0: subset}, axis="columns")
+            )
+
+        class_distribution_df = pd.merge(
+            class_distribution_dict["train"],
+            class_distribution_dict["test"],
+            on="class",
+        )
+        with class_distribution:
+            st.header("Class distribution in each subset (without 'O' and 'I-*')")
+            st.dataframe(class_distribution_df)
+            st.text_area(
+                label="LaTeX code", value=class_distribution_df.style.to_latex()
+            )
+
+        # Most common tokens from selected class (without 'O')
+        full_df_unzipped = pd.DataFrame(
+            {
+                "token": tokens_all,
+                "ner": labels_all,
+            }
+        )
+        full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["ner"] != "O"]
+        possible_options = sorted(full_df_unzipped["ner"].unique())
+        with most_common_tokens:
+            st.header("10 most common tokens from selected class (without 'O')")
+            selected_class = st.selectbox(
+                label="Select class to show", options=possible_options
+            )
+            df_to_show = (
+                full_df_unzipped.loc[full_df_unzipped["ner"] == selected_class]
+                .groupby(["token"])
+                .count()
+                .reset_index()
+                .rename({"ner": "no_of_occurrences"}, axis=1)
+                .sort_values(by="no_of_occurrences", ascending=False)
+                .reset_index(drop=True)
+                .head(10)
+            )
+            st.dataframe(df_to_show)
+            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
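
For context, a minimal sketch (not part of the commit) of the label-name lookup that the new data_dict_named construction relies on: the "ner" column of clarin-pl/kpwr-ner stores integer class IDs, and the Sequence(ClassLabel) feature's names list maps each ID back to its string tag. The choice of the "train" split and the printed slice below are only illustrative assumptions.

from datasets import load_dataset

# Illustrative only: map the integer NER tags of one example back to string
# labels via the ClassLabel feature, as the commit does for whole subsets.
raw_dataset = load_dataset("clarin-pl/kpwr-ner")
label_names = raw_dataset["train"].features["ner"].feature.names

example = raw_dataset["train"][0]
named_tags = [label_names[tag] for tag in example["ner"]]
print(list(zip(example["tokens"], named_tags))[:5])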