Mariusz Kossakowski committed on
Commit
d572e8e
1 Parent(s): 41b0597

Major refactor

app.py CHANGED
@@ -1,244 +1,24 @@
- import re
- from typing import Dict, List
-
- from datasets import load_dataset
- import pandas as pd
- import plotly.figure_factory as ff
- import plotly.graph_objects as go
  import streamlit as st
- from unidecode import unidecode
-
- DATA_SPLITS = ["train", "validation", "test"]
-
-
- def load_data() -> Dict[str, pd.DataFrame]:
-     return {
-         data: pd.read_csv(f"data/{data}.csv").rename(
-             {"label": "target"}, axis="columns"
-         )
-         for data in DATA_SPLITS
-     }
-
-
- def flatten_list(main_list: List[List]) -> List:
-     return [item for sublist in main_list for item in sublist]
-
-
- def count_num_of_characters(text: str) -> int:
-     return len(re.sub(r"[^a-zA-Z]", "", unidecode(text)))
-
-
- def count_num_of_words(text: str) -> int:
-     return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
+
+ from clarin_datasets.polemo_dataset import PolemoDataset
+ from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
+ from clarin_datasets.aspectemo_dataset import AspectEmoDataset

  selected_dataset = st.sidebar.selectbox(
      "Choose a dataset to load",
- ("clarin-pl/polemo2-official", "laugustyniak/abusive-clauses-pl"),
+ (
+     "clarin-pl/polemo2-official",
+     "laugustyniak/abusive-clauses-pl",
+     "clarin-pl/aspectemo",
+ ),
  )

- def load_hf_dataset():
-     if selected_dataset == "clarin-pl/polemo2-official":
-         data = load_dataset("clarin-pl/polemo2-official")
-         DATA_DICT = {
-             "train": data["train"].to_pandas(),
-             "validation": data["validation"].to_pandas(),
-             "test": data["test"].to_pandas(),
-         }
-         DATA_DESCRIPTION = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
-         hotels, products, and university. It is human-annotated on a level of full reviews and individual
-         sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
-         sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
-         046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
-         annotated with four labels: positive, negative, neutral, or ambiguous. """
-     elif selected_dataset == "laugustyniak/abusive-clauses-pl":
-         DATA_DICT = load_data()
-         DATA_DESCRIPTION = """
-         ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
-         Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
-         But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
-         we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
-         contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
-         or many more. In all these situations, you will need to conclude the contract, but there is a high probability
-         that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
-         businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
-         requiring consumers to accept.
-
-         Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
-         clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
-         situation of imbalance between the duties and rights of the parties.
-
-         On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
-         we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
-         learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
-         agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
-         analyze contracts and understand what they agree upon.
-         """
-     return DATA_DICT, DATA_DESCRIPTION
-
-
- DATA_DICT, DATA_DESCRIPTION = load_hf_dataset()
-
- header = st.container()
- description = st.container()
- dataframe_head = st.container()
- word_searching = st.container()
- dataset_statistics = st.container()
-
- with header:
-     st.title(selected_dataset)
-
- with description:
-     st.header("Dataset description")
-     st.write(DATA_DESCRIPTION)
-
- with dataframe_head:
-     filtering_options = DATA_DICT["train"]["target"].unique().tolist()
-     filtering_options.append("All classes")
-
-     st.header("First 10 observations of a chosen class")
-     class_to_show = st.selectbox(
-         label="Select class to show", options=filtering_options
-     )
-     df_to_show = pd.concat(
-         [
-             DATA_DICT["train"].copy(),
-             DATA_DICT["validation"].copy(),
-             DATA_DICT["test"].copy(),
-         ]
-     )
-     if class_to_show == "All classes":
-         df_to_show = df_to_show.head(10)
-     else:
-         df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(10)
-     st.dataframe(df_to_show)
-     st.text_area(label="Latex code", value=df_to_show.style.to_latex())
-
-     if selected_dataset == "clarin-pl/polemo2-official":
-         st.subheader("First 10 observations of a chosen domain and text type")
-         domain = st.selectbox(
-             label="Select domain",
-             options=["all", "hotels", "medicine", "products", "reviews"],
-         )
-         text_type = st.selectbox(
-             label="Select text type", options=["Full text", "Tokenized to sentences"]
-         )
-         text_type_mapping_dict = {
-             "Full text": "text",
-             "Tokenized to sentences": "sentence",
-         }
-
-         polemo_subset = load_dataset(
-             selected_dataset, f"{domain}_{text_type_mapping_dict[text_type]}"
-         )
-         df = pd.concat(
-             [
-                 polemo_subset["train"].to_pandas(),
-                 polemo_subset["validation"].to_pandas(),
-                 polemo_subset["test"].to_pandas(),
-             ]
-         ).head(10)
-         st.dataframe(df)
-         st.text_area(label="Latex code", value=df.style.to_latex())
-
- with word_searching:
-     st.header("Observations containing a chosen word")
-     searched_word = st.text_input(label="Enter the word you are looking for below")
-     df_to_show = pd.concat(
-         [
-             DATA_DICT["train"].copy(),
-             DATA_DICT["validation"].copy(),
-             DATA_DICT["test"].copy(),
-         ]
-     )
-     df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
-     st.dataframe(df_to_show)
-     st.text_area(label="Latex code", value=df_to_show.style.to_latex())
-
- with dataset_statistics:
-     st.header("Dataset statistics")
-     st.subheader("Number of samples in each data split")
-     metrics_df = pd.DataFrame.from_dict(
-         {
-             "Train": DATA_DICT["train"].shape[0],
-             "Validation": DATA_DICT["validation"].shape[0],
-             "Test": DATA_DICT["test"].shape[0],
-             "Total": sum(
-                 [
-                     DATA_DICT["train"].shape[0],
-                     DATA_DICT["validation"].shape[0],
-                     DATA_DICT["test"].shape[0],
-                 ]
-             ),
-         },
-         orient="index",
-     ).reset_index()
-     metrics_df.columns = ["Subset", "Number of samples"]
-     st.dataframe(metrics_df)
-
-     latex_df = metrics_df.style.to_latex()
-     st.text_area(label="Latex code", value=latex_df)
-
-     # Class distribution in each subset
-     st.subheader("Class distribution in each subset")
-     target_unique_values = DATA_DICT["train"]["target"].unique()
-     hist = (
-         pd.DataFrame(
-             [
-                 df["target"].value_counts(normalize=True).rename(k)
-                 for k, df in DATA_DICT.items()
-             ]
-         )
-         .reset_index()
-         .rename({"index": "split_name"}, axis=1)
-     )
-     plot_data = [
-         go.Bar(
-             name=str(target_unique_values[i]),
-             x=DATA_SPLITS,
-             y=hist[target_unique_values[i]].values,
-         )
-         for i in range(len(target_unique_values))
-     ]
-     barchart_class_dist = go.Figure(data=plot_data)
-     barchart_class_dist.update_layout(
-         barmode="group",
-         title_text="Barchart - class distribution",
-         xaxis_title="Split name",
-         yaxis_title="Number of data points",
-     )
-     st.plotly_chart(barchart_class_dist, use_container_width=True)
-     st.dataframe(hist)
-     st.text_area(label="Latex code", value=hist.style.to_latex())
-
-     # Number of words per observation
-     st.subheader("Number of words per observation in each subset")
-     hist_data_num_words = [
-         df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
-     ]
-     fig_num_words = ff.create_distplot(
-         hist_data_num_words, DATA_SPLITS, show_rug=False, bin_size=1
-     )
-     fig_num_words.update_traces(
-         nbinsx=100, autobinx=True, selector={"type": "histogram"}
-     )
-     fig_num_words.update_layout(
-         title_text="Histogram - number of characters per observation",
-         xaxis_title="Number of characters",
-     )
-     st.plotly_chart(fig_num_words, use_container_width=True)
-
-     # Number of characters per observation
-     st.subheader("Number of characters per observation in each subset")
-     hist_data_num_characters = [
-         df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
-     ]
-     fig_num_chars = ff.create_distplot(
-         hist_data_num_characters, DATA_SPLITS, show_rug=False, bin_size=1
-     )
-     fig_num_chars.update_layout(
-         title_text="Histogram - number of characters per observation",
-         xaxis_title="Number of characters",
-     )
-     st.plotly_chart(fig_num_chars, use_container_width=True)
+ if selected_dataset == "clarin-pl/polemo2-official":
+     dataset = PolemoDataset()
+ elif selected_dataset == "laugustyniak/abusive-clauses-pl":
+     dataset = AbusiveClausesDataset()
+ elif selected_dataset == "clarin-pl/aspectemo":
+     dataset = AspectEmoDataset()
+
+ dataset.load_data()
+ dataset.show_dataset()
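Note: the refactored app.py delegates all loading and rendering to the selected dataset object. As a sketch of an equivalent design (an assumption, not part of this commit), the if/elif dispatch could be a name-to-class registry:

# Hypothetical alternative to the if/elif chain in app.py.
DATASETS = {
    "clarin-pl/polemo2-official": PolemoDataset,
    "laugustyniak/abusive-clauses-pl": AbusiveClausesDataset,
    "clarin-pl/aspectemo": AspectEmoDataset,
}
dataset = DATASETS[selected_dataset]()  # instantiate the chosen page
dataset.load_data()
dataset.show_dataset()
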
clarin_datasets/abusive_clauses_dataset.py ADDED
@@ -0,0 +1,188 @@
+ import pandas as pd
+ import plotly.figure_factory as ff
+ import plotly.graph_objects as go
+ import streamlit as st
+
+ from clarin_datasets.dataset_to_show import DatasetToShow
+ from clarin_datasets.utils import (
+     count_num_of_characters,
+     count_num_of_words,
+ )
+
+
+ class AbusiveClausesDataset(DatasetToShow):
+     def __init__(self):
+         self.dataset_name = "laugustyniak/abusive-clauses-pl"
+         self.data_dict = None
+         self.subsets = ["train", "validation", "test"]
+         self.description = """
+         ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
+         Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
+         But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
+         we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
+         contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
+         or many more. In all these situations, you will need to conclude the contract, but there is a high probability
+         that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
+         businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
+         requiring consumers to accept.
+
+         Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
+         clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
+         situation of imbalance between the duties and rights of the parties.
+
+         On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
+         we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
+         learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
+         agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
+         analyze contracts and understand what they agree upon.
+         """
+
+     def load_data(self):
+         self.data_dict = {
+             subset: pd.read_csv(f"data/{subset}.csv").rename(
+                 {"label": "target"}, axis="columns"
+             )
+             for subset in self.subsets
+         }
+
+     def show_dataset(self):
+         header = st.container()
+         description = st.container()
+         dataframe_head = st.container()
+         word_searching = st.container()
+         dataset_statistics = st.container()
+
+         with header:
+             st.title(self.dataset_name)
+
+         with description:
+             st.header("Dataset description")
+             st.write(self.description)
+
+         with dataframe_head:
+             filtering_options = self.data_dict["train"]["target"].unique().tolist()
+             filtering_options.append("All classes")
+
+             st.header("First 10 observations of a chosen class")
+             class_to_show = st.selectbox(
+                 label="Select class to show", options=filtering_options
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             if class_to_show == "All classes":
+                 df_to_show = df_to_show.head(10)
+             else:
+                 df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(
+                     10
+                 )
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+         with word_searching:
+             st.header("Observations containing a chosen word")
+             searched_word = st.text_input(
+                 label="Enter the word you are looking for below"
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+         with dataset_statistics:
+             st.header("Dataset statistics")
+             st.subheader("Number of samples in each data split")
+             metrics_df = pd.DataFrame.from_dict(
+                 {
+                     "Train": self.data_dict["train"].shape[0],
+                     "Validation": self.data_dict["validation"].shape[0],
+                     "Test": self.data_dict["test"].shape[0],
+                     "Total": sum(
+                         [
+                             self.data_dict["train"].shape[0],
+                             self.data_dict["validation"].shape[0],
+                             self.data_dict["test"].shape[0],
+                         ]
+                     ),
+                 },
+                 orient="index",
+             ).reset_index()
+             metrics_df.columns = ["Subset", "Number of samples"]
+             st.dataframe(metrics_df)
+
+             latex_df = metrics_df.style.to_latex()
+             st.text_area(label="Latex code", value=latex_df)
+
+             # Class distribution in each subset
+             st.subheader("Class distribution in each subset")
+             target_unique_values = self.data_dict["train"]["target"].unique()
+             hist = (
+                 pd.DataFrame(
+                     [
+                         df["target"].value_counts(normalize=True).rename(k)
+                         for k, df in self.data_dict.items()
+                     ]
+                 )
+                 .reset_index()
+                 .rename({"index": "split_name"}, axis=1)
+             )
+             plot_data = [
+                 go.Bar(
+                     name=str(target_unique_values[i]),
+                     x=self.subsets,
+                     y=hist[target_unique_values[i]].values,
+                 )
+                 for i in range(len(target_unique_values))
+             ]
+             barchart_class_dist = go.Figure(data=plot_data)
+             barchart_class_dist.update_layout(
+                 barmode="group",
+                 title_text="Barchart - class distribution",
+                 xaxis_title="Split name",
+                 yaxis_title="Number of data points",
+             )
+             st.plotly_chart(barchart_class_dist, use_container_width=True)
+             st.dataframe(hist)
+             st.text_area(label="Latex code", value=hist.style.to_latex())
+
+             # Number of words per observation
+             st.subheader("Number of words per observation in each subset")
+             hist_data_num_words = [
+                 df["text"].apply(count_num_of_words) for df in self.data_dict.values()
+             ]
+             fig_num_words = ff.create_distplot(
+                 hist_data_num_words, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_words.update_traces(
+                 nbinsx=100, autobinx=True, selector={"type": "histogram"}
+             )
+             fig_num_words.update_layout(
+                 title_text="Histogram - number of words per observation",
+                 xaxis_title="Number of words",
+             )
+             st.plotly_chart(fig_num_words, use_container_width=True)
+
+             # Number of characters per observation
+             st.subheader("Number of characters per observation in each subset")
+             hist_data_num_characters = [
+                 df["text"].apply(count_num_of_characters)
+                 for df in self.data_dict.values()
+             ]
+             fig_num_chars = ff.create_distplot(
+                 hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_chars.update_layout(
+                 title_text="Histogram - number of characters per observation",
+                 xaxis_title="Number of characters",
+             )
+             st.plotly_chart(fig_num_chars, use_container_width=True)
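Note: load_data above expects local files data/train.csv, data/validation.csv and data/test.csv, each with at least a "text" and a "label" column ("label" is renamed to "target"). A minimal fixture for running the page locally (illustrative values, not shipped with this commit):

import pandas as pd

# Two toy rows per split; any class ids work, the app only groups by them.
for split in ["train", "validation", "test"]:
    pd.DataFrame(
        {"text": ["Klauzula pierwsza.", "Klauzula druga."], "label": [1, 0]}
    ).to_csv(f"data/{split}.csv", index=False)
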
clarin_datasets/dataset_to_show.py ADDED
@@ -0,0 +1,18 @@
+ from abc import ABC, abstractmethod
+
+
+ class DatasetToShow(ABC):
+     @abstractmethod
+     def __init__(self):
+         self.dataset_name = None
+         self.data_dict = None
+         self.subsets = None
+         self.description = None
+
+     @abstractmethod
+     def load_data(self):
+         pass
+
+     @abstractmethod
+     def show_dataset(self):
+         pass
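Note: each dataset page implements this three-method interface. A minimal sketch of a new page, using a hypothetical ExampleDataset (all names illustrative, not part of this commit):

import pandas as pd
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow


class ExampleDataset(DatasetToShow):
    def __init__(self):
        self.dataset_name = "org/example-dataset"  # hypothetical dataset id
        self.data_dict = None
        self.subsets = ["train", "test"]
        self.description = "One-paragraph description rendered by the app."

    def load_data(self):
        # Key each split's DataFrame by its split name.
        self.data_dict = {s: pd.read_csv(f"data/{s}.csv") for s in self.subsets}

    def show_dataset(self):
        st.title(self.dataset_name)
        st.write(self.description)

Hooking it up takes two edits in app.py: add the dataset name to the selectbox tuple and add a matching elif branch.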
clarin_datasets/polemo_dataset.py ADDED
@@ -0,0 +1,200 @@
+ from datasets import load_dataset
+ import pandas as pd
+ import plotly.figure_factory as ff
+ import plotly.graph_objects as go
+ import streamlit as st
+
+ from clarin_datasets.dataset_to_show import DatasetToShow
+ from clarin_datasets.utils import (
+     count_num_of_characters,
+     count_num_of_words,
+ )
+
+
+ class PolemoDataset(DatasetToShow):
+     def __init__(self):
+         self.dataset_name = "clarin-pl/polemo2-official"
+         self.data_dict = None
+         self.subsets = ["train", "validation", "test"]
+         self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
+         hotels, products, and university. It is human-annotated on a level of full reviews and individual
+         sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
+         sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
+         046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
+         annotated with four labels: positive, negative, neutral, or ambiguous. """
+
+     def load_data(self):
+         raw_dataset = load_dataset(self.dataset_name)
+         self.data_dict = {
+             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
+         }
+
+     def show_dataset(self):
+         header = st.container()
+         description = st.container()
+         dataframe_head = st.container()
+         word_searching = st.container()
+         dataset_statistics = st.container()
+
+         with header:
+             st.title(self.dataset_name)
+
+         with description:
+             st.header("Dataset description")
+             st.write(self.description)
+
+         with dataframe_head:
+             filtering_options = self.data_dict["train"]["target"].unique().tolist()
+             filtering_options.append("All classes")
+
+             st.header("First 10 observations of a chosen class")
+             class_to_show = st.selectbox(
+                 label="Select class to show", options=filtering_options
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             if class_to_show == "All classes":
+                 df_to_show = df_to_show.head(10)
+             else:
+                 df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(
+                     10
+                 )
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+             st.subheader("First 10 observations of a chosen domain and text type")
+             domain = st.selectbox(
+                 label="Select domain",
+                 options=["all", "hotels", "medicine", "products", "reviews"],
+             )
+             text_type = st.selectbox(
+                 label="Select text type",
+                 options=["Full text", "Tokenized to sentences"],
+             )
+             text_type_mapping_dict = {
+                 "Full text": "text",
+                 "Tokenized to sentences": "sentence",
+             }
+
+             polemo_subset = load_dataset(
+                 self.dataset_name,
+                 f"{domain}_{text_type_mapping_dict[text_type]}",
+             )
+             df = pd.concat(
+                 [
+                     polemo_subset["train"].to_pandas(),
+                     polemo_subset["validation"].to_pandas(),
+                     polemo_subset["test"].to_pandas(),
+                 ]
+             ).head(10)
+             st.dataframe(df)
+             st.text_area(label="Latex code", value=df.style.to_latex())
+
+         with word_searching:
+             st.header("Observations containing a chosen word")
+             searched_word = st.text_input(
+                 label="Enter the word you are looking for below"
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+         with dataset_statistics:
+             st.header("Dataset statistics")
+             st.subheader("Number of samples in each data split")
+             metrics_df = pd.DataFrame.from_dict(
+                 {
+                     "Train": self.data_dict["train"].shape[0],
+                     "Validation": self.data_dict["validation"].shape[0],
+                     "Test": self.data_dict["test"].shape[0],
+                     "Total": sum(
+                         [
+                             self.data_dict["train"].shape[0],
+                             self.data_dict["validation"].shape[0],
+                             self.data_dict["test"].shape[0],
+                         ]
+                     ),
+                 },
+                 orient="index",
+             ).reset_index()
+             metrics_df.columns = ["Subset", "Number of samples"]
+             st.dataframe(metrics_df)
+
+             latex_df = metrics_df.style.to_latex()
+             st.text_area(label="Latex code", value=latex_df)
+
+             # Class distribution in each subset
+             st.subheader("Class distribution in each subset")
+             target_unique_values = self.data_dict["train"]["target"].unique()
+             hist = (
+                 pd.DataFrame(
+                     [
+                         df["target"].value_counts(normalize=True).rename(k)
+                         for k, df in self.data_dict.items()
+                     ]
+                 )
+                 .reset_index()
+                 .rename({"index": "split_name"}, axis=1)
+             )
+             plot_data = [
+                 go.Bar(
+                     name=str(target_unique_values[i]),
+                     x=self.subsets,
+                     y=hist[target_unique_values[i]].values,
+                 )
+                 for i in range(len(target_unique_values))
+             ]
+             barchart_class_dist = go.Figure(data=plot_data)
+             barchart_class_dist.update_layout(
+                 barmode="group",
+                 title_text="Barchart - class distribution",
+                 xaxis_title="Split name",
+                 yaxis_title="Number of data points",
+             )
+             st.plotly_chart(barchart_class_dist, use_container_width=True)
+             st.dataframe(hist)
+             st.text_area(label="Latex code", value=hist.style.to_latex())
+
+             # Number of words per observation
+             st.subheader("Number of words per observation in each subset")
+             hist_data_num_words = [
+                 df["text"].apply(count_num_of_words) for df in self.data_dict.values()
+             ]
+             fig_num_words = ff.create_distplot(
+                 hist_data_num_words, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_words.update_traces(
+                 nbinsx=100, autobinx=True, selector={"type": "histogram"}
+             )
+             fig_num_words.update_layout(
+                 title_text="Histogram - number of words per observation",
+                 xaxis_title="Number of words",
+             )
+             st.plotly_chart(fig_num_words, use_container_width=True)
+
+             # Number of characters per observation
+             st.subheader("Number of characters per observation in each subset")
+             hist_data_num_characters = [
+                 df["text"].apply(count_num_of_characters)
+                 for df in self.data_dict.values()
+             ]
+             fig_num_chars = ff.create_distplot(
+                 hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_chars.update_layout(
+                 title_text="Histogram - number of characters per observation",
+                 xaxis_title="Number of characters",
+             )
+             st.plotly_chart(fig_num_chars, use_container_width=True)
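Note: the domain and text-type widgets compose a PolEmo configuration name of the form {domain}_{text|sentence}, so the subset fetch is equivalent to, for example:

from datasets import load_dataset

# "hotels" domain, sentence-level texts: one of the configs the widgets can build
polemo_subset = load_dataset("clarin-pl/polemo2-official", "hotels_sentence")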
clarin_datasets/utils.py ADDED
@@ -0,0 +1,16 @@
+ import re
+ from typing import List
+
+ from unidecode import unidecode
+
+
+ def flatten_list(main_list: List[List]) -> List:
+     return [item for sublist in main_list for item in sublist]
+
+
+ def count_num_of_characters(text: str) -> int:
+     return len(re.sub(r"[^a-zA-Z]", "", unidecode(text)))
+
+
+ def count_num_of_words(text: str) -> int:
+     return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
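Note: both counters transliterate with unidecode before stripping everything that is not an ASCII letter (or a space, for words), so Polish diacritics are counted rather than dropped. Expected behavior, worked out by hand:

from clarin_datasets.utils import count_num_of_characters, count_num_of_words

# "Zażółć gęślą jaźń" transliterates to "Zazolc gesla jazn"
count_num_of_characters("Zażółć gęślą jaźń")  # 15 letters
count_num_of_words("Zażółć gęślą jaźń")  # 3 words

One caveat: count_num_of_words splits on single spaces, so consecutive, leading, or trailing spaces add empty strings to the count.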