Mariusz Kossakowski committed on
Commit
acb3b1d
·
1 Parent(s): 1d713ee

Generalize dashboard for multiple datasets

Browse files
Files changed (2) hide show
  1. app.py +96 -84
  2. data/{dev.csv → validation.csv} +0 -0
app.py CHANGED
@@ -1,22 +1,26 @@
1
  import re
2
- from typing import Dict, List
3
 
 
4
  import pandas as pd
5
  import plotly.figure_factory as ff
6
  import plotly.graph_objects as go
 
7
  import streamlit as st
8
  from unidecode import unidecode
9
 
10
- st.set_page_config(layout="wide")
11
 
12
- DATA_SPLITS = ["train", "dev", "test"]
13
 
14
-
15
- def load_data() -> Dict[str, pd.DataFrame]:
16
- return {data: pd.read_csv(f"data/{data}.csv") for data in DATA_SPLITS}
 
 
 
 
17
 
18
 
19
- def flatten_list(main_list: List[list]) -> list:
20
  return [item for sublist in main_list for item in sublist]
21
 
22
 
@@ -28,54 +32,78 @@ def count_num_of_words(text: str) -> int:
28
  return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
29
 
30
 
31
- DATA_DICT = load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
  header = st.container()
34
  description = st.container()
35
  dataset_statistics = st.container()
36
- class_distribution = st.container()
37
 
38
  with header:
39
- st.title("PAC - Polish Abusive Clauses Dataset")
40
 
41
  with description:
42
  st.header("Dataset description")
43
- desc = """
44
- ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
45
- Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
46
- But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
47
- we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
48
- contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
49
- or many more. In all these situations, you will need to conclude the contract, but there is a high probability
50
- that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
51
- businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
52
- requiring consumers to accept.
53
-
54
- Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
55
- clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
56
- situation of imbalance between the duties and rights of the parties.
57
-
58
- On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
59
- we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
60
- learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
61
- agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
62
- analyze contracts and understand what they agree upon.
63
- """
64
- st.write(desc)
65
-
66
- st.header("Dataset statistics")
67
 
68
  with dataset_statistics:
 
69
  st.subheader("Number of samples in each data split")
70
  metrics_df = pd.DataFrame.from_dict(
71
  {
72
  "Train": DATA_DICT["train"].shape[0],
73
- "Dev": DATA_DICT["dev"].shape[0],
74
  "Test": DATA_DICT["test"].shape[0],
75
  "Total": sum(
76
  [
77
  DATA_DICT["train"].shape[0],
78
- DATA_DICT["dev"].shape[0],
79
  DATA_DICT["test"].shape[0],
80
  ]
81
  ),
@@ -84,61 +112,44 @@ with dataset_statistics:
84
  ).reset_index()
85
  metrics_df.columns = ["Subset", "Number of samples"]
86
  st.dataframe(metrics_df)
87
- latex_df = pd.DataFrame([metrics_df.style.to_latex()])
88
- st.button(
89
- label="Copy table to LaTeX",
90
- on_click=latex_df.to_clipboard(index=False, header=False),
91
- key="copy_metrics_df",
92
- )
93
 
94
- # Class distribution in each subset
95
- with class_distribution:
 
 
96
  st.subheader("Class distribution in each subset")
97
- plot_column, table_column = st.columns(2)
98
- with plot_column:
99
- hist = (
100
- pd.DataFrame(
101
- [
102
- df["label"].value_counts(normalize=True).rename(k)
103
- for k, df in DATA_DICT.items()
104
- ]
105
- )
106
- .reset_index()
107
- .rename({"index": "split_name"}, axis=1)
108
- )
109
- barchart_class_dist = go.Figure(
110
- data=[
111
- go.Bar(
112
- name="BEZPIECZNE_POSTANOWIENIE_UMOWNE",
113
- x=DATA_SPLITS,
114
- y=hist["BEZPIECZNE_POSTANOWIENIE_UMOWNE"].values,
115
- ),
116
- go.Bar(
117
- name="KLAUZULA_ABUZYWNA",
118
- x=DATA_SPLITS,
119
- y=hist["KLAUZULA_ABUZYWNA"].values,
120
- ),
121
  ]
122
  )
123
- barchart_class_dist.update_layout(
124
- barmode="group",
125
- xaxis_title="Split name",
126
- yaxis_title="Number of data points",
127
- )
128
- st.plotly_chart(barchart_class_dist, use_container_width=True)
129
-
130
- with table_column:
131
- for _ in range(10):
132
- st.text("")
133
- st.dataframe(hist)
134
- latex_df_class_dist = pd.DataFrame([hist.style.to_latex()])
135
- st.button(
136
- label="Copy table to LaTeX",
137
- on_click=latex_df_class_dist.to_clipboard(header=False, index=False),
138
- key="copy_class_dist_df",
139
  )
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  # Number of words per observation
 
142
  hist_data_num_words = [
143
  df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
144
  ]
@@ -155,6 +166,7 @@ with class_distribution:
155
  st.plotly_chart(fig_num_words, use_container_width=True)
156
 
157
  # Number of characters per observation
 
158
  hist_data_num_characters = [
159
  df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
160
  ]
 
1
  import re
 
2
 
3
+ from datasets import load_dataset
4
  import pandas as pd
5
  import plotly.figure_factory as ff
6
  import plotly.graph_objects as go
7
+ import pyperclip
8
  import streamlit as st
9
  from unidecode import unidecode
10
 
11
+ DATA_SPLITS = ["train", "validation", "test"]
12
 
 
13
 
14
def load_data() -> dict[str, pd.DataFrame]:
    """Read each split's CSV file from ``data/`` into a DataFrame.

    One file named ``data/<split>.csv`` is read for every entry of
    ``DATA_SPLITS``; its ``label`` column is renamed to ``target``.

    Returns:
        A dict mapping each split name to the DataFrame loaded for it.
    """
    frames = {}
    for split in DATA_SPLITS:
        frame = pd.read_csv(f"data/{split}.csv")
        frames[split] = frame.rename(columns={"label": "target"})
    return frames
21
 
22
 
23
def flatten_list(main_list: list[list]) -> list:
    """Concatenate the items of every sublist into a single flat list.

    Only one level of nesting is removed, and the original order of the
    items is kept.
    """
    flattened = []
    for sublist in main_list:
        flattened.extend(sublist)
    return flattened
25
 
26
 
 
32
  return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
33
 
34
 
35
+ selected_dataset = st.sidebar.selectbox(
36
+ "Choose a dataset to load",
37
+ ("clarin-pl/polemo2-official", "laugustyniak/abusive-clauses-pl"),
38
+ )
39
+
40
+
41
+ def load_hf_dataset():
42
+ match selected_dataset:
43
+ case "clarin-pl/polemo2-official":
44
+ data = load_dataset("clarin-pl/polemo2-official")
45
+ DATA_DICT = {
46
+ "train": data["train"].to_pandas(),
47
+ "validation": data["validation"].to_pandas(),
48
+ "test": data["test"].to_pandas(),
49
+ }
50
+ DATA_DESCRIPTION = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
51
+ hotels, products, and university. It is human-annotated on a level of full reviews and individual
52
+ sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
53
+ sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
54
+ 046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
55
+ annotated with four labels: positive, negative, neutral, or ambiguous. """
56
+ case "laugustyniak/abusive-clauses-pl":
57
+ DATA_DICT = load_data()
58
+ DATA_DESCRIPTION = """
59
+ ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
60
+ Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
61
+ But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
62
+ we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
63
+ contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
64
+ or many more. In all these situations, you will need to conclude the contract, but there is a high probability
65
+ that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
66
+ businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
67
+ requiring consumers to accept.
68
+
69
+ Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
70
+ clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
71
+ situation of imbalance between the duties and rights of the parties.
72
+
73
+ On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
74
+ we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
75
+ learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
76
+ agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
77
+ analyze contracts and understand what they agree upon.
78
+ """
79
+ return DATA_DICT, DATA_DESCRIPTION
80
+
81
+
82
+ DATA_DICT, DATA_DESCRIPTION = load_hf_dataset()
83
 
84
  header = st.container()
85
  description = st.container()
86
  dataset_statistics = st.container()
 
87
 
88
  with header:
89
+ st.title(selected_dataset)
90
 
91
  with description:
92
  st.header("Dataset description")
93
+ st.write(DATA_DESCRIPTION)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  with dataset_statistics:
96
+ st.header("Dataset statistics")
97
  st.subheader("Number of samples in each data split")
98
  metrics_df = pd.DataFrame.from_dict(
99
  {
100
  "Train": DATA_DICT["train"].shape[0],
101
+ "Validation": DATA_DICT["validation"].shape[0],
102
  "Test": DATA_DICT["test"].shape[0],
103
  "Total": sum(
104
  [
105
  DATA_DICT["train"].shape[0],
106
+ DATA_DICT["validation"].shape[0],
107
  DATA_DICT["test"].shape[0],
108
  ]
109
  ),
 
112
  ).reset_index()
113
  metrics_df.columns = ["Subset", "Number of samples"]
114
  st.dataframe(metrics_df)
 
 
 
 
 
 
115
 
116
+ latex_df = metrics_df.style.to_latex()
117
+ st.text_area(label="Latex code", value=latex_df)
118
+
119
+ # Class distribution in each subset
120
  st.subheader("Class distribution in each subset")
121
+ target_unique_values = DATA_DICT["train"]["target"].unique()
122
+ hist = (
123
+ pd.DataFrame(
124
+ [
125
+ df["target"].value_counts(normalize=True).rename(k)
126
+ for k, df in DATA_DICT.items()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  ]
128
  )
129
+ .reset_index()
130
+ .rename({"index": "split_name"}, axis=1)
131
+ )
132
+ plot_data = [
133
+ go.Bar(
134
+ name=str(target_unique_values[i]),
135
+ x=DATA_SPLITS,
136
+ y=hist[target_unique_values[i]].values,
 
 
 
 
 
 
 
 
137
  )
138
+ for i in range(len(target_unique_values))
139
+ ]
140
+ barchart_class_dist = go.Figure(data=plot_data)
141
+ barchart_class_dist.update_layout(
142
+ barmode="group",
143
+ title_text="Barchart - class distribution",
144
+ xaxis_title="Split name",
145
+ yaxis_title="Number of data points",
146
+ )
147
+ st.plotly_chart(barchart_class_dist, use_container_width=True)
148
+ st.dataframe(hist)
149
+ st.text_area(label="Latex code", value=hist.style.to_latex())
150
 
151
  # Number of words per observation
152
+ st.subheader("Number of words per observation in each subset")
153
  hist_data_num_words = [
154
  df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
155
  ]
 
166
  st.plotly_chart(fig_num_words, use_container_width=True)
167
 
168
  # Number of characters per observation
169
+ st.subheader("Number of characters per observation in each subset")
170
  hist_data_num_characters = [
171
  df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
172
  ]
data/{dev.csv → validation.csv} RENAMED
File without changes