Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
·
acb3b1d
1
Parent(s):
1d713ee
Generalize dashboard for multiple datasets
Browse files- app.py +96 -84
- data/{dev.csv → validation.csv} +0 -0
app.py
CHANGED
@@ -1,22 +1,26 @@
|
|
1 |
import re
|
2 |
-
from typing import Dict, List
|
3 |
|
|
|
4 |
import pandas as pd
|
5 |
import plotly.figure_factory as ff
|
6 |
import plotly.graph_objects as go
|
|
|
7 |
import streamlit as st
|
8 |
from unidecode import unidecode
|
9 |
|
10 |
-
|
11 |
|
12 |
-
DATA_SPLITS = ["train", "dev", "test"]
|
13 |
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
-
def flatten_list(main_list:
|
20 |
return [item for sublist in main_list for item in sublist]
|
21 |
|
22 |
|
@@ -28,54 +32,78 @@ def count_num_of_words(text: str) -> int:
|
|
28 |
return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
|
29 |
|
30 |
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
header = st.container()
|
34 |
description = st.container()
|
35 |
dataset_statistics = st.container()
|
36 |
-
class_distribution = st.container()
|
37 |
|
38 |
with header:
|
39 |
-
st.title(
|
40 |
|
41 |
with description:
|
42 |
st.header("Dataset description")
|
43 |
-
|
44 |
-
''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
|
45 |
-
Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
|
46 |
-
But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
|
47 |
-
we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
|
48 |
-
contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
|
49 |
-
or many more. In all these situations, you will need to conclude the contract, but there is a high probability
|
50 |
-
that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
|
51 |
-
businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
|
52 |
-
requiring consumers to accept.
|
53 |
-
|
54 |
-
Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
|
55 |
-
clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
|
56 |
-
situation of imbalance between the duties and rights of the parties.
|
57 |
-
|
58 |
-
On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
|
59 |
-
we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
|
60 |
-
learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
|
61 |
-
agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
|
62 |
-
analyze contracts and understand what they agree upon.
|
63 |
-
"""
|
64 |
-
st.write(desc)
|
65 |
-
|
66 |
-
st.header("Dataset statistics")
|
67 |
|
68 |
with dataset_statistics:
|
|
|
69 |
st.subheader("Number of samples in each data split")
|
70 |
metrics_df = pd.DataFrame.from_dict(
|
71 |
{
|
72 |
"Train": DATA_DICT["train"].shape[0],
|
73 |
-
"
|
74 |
"Test": DATA_DICT["test"].shape[0],
|
75 |
"Total": sum(
|
76 |
[
|
77 |
DATA_DICT["train"].shape[0],
|
78 |
-
DATA_DICT["
|
79 |
DATA_DICT["test"].shape[0],
|
80 |
]
|
81 |
),
|
@@ -84,61 +112,44 @@ with dataset_statistics:
|
|
84 |
).reset_index()
|
85 |
metrics_df.columns = ["Subset", "Number of samples"]
|
86 |
st.dataframe(metrics_df)
|
87 |
-
latex_df = pd.DataFrame([metrics_df.style.to_latex()])
|
88 |
-
st.button(
|
89 |
-
label="Copy table to LaTeX",
|
90 |
-
on_click=latex_df.to_clipboard(index=False, header=False),
|
91 |
-
key="copy_metrics_df",
|
92 |
-
)
|
93 |
|
94 |
-
|
95 |
-
|
|
|
|
|
96 |
st.subheader("Class distribution in each subset")
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
[
|
102 |
-
|
103 |
-
for k, df in DATA_DICT.items()
|
104 |
-
]
|
105 |
-
)
|
106 |
-
.reset_index()
|
107 |
-
.rename({"index": "split_name"}, axis=1)
|
108 |
-
)
|
109 |
-
barchart_class_dist = go.Figure(
|
110 |
-
data=[
|
111 |
-
go.Bar(
|
112 |
-
name="BEZPIECZNE_POSTANOWIENIE_UMOWNE",
|
113 |
-
x=DATA_SPLITS,
|
114 |
-
y=hist["BEZPIECZNE_POSTANOWIENIE_UMOWNE"].values,
|
115 |
-
),
|
116 |
-
go.Bar(
|
117 |
-
name="KLAUZULA_ABUZYWNA",
|
118 |
-
x=DATA_SPLITS,
|
119 |
-
y=hist["KLAUZULA_ABUZYWNA"].values,
|
120 |
-
),
|
121 |
]
|
122 |
)
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
for _ in range(10):
|
132 |
-
st.text("")
|
133 |
-
st.dataframe(hist)
|
134 |
-
latex_df_class_dist = pd.DataFrame([hist.style.to_latex()])
|
135 |
-
st.button(
|
136 |
-
label="Copy table to LaTeX",
|
137 |
-
on_click=latex_df_class_dist.to_clipboard(header=False, index=False),
|
138 |
-
key="copy_class_dist_df",
|
139 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
# Number of words per observation
|
|
|
142 |
hist_data_num_words = [
|
143 |
df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
|
144 |
]
|
@@ -155,6 +166,7 @@ with class_distribution:
|
|
155 |
st.plotly_chart(fig_num_words, use_container_width=True)
|
156 |
|
157 |
# Number of characters per observation
|
|
|
158 |
hist_data_num_characters = [
|
159 |
df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
|
160 |
]
|
|
|
1 |
import re
|
|
|
2 |
|
3 |
+
from datasets import load_dataset
|
4 |
import pandas as pd
|
5 |
import plotly.figure_factory as ff
|
6 |
import plotly.graph_objects as go
|
7 |
+
import pyperclip
|
8 |
import streamlit as st
|
9 |
from unidecode import unidecode
|
10 |
|
11 |
+
DATA_SPLITS = ["train", "validation", "test"]
|
12 |
|
|
|
13 |
|
14 |
+
def load_data() -> dict[str, pd.DataFrame]:
    """Read every CSV split from ``data/`` into a DataFrame.

    Returns a mapping of split name -> DataFrame, with the ``label``
    column renamed to ``target`` so all datasets share one schema.
    """
    splits: dict[str, pd.DataFrame] = {}
    for split in DATA_SPLITS:
        frame = pd.read_csv(f"data/{split}.csv")
        splits[split] = frame.rename({"label": "target"}, axis="columns")
    return splits
|
21 |
|
22 |
|
23 |
+
def flatten_list(main_list: list[list]) -> list:
    """Flatten one level of nesting: concatenate all sub-lists into a single list."""
    flattened: list = []
    for sublist in main_list:
        flattened.extend(sublist)
    return flattened
|
25 |
|
26 |
|
|
|
32 |
return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
|
33 |
|
34 |
|
35 |
+
# Sidebar widget: the chosen dataset id drives load_hf_dataset() below.
_DATASET_OPTIONS = (
    "clarin-pl/polemo2-official",
    "laugustyniak/abusive-clauses-pl",
)
selected_dataset = st.sidebar.selectbox("Choose a dataset to load", _DATASET_OPTIONS)
|
39 |
+
|
40 |
+
|
41 |
+
def load_hf_dataset():
|
42 |
+
match selected_dataset:
|
43 |
+
case "clarin-pl/polemo2-official":
|
44 |
+
data = load_dataset("clarin-pl/polemo2-official")
|
45 |
+
DATA_DICT = {
|
46 |
+
"train": data["train"].to_pandas(),
|
47 |
+
"validation": data["validation"].to_pandas(),
|
48 |
+
"test": data["test"].to_pandas(),
|
49 |
+
}
|
50 |
+
DATA_DESCRIPTION = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
|
51 |
+
hotels, products, and university. It is human-annotated on a level of full reviews and individual
|
52 |
+
sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
|
53 |
+
sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
|
54 |
+
046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
|
55 |
+
annotated with four labels: positive, negative, neutral, or ambiguous. """
|
56 |
+
case "laugustyniak/abusive-clauses-pl":
|
57 |
+
DATA_DICT = load_data()
|
58 |
+
DATA_DESCRIPTION = """
|
59 |
+
''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
|
60 |
+
Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
|
61 |
+
But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
|
62 |
+
we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
|
63 |
+
contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
|
64 |
+
or many more. In all these situations, you will need to conclude the contract, but there is a high probability
|
65 |
+
that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
|
66 |
+
businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
|
67 |
+
requiring consumers to accept.
|
68 |
+
|
69 |
+
Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
|
70 |
+
clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
|
71 |
+
situation of imbalance between the duties and rights of the parties.
|
72 |
+
|
73 |
+
On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
|
74 |
+
we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
|
75 |
+
learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
|
76 |
+
agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
|
77 |
+
analyze contracts and understand what they agree upon.
|
78 |
+
"""
|
79 |
+
return DATA_DICT, DATA_DESCRIPTION
|
80 |
+
|
81 |
+
|
82 |
+
# --- Dashboard body: load the selected dataset and render its statistics. ---
DATA_DICT, DATA_DESCRIPTION = load_hf_dataset()

header = st.container()
description = st.container()
dataset_statistics = st.container()

with header:
    st.title(selected_dataset)

with description:
    st.header("Dataset description")
    st.write(DATA_DESCRIPTION)

with dataset_statistics:
    st.header("Dataset statistics")

    # Sample counts per split, plus the grand total, as a two-column table.
    st.subheader("Number of samples in each data split")
    metrics_df = pd.DataFrame.from_dict(
        {
            "Train": DATA_DICT["train"].shape[0],
            "Validation": DATA_DICT["validation"].shape[0],
            "Test": DATA_DICT["test"].shape[0],
            "Total": sum(
                [
                    DATA_DICT["train"].shape[0],
                    DATA_DICT["validation"].shape[0],
                    DATA_DICT["test"].shape[0],
                ]
            ),
        },
        orient="index",
    ).reset_index()
    metrics_df.columns = ["Subset", "Number of samples"]
    st.dataframe(metrics_df)

    # Expose the table as LaTeX so it can be pasted into a paper.
    latex_df = metrics_df.style.to_latex()
    st.text_area(label="Latex code", value=latex_df)

    # Class distribution in each subset.
    st.subheader("Class distribution in each subset")
    target_unique_values = DATA_DICT["train"]["target"].unique()
    # One row per split; columns are class labels, values are normalized
    # frequencies (fractions summing to 1 within each split).
    hist = (
        pd.DataFrame(
            [
                df["target"].value_counts(normalize=True).rename(k)
                for k, df in DATA_DICT.items()
            ]
        )
        .reset_index()
        .rename({"index": "split_name"}, axis=1)
    )
    # One bar trace per class label, grouped by split.
    plot_data = [
        go.Bar(
            name=str(class_label),
            x=DATA_SPLITS,
            y=hist[class_label].values,
        )
        for class_label in target_unique_values
    ]
    barchart_class_dist = go.Figure(data=plot_data)
    barchart_class_dist.update_layout(
        barmode="group",
        title_text="Barchart - class distribution",
        xaxis_title="Split name",
        # value_counts(normalize=True) yields fractions, not raw counts,
        # so the axis label must say so.
        yaxis_title="Fraction of samples",
    )
    st.plotly_chart(barchart_class_dist, use_container_width=True)
    st.dataframe(hist)
    st.text_area(label="Latex code", value=hist.style.to_latex())
|
150 |
|
151 |
# Number of words per observation
|
152 |
+
st.subheader("Number of words per observation in each subset")
|
153 |
hist_data_num_words = [
|
154 |
df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
|
155 |
]
|
|
|
166 |
st.plotly_chart(fig_num_words, use_container_width=True)
|
167 |
|
168 |
# Number of characters per observation
|
169 |
+
st.subheader("Number of characters per observation in each subset")
|
170 |
hist_data_num_characters = [
|
171 |
df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
|
172 |
]
|
data/{dev.csv → validation.csv}
RENAMED
File without changes
|