tlkh committed on
Commit
0d49845
1 Parent(s): 55384ed

Initial commit

Browse files
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. app.py +96 -0
  3. mrpc_test_scores.csv +0 -0
  4. mrpc_train_scores.csv +0 -0
  5. requirements.txt +2 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
# Page configuration is issued before any other Streamlit command; keep it first.
st.set_page_config(layout="wide")

# --- Sidebar: which rows and columns to show -------------------------------
st.sidebar.markdown("**Data Filter Options**")
split = st.sidebar.selectbox("Dataset Split", ["train", "test"])
display = st.sidebar.selectbox("Source", ["All", "Only MRPC", "Only MRPC-R1"])

ptype = st.sidebar.radio(
    "Paraphrase Pair Types",
    [
        "All",
        "Only Paraphrases (MRPC-R1)",
        "Only Paraphrases (MRPC)",
        "Rejected Paraphrases from MRPC",
        "Corrected Paraphrases from MRPC",
    ],
)

# --- Sidebar: score-based filtering ----------------------------------------
st.sidebar.markdown("**Score Filter Options**")
filter_by = st.sidebar.selectbox("Filter By Scores From", ["MRPC", "MRPC-R1"])
# A tuple default makes each slider a range slider returning (low, high).
display_range_wpd = st.sidebar.slider(
    "Filter by WPD Scores", min_value=0.0, max_value=1.0, value=(0.1, 0.7)
)
display_range_ld = st.sidebar.slider(
    "Filter by LD Scores", min_value=0.0, max_value=1.0, value=(0.1, 0.4)
)

# --- Sidebar: short description of the demo --------------------------------
st.sidebar.markdown("""**Explanation**
This demo allows you to explore the data inside [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398), showing how we can use Word Position Deviation (WPD) and Lexical Deviation (LD) to find different types of paraphrases. By using what we observe from the data, we can also correct numerous labelling errors inside MRPC, presenting a revision of MRPC termed as MRPC-R1. This demo accompanies the paper ["Towards Better Characterization of Paraphrases" (ACL 2022)](https://github.com/tlkh/paraphrase-metrics).""")

# Main page title.
st.markdown("**MRPC Paraphrase Data Explorer**")
25
+
26
def load_df(split):
    """Read the score table for one dataset split from the working directory.

    Args:
        split: ``"train"`` selects ``./mrpc_train_scores.csv``; any other
            value selects ``./mrpc_test_scores.csv``.

    Returns:
        The CSV contents as a DataFrame with a fresh 0..n-1 index.
    """
    csv_path = (
        "./mrpc_train_scores.csv" if split == "train" else "./mrpc_test_scores.csv"
    )
    table = pd.read_csv(csv_path)
    return table.reset_index(drop=True)
33
+
34
def filter_df(df, display, ptype, filter_by, wpd_range=None, ld_range=None):
    """Select, score-filter and sort the rows of a score table for display.

    Args:
        df: DataFrame with the ``og_*`` / ``new_*`` sentence, label and score
            columns referenced below, plus ``remarks``. It is not modified.
        display: Source selection. ``"MRPC"`` or ``"Only MRPC"`` drops the
            ``new_s1``/``new_s2`` columns; ``"MRPC-R1"`` or ``"Only MRPC-R1"``
            drops ``og_s1``/``og_s2``; any other value keeps every column.
        ptype: Pair-type selection; one of the sidebar radio labels. Any
            unrecognised value (e.g. ``"All"``) keeps every row.
        filter_by: ``"MRPC"`` filters and sorts on ``og_wpd``/``og_ld``; any
            other value uses ``new_wpd``/``new_ld``.
        wpd_range: Optional ``(low, high)`` pair for the WPD score. When
            omitted, the module-level ``display_range_wpd`` is used.
        ld_range: Optional ``(low, high)`` pair for the LD score. When
            omitted, the module-level ``display_range_ld`` is used.

    Returns:
        A new DataFrame containing the rows whose WPD and LD scores both
        satisfy ``low <= score < high``, sorted ascending by WPD and then LD.
    """
    # Fall back to the sidebar slider values so existing four-argument
    # callers keep working unchanged.
    if wpd_range is None:
        wpd_range = display_range_wpd
    if ld_range is None:
        ld_range = display_range_ld

    # Column selection. The sidebar emits the "Only ..." labels; the bare
    # names are still accepted for backward compatibility.
    if display in ("MRPC", "Only MRPC"):
        df = df.drop(["new_s1", "new_s2"], axis=1)
    elif display in ("MRPC-R1", "Only MRPC-R1"):
        df = df.drop(["og_s1", "og_s2"], axis=1)

    # Row selection by paraphrase pair type.
    if ptype == "Only Paraphrases (MRPC)":
        df_sel = df[df.og_label == 1]
    elif ptype == "Only Paraphrases (MRPC-R1)":
        df_sel = df[df.new_label == 1]
    elif ptype == "Rejected Paraphrases from MRPC":
        df_sel = df[(df.new_label == 0) & (df.og_label == 1)]
    elif ptype == "Corrected Paraphrases from MRPC":
        df_sel = df[df.remarks == "corrected"]
    else:
        df_sel = df

    # Pick the score columns that belong to the chosen source.
    prefix = "og" if filter_by == "MRPC" else "new"
    wpd_col = prefix + "_wpd"
    ld_col = prefix + "_ld"

    # Keep rows whose scores fall inside both half-open ranges.
    wpd = df_sel[wpd_col]
    ld = df_sel[ld_col]
    in_range = (
        (wpd >= wpd_range[0])
        & (wpd < wpd_range[1])
        & (ld >= ld_range[0])
        & (ld < ld_range[1])
    )

    # One multi-key sort gives a deterministic order (WPD first, LD as the
    # tie-breaker) and returns a new frame instead of sorting a selection
    # in place.
    return df_sel[in_range].sort_values([wpd_col, ld_col])
79
+
80
+
81
# Load the chosen split, then apply the sidebar filters.
df = load_df(split)
df_sel = filter_df(df, display, ptype, filter_by)

# CSS that hides the row-index cells of the rendered table.
hide_table_row_index = """
<style>
tbody th {display:none}
.blank {display:none}
</style>
"""

# The CSS is appended to the row-count line so it is injected with it;
# unsafe_allow_html lets the <style> tag through.
st.markdown(
    f"Total {len(df_sel)} items{hide_table_row_index}",
    unsafe_allow_html=True,
)

st.table(data=df_sel)
mrpc_test_scores.csv ADDED
The diff for this file is too large to render. See raw diff
 
mrpc_train_scores.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas
2
+ streamlit