tlkh committed
Commit
653482e
1 Parent(s): 0c4f0e2

update app

Files changed (1)
  1. app.py +44 -31
app.py CHANGED
@@ -7,13 +7,13 @@ with st.sidebar.expander("📍 Explanation", expanded=False):
     st.markdown("""
     **About**
 
-    This demo allows you to explore the data inside the [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398) dataset.
+    This demo allows you to explore the data inside the [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398) dataset.
     It illustrates how **Word Position Deviation (WPD)** and **Lexical Deviation (LD)** can be used to find different types of [paraphrase pairs](https://direct.mit.edu/coli/article/39/3/463/1434/What-Is-a-Paraphrase) inside MRPC.
     By using what we observe from the data, we can find and correct numerous labelling errors inside MRPC, thus we present a revision of MRPC termed as **MRPC-R1**.
 
     **Data Display**
 
-    The paraphrase pairs are displayed as **S1** and **S2** from the original MRPC (columns 1,2) and MRPC-R1 (columns 3,4), along with their labels (columns 5), showing if the label was changed or kept. **1->0** means that the pair was labelled as a paraphrase in MRPC, but corrected to non-paraphrase in MRPC-R1, meaning we rejected the paraphrase.
+    The paraphrase pairs are displayed as **S1** and **S2** from the original MRPC (columns 1,2) and MRPC-R1 (columns 3,4), along with their labels (columns 5), showing if the label was changed or kept.
 
     By changing the **Display Types** option below, you can filter the displayed pairs to show pairs that were rejected (label changed from paraphrase to non-paraphrase) or corrected (inconsistencies corrected).
 
@@ -33,10 +33,23 @@ ptype = st.sidebar.radio("Display Types", ["All Paraphrases",
 display_reason = st.sidebar.checkbox(
     "Display reason for label change", value=False)
 
+with st.sidebar.expander("📍 Label Change Explanation", expanded=False):
+    st.markdown("""
+    Labels may change between MRPC and MRPC-R1, as displayed in column 5.
+
+    For example, **1->0** means that the pair was labelled as a paraphrase (1) in MRPC, but corrected to non-paraphrase (0) in MRPC-R1, meaning we **rejected** the paraphrase.
+
+    There are three main cases:
+
+    1. **no need to correct**: label was accepted. The text in original pair and new pair is the **same**.
+    2. **corrected**: label was kept as sentences were corrected. The text in original pair and new pair is **different**.
+    3. **can't correct**: label was rejected as sentences could not be corrected. The text in original pair and new pair is the **same**.
+    """)
+
 st.sidebar.markdown("**WPD/LD Score Filter Options**")
-display_range_wpd = st.sidebar.slider(
+display_range_wpd=st.sidebar.slider(
     "Filter by WPD Scores", min_value=0.0, max_value=1.0, value=(0.1, 0.7))
-display_range_ld = st.sidebar.slider(
+display_range_ld=st.sidebar.slider(
     "Filter by LD Scores", min_value=0.0, max_value=1.0, value=(0.1, 0.4))
 
 with st.sidebar.expander("📍 WPD/LD Score Explanation", expanded=False):
@@ -53,17 +66,17 @@ with st.sidebar.expander("📍 WPD/LD Score Explanation", expanded=False):
 
 st.markdown("**Additional Filter Options**")
 
-filter_by = st.radio(
+filter_by=st.radio(
     "Filter By Scores From", ["MRPC", "MRPC-R1"])
 
-display_scores = st.checkbox("Display scores", value=False)
+display_scores=st.checkbox("Display scores", value=False)
 
 
 def load_df(split):
     if split == "train":
-        df = pd.read_csv("./mrpc_train_scores.csv")
+        df=pd.read_csv("./mrpc_train_scores.csv")
     else:
-        df = pd.read_csv("./mrpc_test_scores.csv")
+        df=pd.read_csv("./mrpc_test_scores.csv")
     df.reset_index(drop=True, inplace=True)
     return df
 
@@ -71,44 +84,44 @@ def load_df(split):
 def filter_df(df, display, ptype, filter_by, display_scores):
     # filter data
     if display == "Only MRPC":
-        df = df.drop(["new_s1", "new_s2"], axis=1)
+        df=df.drop(["new_s1", "new_s2"], axis=1)
     elif display == "Only MRPC-R1":
-        df = df.drop(["og_s1", "og_s2"], axis=1)
+        df=df.drop(["og_s1", "og_s2"], axis=1)
     # filter paraphrase type
     if ptype == "All Paraphrases":
-        condition = df.og_label == 1
-        df_sel = df[condition]
+        condition=df.og_label == 1
+        df_sel=df[condition]
     elif ptype == "Only Paraphrases in MRPC-R1":
-        condition = df.new_label == 1
-        df_sel = df[condition]
+        condition=df.new_label == 1
+        df_sel=df[condition]
     elif ptype == "Rejected Paraphrases from MRPC":
-        condition = (df.new_label == 0) & (df.og_label == 1)
-        df_sel = df[condition]
+        condition=(df.new_label == 0) & (df.og_label == 1)
+        df_sel=df[condition]
    elif ptype == "Corrected Paraphrases from MRPC":
-        condition = df.remarks == "corrected"
-        df_sel = df[condition]
+        condition=df.remarks == "corrected"
+        df_sel=df[condition]
     else:
         # all
-        df_sel = df
+        df_sel=df
     # sort by scores
     if filter_by == "MRPC":
         # wpd
-        condition = (df_sel.og_wpd >= display_range_wpd[0]) & (
+        condition=(df_sel.og_wpd >= display_range_wpd[0]) & (
            df_sel.og_wpd < display_range_wpd[1])
-        df_sel = df_sel[condition]
+        df_sel=df_sel[condition]
         # ld
-        condition = (df_sel.og_ld >= display_range_ld[0]) & (
+        condition=(df_sel.og_ld >= display_range_ld[0]) & (
            df_sel.og_ld < display_range_ld[1])
-        df_sel = df_sel[condition]
+        df_sel=df_sel[condition]
     else:
         # wpd
-        condition = (df_sel.new_wpd >= display_range_wpd[0]) & (
+        condition=(df_sel.new_wpd >= display_range_wpd[0]) & (
            df_sel.new_wpd < display_range_wpd[1])
-        df_sel = df_sel[condition]
+        df_sel=df_sel[condition]
         # ld
-        condition = (df_sel.new_ld >= display_range_ld[0]) & (
+        condition=(df_sel.new_ld >= display_range_ld[0]) & (
            df_sel.new_ld < display_range_ld[1])
-        df_sel = df_sel[condition]
+        df_sel=df_sel[condition]
     # filter scores
     if filter_by == "MRPC":
         df_sel.sort_values("og_ld", inplace=True)
@@ -122,16 +135,16 @@ def filter_df(df, display, ptype, filter_by, display_scores):
     if not display_reason:
         df_sel.drop(["remarks", ],
                     axis=1, inplace=True)
-    label_col = df_sel["og_label"].astype(
+    label_col=df_sel["og_label"].astype(
         str)+"->"+df_sel["new_label"].astype(str)
-    df_sel["og/new label"] = label_col
+    df_sel["og/new label"]=label_col
     df_sel.drop(["og_label", "new_label"], axis=1, inplace=True)
     return df_sel
 
 
-df = load_df(split)
+df=load_df(split)
 
-df_sel = filter_df(df, display, ptype, filter_by, display_scores)
+df_sel=filter_df(df, display, ptype, filter_by, display_scores)
 df_sel.rename(columns={"og_s1": "Original S1 (MRPC)", "og_s2": "Original S2 (MRPC)",
                        "new_s1": "New S1 (MRPC-R1)", "new_s2": "New S2 (MRPC-R1)"}, inplace=True)
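For reference, a minimal standalone sketch (not part of this commit) of the two ideas the diff touches: the half-open WPD/LD score-range filter used in filter_df and the combined og/new label column that encodes changes such as 1->0. The column names and default ranges mirror app.py above; the two-row DataFrame and its values are made up for illustration.

import pandas as pd

# Toy data with the same column names app.py expects (values are invented).
df = pd.DataFrame({
    "og_s1": ["sentence a", "sentence b"],
    "og_s2": ["sentence a rephrased", "sentence b rephrased"],
    "og_wpd": [0.15, 0.85],
    "og_ld": [0.20, 0.35],
    "og_label": [1, 1],
    "new_label": [1, 0],
})

# Same half-open range test as filter_df(): keep rows with
# range[0] <= score < range[1], using the slider defaults from the app.
display_range_wpd = (0.1, 0.7)
display_range_ld = (0.1, 0.4)
mask = (
    (df.og_wpd >= display_range_wpd[0]) & (df.og_wpd < display_range_wpd[1])
    & (df.og_ld >= display_range_ld[0]) & (df.og_ld < display_range_ld[1])
)
df_sel = df[mask].copy()

# Combined label column: "1->0" marks a pair labelled paraphrase in MRPC
# but rejected (non-paraphrase) in MRPC-R1.
df_sel["og/new label"] = (
    df_sel["og_label"].astype(str) + "->" + df_sel["new_label"].astype(str)
)
print(df_sel[["og_s1", "og_s2", "og/new label"]])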