Spaces:

tlkh
/

paraphrase-metrics-mrpc

Runtime error

Initial commit

0d49845 over 2 years ago

3.94 kB

	import streamlit as st
	import pandas as pd

	# Ask Streamlit for the wide page layout before any widget is created.
	st.set_page_config(layout="wide")

	# --- Sidebar, part 1: which split, which sentence source, which pair types ---
	st.sidebar.markdown("Data Filter Options")
	split = st.sidebar.selectbox(label="Dataset Split", options=["train", "test"])
	display = st.sidebar.selectbox(
	    label="Source",
	    options=["All", "Only MRPC", "Only MRPC-R1"],
	)

	# The selected label is later matched verbatim inside filter_df.
	pair_type_options = [
	    "All",
	    "Only Paraphrases (MRPC-R1)",
	    "Only Paraphrases (MRPC)",
	    "Rejected Paraphrases from MRPC",
	    "Corrected Paraphrases from MRPC",
	]
	ptype = st.sidebar.radio(label="Paraphrase Pair Types", options=pair_type_options)

	# --- Sidebar, part 2: which score columns to filter on, and the allowed ranges ---
	st.sidebar.markdown("Score Filter Options")
	filter_by = st.sidebar.selectbox(
	    label="Filter By Scores From",
	    options=["MRPC", "MRPC-R1"],
	)
	# Both sliders are range sliders (tuple default), so each returns a (low, high) pair.
	display_range_wpd = st.sidebar.slider(
	    label="Filter by WPD Scores",
	    min_value=0.0,
	    max_value=1.0,
	    value=(0.1, 0.7),
	)
	display_range_ld = st.sidebar.slider(
	    label="Filter by LD Scores",
	    min_value=0.0,
	    max_value=1.0,
	    value=(0.1, 0.4),
	)

	# --- Sidebar, part 3: short description of the demo ---
	# NOTE(review): the text below contains the typo "presenting the a revision";
	# it is left untouched here because this rewrite keeps runtime strings as-is.
	st.sidebar.markdown("""Explanation
	This demo allows you to explore the data inside [MRPC](https://www.microsoft.com/en-us/download/details.aspx?id=52398), showing how we can use Word Position Deviation (WPD) and Lexical Deviation (LD) to find different types of paraphrases. By using what we observe from the data, we can also correct numerous labelling errors inside MRPC, presenting the a revision of MRPC termed as MRPC-R1. This demo accompanies the paper ["Towards Better Characterization of Paraphrases" (ACL 2022)](https://github.com/tlkh/paraphrase-metrics).""")

	# Main page title.
	st.markdown("MRPC Paraphrase Data Explorer")

	def load_df(split):
	    """Read the score table for the requested dataset split.

	    Args:
	        split: "train" selects ./mrpc_train_scores.csv; any other value
	            selects ./mrpc_test_scores.csv.

	    Returns:
	        The CSV contents as a DataFrame with a fresh 0..n-1 index.
	    """
	    csv_path = (
	        "./mrpc_train_scores.csv" if split == "train" else "./mrpc_test_scores.csv"
	    )
	    # Paths are relative, so they resolve against the current working directory.
	    return pd.read_csv(csv_path).reset_index(drop=True)

	def filter_df(df, display, ptype, filter_by, wpd_range=None, ld_range=None):
	    """Select, filter and sort the rows of a score table for display.

	    Args:
	        df: DataFrame holding the columns referenced below (og_s1, og_s2,
	            new_s1, new_s2, og_label, new_label, remarks, og_wpd, og_ld,
	            new_wpd, new_ld). It is not modified.
	        display: Which sentence columns to keep. "Only MRPC" drops
	            new_s1/new_s2, "Only MRPC-R1" drops og_s1/og_s2, anything else
	            keeps both. The bare names "MRPC" and "MRPC-R1" are accepted too.
	        ptype: Row selection, one of
	            "Only Paraphrases (MRPC)"        -> og_label == 1
	            "Only Paraphrases (MRPC-R1)"     -> new_label == 1
	            "Rejected Paraphrases from MRPC" -> new_label == 0 and og_label == 1
	            "Corrected Paraphrases from MRPC"-> remarks == "corrected"
	            Any other value keeps every row.
	        filter_by: "MRPC" filters and sorts on og_wpd/og_ld; any other value
	            uses new_wpd/new_ld.
	        wpd_range: Optional (low, high) pair for the WPD score. Defaults to
	            the module-level display_range_wpd.
	        ld_range: Optional (low, high) pair for the LD score. Defaults to
	            the module-level display_range_ld.

	    Returns:
	        A new DataFrame with the rows whose WPD and LD scores fall in the
	        half-open ranges [low, high), sorted ascending by WPD and then by LD.
	    """
	    # Fall back to the sidebar slider values when no explicit range is given,
	    # so existing four-argument calls keep working.
	    if wpd_range is None:
	        wpd_range = display_range_wpd
	    if ld_range is None:
	        ld_range = display_range_ld

	    # Column selection. The sidebar emits "Only MRPC" / "Only MRPC-R1"; the
	    # bare names are kept as aliases for callers that used the old spelling.
	    if display in ("Only MRPC", "MRPC"):
	        df = df.drop(["new_s1", "new_s2"], axis=1)
	    elif display in ("Only MRPC-R1", "MRPC-R1"):
	        df = df.drop(["og_s1", "og_s2"], axis=1)

	    # Row selection by paraphrase type.
	    if ptype == "Only Paraphrases (MRPC)":
	        df_sel = df[df.og_label == 1]
	    elif ptype == "Only Paraphrases (MRPC-R1)":
	        df_sel = df[df.new_label == 1]
	    elif ptype == "Rejected Paraphrases from MRPC":
	        df_sel = df[(df.new_label == 0) & (df.og_label == 1)]
	    elif ptype == "Corrected Paraphrases from MRPC":
	        df_sel = df[df.remarks == "corrected"]
	    else:
	        df_sel = df

	    # Pick the score columns once instead of duplicating each branch.
	    prefix = "og" if filter_by == "MRPC" else "new"
	    wpd_col = prefix + "_wpd"
	    ld_col = prefix + "_ld"

	    # Filter by score. NOTE(review): the upper bound is exclusive, so a score
	    # equal to the high end of a range (e.g. exactly 1.0) is never shown.
	    wpd = df_sel[wpd_col]
	    ld = df_sel[ld_col]
	    in_range = (
	        (wpd >= wpd_range[0])
	        & (wpd < wpd_range[1])
	        & (ld >= ld_range[0])
	        & (ld < ld_range[1])
	    )
	    df_sel = df_sel[in_range]

	    # Sort by score: one multi-key sort gives a deterministic WPD-then-LD
	    # order and returns a new frame rather than sorting a slice in place.
	    return df_sel.sort_values([wpd_col, ld_col])


	# Load the chosen split, then apply the sidebar selections to it.
	df = load_df(split)
	df_sel = filter_df(df, display, ptype, filter_by)

	# Style rules that set `tbody th` cells and `.blank` elements to display:none,
	# which is how the table's row-index column is hidden.
	hide_table_row_index = """
	<style>
	tbody th {display:none}
	.blank {display:none}
	</style>
	"""

	# The row count and the CSS go out in one markdown call; unsafe_allow_html
	# is passed so the <style> element is emitted as HTML.
	summary_html = "Total {} items{}".format(len(df_sel), hide_table_row_index)
	st.markdown(summary_html, unsafe_allow_html=True)

	# Render the filtered rows as a static table.
	st.table(df_sel)