Spaces:

Booking-com
/

rectour24-review-ranking-leaderboard-test

Running

rectour24-review-ranking-leaderboard-test / app.py

Eran Fainman

add set_name to table

fa2db7c 9 months ago

6.82 kB

	import os
	import uuid
	import numpy as np
	import pandas as pd
	import streamlit as st
	import huggingface_hub as hh
	from datetime import datetime

	# --- Hugging Face Hub configuration ---
	# organisation that owns every repository referenced below
	OWNER = "Booking-com"
	# a group e-mail whose highest stored revision reaches this value cannot submit again
	MAX_SUBMISSIONS = 100

	# repositories: this Space, the stored results, the ground truth and the group registry
	REPO_ID = OWNER + "/streamlit-review-ranking-leaderboard"
	RESULTS_REPO = OWNER + "/results"
	GT_REPO = OWNER + "/accommodation-reviews-gt"
	GROUPS_INFO_REPO = OWNER + "/rectour2024-groups"

	# access token and local cache root, both taken from the environment
	TOKEN = os.getenv("HF_TOKEN")
	CACHE_PATH = os.environ.get("HF_HOME", ".")

	# local folders (under the cache root) that mirror the datasets above
	EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
	TEMP_RESULTS_PATH = os.path.join(CACHE_PATH, "temp-results")
	GT_PATH = os.path.join(CACHE_PATH, "gt")
	GROUPS_INFO_PATH = os.path.join(CACHE_PATH, "groups-info")

	# columns every prediction file must contain: the two keys plus review_1 .. review_10
	REQUIRED_COLUMNS = ['accommodation_id', 'user_id', *(f'review_{rank}' for rank in range(1, 11))]

	# shared Hub client, authenticated with TOKEN
	API = hh.HfApi(token=TOKEN)


	def restart_space():
	    """Request a restart of the Space identified by ``REPO_ID`` through the shared ``API`` client."""
	    API.restart_space(repo_id=REPO_ID)


	# Fetch the ground-truth dataset into GT_PATH once, at module level.
	# It is not part of refresh_data() because it should not change often.
	hh.snapshot_download(
	    repo_id=GT_REPO,
	    repo_type="dataset",
	    local_dir=GT_PATH,
	    token=TOKEN,
	    etag_timeout=30,
	    tqdm_class=None,
	)


	def refresh_data():
	    """Download the results dataset and the groups-info dataset into their local folders.

	    The results repository is fetched first (into ``EVAL_RESULTS_PATH``), then the
	    groups-info repository (into ``GROUPS_INFO_PATH``).
	    """
	    sources = (
	        (RESULTS_REPO, EVAL_RESULTS_PATH),
	        (GROUPS_INFO_REPO, GROUPS_INFO_PATH),
	    )
	    for repo, destination in sources:
	        hh.snapshot_download(
	            repo_id=repo,
	            local_dir=destination,
	            repo_type="dataset",
	            tqdm_class=None,
	            etag_timeout=30,
	            token=TOKEN,
	        )


	# fill the local results and groups-info folders before the functions below read from them
	refresh_data()


	def get_match_index(row):
	    """Return the rank (1-10) of the first prediction equal to the row's ``review_id``.

	    ``row`` must be indexable by ``'review_id'`` and by ``'review_1'`` .. ``'review_10'``.
	    When none of the ten predictions equals ``review_id`` the result is ``np.inf``.
	    """
	    ranks_with_hit = (
	        rank
	        for rank in range(1, 11)
	        if row['review_id'] == row[f'review_{rank}']
	    )
	    # the generator is lazy, so the scan stops at the first matching rank
	    return next(ranks_with_hit, np.inf)


	def calculate_metrics(df_pred):
	    """Score a predictions frame against the ground-truth file ``test_matches.csv``.

	    Args:
	        df_pred: predictions with the key columns ``accommodation_id`` and ``user_id``
	            and the ranked columns ``review_1`` .. ``review_10``.

	    Returns:
	        A ``(mrr10, precision10)`` tuple: the mean reciprocal rank of the matching
	        review over all ground-truth rows, and the share of rows in which the
	        matching review appears among the ten predictions.

	    Raises:
	        Exception: when the predictions frame does not have exactly as many rows
	            as the ground truth.
	    """
	    df_gt = pd.read_csv(os.path.join(GT_PATH, 'test_matches.csv'))
	    if len(df_pred) != len(df_gt):
	        # the file can be too long as well as too short, so the message must not say "only"
	        raise Exception(
	            "Your predictions file should contain {} rows, but {} rows were found in the file".format(
	                len(df_gt), len(df_pred)
	            )
	        )

	    # left merge keeps every ground-truth row; rows without predictions get '' and never match
	    # NOTE(review): 'review_id' is presumably supplied by the ground-truth file - confirm
	    df_merged = pd.merge(df_gt, df_pred, how='left', on=['accommodation_id', 'user_id']).fillna('')
	    df_merged['match_index'] = df_merged.apply(get_match_index, axis=1)

	    # a miss has match_index == inf, which yields a reciprocal rank of 0 and a precision of 0
	    df_merged['mrr10'] = 1 / df_merged['match_index']
	    df_merged['precision10'] = (df_merged['match_index'] != np.inf).astype(int)

	    return df_merged['mrr10'].mean(), df_merged['precision10'].mean()


	def get_group_name_by_email(email):
	    """Return the ``group_name`` of the first row in ``groups_data.csv`` whose ``email`` equals ``email``.

	    The comparison is an exact equality on the ``email`` column.

	    Raises:
	        Exception: when no row carries the given e-mail.
	    """
	    groups = pd.read_csv(os.path.join(GROUPS_INFO_PATH, 'groups_data.csv'))
	    matching_rows = groups[groups['email'] == email].reset_index(drop=True)
	    if len(matching_rows) == 0:
	        raise Exception("E-mail is not valid")
	    return matching_rows.iloc[0]['group_name']


	def validate_pred_file(df_pred):
	    """Check that ``df_pred`` contains every column listed in ``REQUIRED_COLUMNS``.

	    Raises:
	        Exception: naming the first required column (in ``REQUIRED_COLUMNS`` order)
	            that is absent from the frame.
	    """
	    first_missing = next(
	        (name for name in REQUIRED_COLUMNS if name not in df_pred.columns),
	        None,
	    )
	    if first_missing is not None:
	        raise Exception(f"Column {first_missing} not in prediction file")


	def get_revision(df_results, email):
	    """Return the highest ``revision`` stored for ``email``, or 0 when it has no rows.

	    Args:
	        df_results: results frame with at least an ``email`` column, and a
	            ``revision`` column when rows for ``email`` exist.
	        email: group e-mail to look up (exact match).

	    Raises:
	        Exception: when the highest stored revision has reached ``MAX_SUBMISSIONS``.
	    """
	    group_rows = df_results[df_results['email'] == email]
	    # the 'revision' column is only read when the group already has results
	    latest = group_rows['revision'].max() if len(group_rows) > 0 else 0
	    if latest >= MAX_SUBMISSIONS:
	        raise Exception("We're sorry but you reached your maximal number of submissions")
	    return latest


	def get_results_dataframe(results_dir=None):
	    """Load every result CSV of a folder into a single DataFrame.

	    Args:
	        results_dir: folder scanned (non-recursively) for files ending in ``.csv``.
	            Defaults to ``EVAL_RESULTS_PATH`` when omitted.

	    Returns:
	        The concatenation of all CSV files found. When the folder holds no CSV
	        file, an empty frame carrying the result columns is returned, so callers
	        can still filter, drop and sort on those columns.
	    """
	    if results_dir is None:
	        results_dir = EVAL_RESULTS_PATH

	    frames = [
	        pd.read_csv(os.path.join(results_dir, name))
	        for name in os.listdir(results_dir)
	        if name.endswith('.csv')
	    ]
	    if not frames:
	        # pd.concat raises ValueError on an empty list; fall back to an empty table
	        # with the same columns as the one-row files written by upload_results
	        return pd.DataFrame(columns=[
	            'email', 'set_name', 'group_name', 'model_name',
	            'submission_date', 'revision', 'MRR@10', 'Precision@10',
	        ])
	    return pd.concat(frames)


	def upload_results(group_email, group_name, model_name, revision, mrr10, precision10):
	    """Write one submission as a single-row CSV and upload it to the results dataset.

	    Args:
	        group_email: e-mail of the submitting group (stored in the ``email`` column).
	        group_name: name of the submitting group.
	        model_name: model name supplied with the submission.
	        revision: revision number assigned to this submission.
	        mrr10: MRR@10 score of the submission.
	        precision10: Precision@10 score of the submission.

	    The row is stamped with the current local time and the fixed set name
	    ``"test set"``. The file gets a random UUID name, is written under
	    ``TEMP_RESULTS_PATH`` and uploaded under the same name to ``RESULTS_REPO``.
	    """
	    submission_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	    # exist_ok removes the check-then-create race of exists() + mkdir() and
	    # makedirs also creates a missing parent folder
	    os.makedirs(TEMP_RESULTS_PATH, exist_ok=True)

	    df_temp_results = pd.DataFrame({
	        'email': [group_email],
	        'set_name': ["test set"],
	        'group_name': [group_name],
	        "model_name": [model_name],
	        "submission_date": [submission_date],
	        "revision": [revision],
	        "MRR@10": [mrr10],
	        "Precision@10": [precision10],
	    })
	    # a UUID file name keeps submissions from overwriting each other in the repo
	    temp_results_fn = str(uuid.uuid4()) + '.csv'
	    temp_path = os.path.join(TEMP_RESULTS_PATH, temp_results_fn)
	    df_temp_results.to_csv(temp_path, index=False)
	    hh.upload_file(path_or_fileobj=temp_path, repo_id=RESULTS_REPO, token=TOKEN, repo_type="dataset",
	                   path_in_repo=temp_results_fn)


	def render():
	    """Build the Streamlit page with a leaderboard tab and a submission tab.

	    The leaderboard tab offers a "Refresh" button and shows the stored results
	    without the e-mail column, ordered by set name and then by descending MRR@10.
	    The submission tab collects a group e-mail, a model name and a zipped CSV.
	    On "Upload" the file is validated, scored, given the next revision number and
	    uploaded, then a summary is shown. Any exception raised during that sequence
	    is written to the submission tab instead.
	    """
	    st.set_page_config(page_title="RecTour2024 - Booking.com Review Ranking Challenge Leaderboard", layout="wide")
	    st.title("🏆 RecTour2024 Leaderboard")

	    leaderboard_tab, submission_tab = st.tabs(["Leaderboard", "Submission"])

	    # ---- leaderboard tab ----
	    refresh_clicked = leaderboard_tab.button("Refresh")
	    if refresh_clicked:
	        refresh_data()

	    df_results = get_results_dataframe()
	    # the e-mail column is removed before the table is displayed
	    public_results = df_results.drop(columns=['email'])
	    ranked_results = public_results.sort_values(['set_name', 'MRR@10'], ascending=[True, False])
	    leaderboard_tab.dataframe(ranked_results)

	    # ---- submission tab ----
	    group_email = submission_tab.text_input(label="Group email", value="")
	    model_name = submission_tab.text_input(label="Model name", value="")
	    pred_file = submission_tab.file_uploader(
	        label="Upload your prediction file",
	        help="Upload a csv.zip file, in pandas this can be achieved "
	             "with df.to_csv(<file_path>, compression='zip')",
	    )

	    upload_clicked = submission_tab.button("Upload")
	    if not upload_clicked:
	        return
	    if not pred_file:
	        submission_tab.markdown("no file was submitted!")
	        return

	    try:
	        group_name = get_group_name_by_email(group_email)
	        df_pred = pd.read_csv(pred_file, compression='zip')
	        validate_pred_file(df_pred)
	        mrr10, precision10 = calculate_metrics(df_pred)
	        # next revision id = highest stored revision for this e-mail + 1
	        revision = get_revision(df_results=df_results, email=group_email) + 1
	        upload_results(
	            group_email=group_email,
	            group_name=group_name,
	            model_name=model_name,
	            revision=revision,
	            mrr10=mrr10,
	            precision10=precision10,
	        )

	        submission_tab.markdown("## THANK YOU FOR YOUR SUBMISSION!")
	        submission_tab.markdown("Here are your submission details:")
	        submission_tab.markdown("Group name: " + group_name)
	        submission_tab.markdown("Model name: " + model_name)
	        submission_tab.markdown(f"Revision: {revision} (out of {MAX_SUBMISSIONS} allowed submissions)")

	        submission_tab.write("### Submission results")
	        submission_tab.markdown(f"MRR@10: {mrr10:.4f}")
	        submission_tab.markdown(f"Precision@10: {precision10:.4f}")
	    except Exception as err:
	        # every failure in the sequence above is reported to the user in the tab
	        submission_tab.markdown(err)


	# build the page only when this file is executed as the main script
	if __name__ == "__main__":
	    render()