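"""Gradio app for the TabArena leaderboard."""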
from __future__ import annotations
import zipfile
from dataclasses import dataclass
from pathlib import Path
import gradio as gr
import pandas as pd
import website_texts
from apscheduler.schedulers.background import BackgroundScheduler
from constants import Constants, model_type_emoji
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from website_texts import (
ABOUT_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
TITLE,
VERSION_HISTORY_BUTTON_TEXT,
)
def get_model_family(model_name: str) -> str:
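    """Map a model name to its family via prefix matching.

    Families: reference pipeline, neural network, tree-based, foundational,
    baseline, or other.
    """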
prefixes_mapping = {
Constants.reference: ["AutoGluon"],
Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
Constants.baseline: ["KNN", "LR"],
}
for method_type, prefixes in prefixes_mapping.items():
for prefix in prefixes:
if prefix.lower() in model_name.lower():
return method_type
return Constants.other
@dataclass
class LBContainer:
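    """One leaderboard subset: display name, results directory, and description blurb."""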
name: str
base_path_to_results: str
blurb: str
@property
    def _base_path(self) -> Path:
return Path(__file__).parent / "data" / self.base_path_to_results
def load_df_leaderboard(self) -> pd.DataFrame:
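        """Load the pre-computed leaderboard table for this subset."""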
df = pd.read_csv(self._base_path / "website_leaderboard.csv")
df = df.rename(columns={"1#": "#"})
return df
def _handle_img_zip(self, img_name: str) -> str:
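        """Unzip `<img_name>.png.zip` and return the path to the extracted PNG."""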
_base_path = self._base_path / img_name
zip_path = _base_path.with_suffix(".png.zip")
img_path = _base_path.with_suffix(".png")
with zipfile.ZipFile(zip_path, "r") as zipf:
zipf.extractall(img_path.parent)
return str(img_path)
def get_path_to_tuning_impact_elo(self) -> str:
return self._handle_img_zip("tuning-impact-elo")
def get_path_to_pareto_front_improvability_vs_time_infer(self) -> str:
return self._handle_img_zip("pareto_front_improvability_vs_time_infer")
def get_path_to_pareto_n_configs_imp(self) -> str:
return self._handle_img_zip("pareto_n_configs_imp")
def get_path_to_winrate_matrix(self) -> str:
return self._handle_img_zip("winrate_matrix")
def make_overview_images(lb: LBContainer, subset_name: str):
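    """Render the overview figures for one leaderboard subset.

    Shows the Elo overview, the inference-time Pareto front, and the tuning
    trajectories.
    """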
# Main Figure
gr.Image(
lb.get_path_to_tuning_impact_elo(),
label=f"Leaderboard Overview [{subset_name}]",
show_label=True,
height=500,
show_share_button=True,
)
with gr.Row():
with gr.Column(scale=1):
gr.Image(
value=lb.get_path_to_pareto_front_improvability_vs_time_infer(),
label=f"Inference Time Pareto Front [{subset_name}]",
height=400,
show_label=True,
show_share_button=True,
)
with gr.Column(scale=1):
gr.Image(
value=lb.get_path_to_pareto_n_configs_imp(),
label=f"Tuning Trajectories [{subset_name}]",
height=400,
show_label=True,
show_share_button=True,
)
def make_overview_leaderboard(lbs: list[LBContainer]):
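    """Build a table ranking all models (by Elo) across the given leaderboards.

    The reference pipeline is excluded, models missing from a leaderboard get
    "--", and the top-3 ranks per column are highlighted with medal colors.
    """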
# Create column per LB
    dfs = {lb.name: lb.load_df_leaderboard() for lb in lbs}
    all_models = {
        m.split("[")[0].strip()
        for df in dfs.values()
        for m in df[~df["TypeName"].isin(["Reference Pipeline"])]["Model"]
        .unique()
        .tolist()
    }
full_df = None
for lb in lbs:
        df = dfs[lb.name].copy()
df = df[~df["TypeName"].isin(["Reference Pipeline"])]
df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int)
df = df.sort_values(by=lb.name, ascending=True)
        # Adding medal indicators does not work: converting the column to
        # strings breaks the numeric sorting.
# df[lb.name] = df[lb.name].astype(str)
# df[lb.name] = df[lb.name].replace({
# "1": "πŸ₯‡ 1",
# "2": "πŸ₯ˆ 2",
# "3": "πŸ₯‰ 3",
# }
# )
df = df[["Type", "Model", lb.name]]
        # Remove the "[X% IMPUTED]" postfix from model names.
df["Model"] = (
df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string")
)
if full_df is None:
# TODO: add support in case a model did not run on the full LB.
assert all_models.difference(set(df["Model"].unique())) == set()
full_df = df
else:
df = df[["Model", lb.name]]
df_models = set(df["Model"].unique())
missing_models = all_models.difference(df_models)
if missing_models:
missing_models_df = pd.DataFrame(
[[mm, "--"] for mm in missing_models],
columns=["Model", lb.name],
)
df = pd.concat([df, missing_models_df], ignore_index=True)
df["Model"] = df["Model"].astype("string")
# Merge
full_df = full_df.merge(df, how="left", on="Model", validate="1:1")
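    # Gold, silver, and bronze background colors for ranks 1-3.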
medal_colors = ["#998A00", "#808080", "#8C5520"]
    # Highlight the top-3 ranks in each leaderboard column with medal colors.
def highlight_top3(col):
styles = [""] * len(col)
for index_i in range(len(col)):
if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3:
styles[index_i] = (
f"background-color: {medal_colors[col.iloc[index_i] - 1]};"
)
return styles
styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs])
return gr.DataFrame(
styler,
pinned_columns=2,
interactive=False,
show_search="search",
label="The ranking of all models (with imputation) across various leaderboards.",
)
def make_leaderboard(lb: LBContainer) -> Leaderboard:
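    """Build the interactive leaderboard table for one subset.

    Adds column selection plus filters for model type, tuning mode, and
    imputation status.
    """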
df_leaderboard = lb.load_df_leaderboard()
# -- Add filters
    df_leaderboard["TypeFilter"] = df_leaderboard["TypeName"].apply(
        lambda m: f"{m} {model_type_emoji[m]}"
    )
df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)")
df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)")
df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith(
"(tuned + ensemble)"
) | df_leaderboard["Model"].str.endswith("(4h)")
filter_columns = [
        ColumnFilter("TypeFilter", type="checkboxgroup", label="πŸ€– Model Types"),
ColumnFilter("Only Default", type="boolean", default=False),
ColumnFilter("Only Tuned", type="boolean", default=False),
ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
]
    # Add an imputation filter if any results were imputed.
if any(df_leaderboard["Imputed"]):
df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
{
True: "Imputed",
False: "Not Imputed",
}
)
filter_columns.append(
ColumnFilter(
"Imputed",
type="checkboxgroup",
label="(Not) Imputed Models",
                info="We impute the performance for models that cannot run on all"
                " datasets due to task or dataset size constraints. We impute with"
                " the performance of a default RandomForest."
                " We add a postfix [X% IMPUTED] to the model name if any results"
                " were imputed. The X% shows the percentage of datasets that were"
                " imputed. In general, imputation understates the model's"
                " performance, penalizing it for not being able to run on all"
                " datasets.",
)
)
return Leaderboard(
value=df_leaderboard,
select_columns=SelectColumns(
default_selection=list(df_leaderboard.columns),
cant_deselect=["Type", "Model"],
label="Select Columns to Display:",
),
        hide_columns=[
            "TypeName",
            "TypeFilter",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensemble",
            "Imputed",
        ],
search_columns=["Model", "TypeName"],
filter_columns=filter_columns,
bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
height=800,
)
@dataclass
class LBMatrixElement:
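    """One cell of the leaderboard matrix (imputation x splits x tasks x datasets)."""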
imputation: str
splits: str
tasks: str
datasets: str
def get_path_to_results(self) -> str:
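        """Return the relative directory that holds the results for this combination."""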
return (
f"imputation_{self.imputation}/"
f"splits_{self.splits}/"
f"tasks_{self.tasks}/"
f"datasets_{self.datasets}/"
)
@dataclass
class LBMatrix:
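    """Axes of the leaderboard matrix plus helpers for display names and blurbs."""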
imputation = ["no", "yes"]
splits = ["all", "lite"]
tasks = ["all", "classification", "regression"]
datasets = ["all", "small", "medium", "tabpfn"]
# TODO: get correct numbers
blurb_map_n_datasets = {
"all": {
"all": 51,
"small": 35,
"medium": 16,
"tabpfn": 33,
},
"classification": {
"all": 30,
"small": 20,
"medium": 10,
"tabpfn": 20,
},
"regression": {
"all": 21,
"small": 15,
"medium": 6,
"tabpfn": 13,
},
}
@staticmethod
def get_name_for_lb(lb_key, lb_value):
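        """Map an internal axis value to its display name (e.g., "lite" -> "Lite")."""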
if lb_key == "imputation":
return "All Models" if lb_value == "no" else "With Imputed Models"
if lb_key == "splits":
return "All Repeats" if lb_value == "all" else "Lite"
if lb_key == "tasks":
match lb_value:
case "all":
return "All Tasks"
case "classification":
return "Classification"
case "regression":
return "Regression"
case _:
                    raise ValueError(f"Unknown tasks value: {lb_value}")
if lb_key == "datasets":
match lb_value:
case "all":
return "All Datasets"
case "small":
return "Small"
case "medium":
return "Medium"
case "tabpfn":
return "TabPFNv2-data"
case _:
                    raise ValueError(f"Unknown datasets value: {lb_value}")
        raise ValueError(f"Unknown leaderboard key: {lb_key}")
def element_to_blurb(self, element: LBMatrixElement) -> str:
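        """Build a one-sentence description of the leaderboard for this matrix element."""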
n_datasets = self.blurb_map_n_datasets[element.tasks][element.datasets]
datasets_name = (
element.datasets if element.datasets != "tabpfn" else "TabPFNv2-compatible"
)
blurb = f"Leaderboard for {n_datasets} datasets ({datasets_name} datasets, {element.tasks} tasks) "
if element.splits == "lite":
blurb += "for one split (1st fold, 1st repeat) "
blurb += "including all "
if element.imputation == "yes":
blurb += "(imputed) "
        blurb += "models."
return blurb
def main():
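    """Build and launch the TabArena Gradio app."""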
css = """
.markdown-text-box {
padding: 4px;
border-radius: 2px;
}
"""
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
demo = gr.Blocks(css=css, js=js_func, title="TabArena")
with demo:
gr.HTML(TITLE)
# -- Introduction
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Column(), gr.Accordion("πŸ“Š Datasets", open=False):
gr.Markdown(
website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box"
)
with gr.Column(), gr.Accordion("πŸ€– Models", open=False):
gr.Markdown(
website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
)
with gr.Row():
with gr.Column(), gr.Accordion("πŸ“ˆ Metrics", open=False):
gr.Markdown(
website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
)
with gr.Column(), gr.Accordion("πŸ“Š Reference Pipeline", open=False):
gr.Markdown(
website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
)
with gr.Row(), gr.Accordion("πŸ“ More Details", open=False):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
with gr.Row(), gr.Accordion("πŸ“™ Citation", open=False):
gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=7,
elem_id="citation-button",
show_copy_button=True,
)
# -- Get all LBs we need:
# all_lbs = _get_lbs()
# # -- LB Overview
# gr.Markdown("## πŸ—ΊοΈ TabArena Overview")
# ordered_lbs = [
# ta,
# ta_clf,
# ta_reg,
# ta_tabicl,
# ta_tabpfn,
# ta_tabpfn_tabicl,
# ta_lite,
# ]
# make_overview_leaderboard(lbs=ordered_lbs)
gr.Markdown("## πŸ† TabArena Leaderboards")
lb_matrix = LBMatrix()
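        # Nested tabs over the leaderboard matrix: imputation -> splits -> tasks -> datasets.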
# Imputation
with gr.Tabs(elem_classes="tab-buttons"):
for impute_id, impute_t in enumerate(lb_matrix.imputation):
impute_t_name = lb_matrix.get_name_for_lb("imputation", impute_t)
with gr.TabItem(
impute_t_name, elem_id="llm-benchmark-tab-table", id=impute_id
):
# Splits
with gr.Tabs(elem_classes="tab-buttons"):
for splits_id, splits_t in enumerate(lb_matrix.splits):
                            splits_t_name = lb_matrix.get_name_for_lb("splits", splits_t)
with gr.TabItem(
                                splits_t_name,
elem_id="llm-benchmark-tab-table",
id=f"{impute_id}_{splits_id}",
):
# Tasks
with gr.Tabs(elem_classes="tab-buttons"):
for tasks_id, tasks_t in enumerate(lb_matrix.tasks):
tasks_t_name = lb_matrix.get_name_for_lb(
"tasks", tasks_t
)
with gr.TabItem(
tasks_t_name,
elem_id="llm-benchmark-tab-table",
id=f"{impute_id}_{splits_id}_{tasks_id}",
):
# Datasets
with gr.Tabs(elem_classes="tab-buttons"):
for (
datasets_id,
datasets_t,
) in enumerate(lb_matrix.datasets):
datasets_t_name = (
lb_matrix.get_name_for_lb(
"datasets", datasets_t
)
)
with gr.TabItem(
datasets_t_name,
elem_id="llm-benchmark-tab-table",
id=f"{impute_id}_{splits_id}_{tasks_id}_{datasets_id}",
):
# Load LB
lb_element = LBMatrixElement(
imputation=lb_matrix.imputation[
impute_id
],
splits=lb_matrix.splits[
splits_id
],
tasks=lb_matrix.tasks[
tasks_id
],
datasets=lb_matrix.datasets[
datasets_id
],
)
lb = LBContainer(
                                                            name=f"{impute_t_name} | {splits_t_name} | {tasks_t_name} | {datasets_t_name}",
base_path_to_results=lb_element.get_path_to_results(),
blurb=lb_matrix.element_to_blurb(
lb_element
),
)
gr.Markdown(
lb.blurb,
elem_classes="markdown-text",
)
make_overview_images(
lb, subset_name=lb.name
)
make_leaderboard(lb)
gr.Image(
lb.get_path_to_winrate_matrix(),
                                                            label=f"Win Rate Matrix [{lb.name}]",
show_label=True,
height=800,
show_share_button=True,
)
with gr.Row(), gr.Accordion("πŸ“‚ Version History", open=False):
gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text")
scheduler = BackgroundScheduler()
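    # Periodic space restart is currently disabled (job commented out below).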
# scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
if __name__ == "__main__":
main()