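"""Gradio app for the TabArena leaderboard."""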
from __future__ import annotations
import zipfile
from dataclasses import dataclass
from pathlib import Path
import gradio as gr
import pandas as pd
import website_texts
from apscheduler.schedulers.background import BackgroundScheduler
from constants import Constants, model_type_emoji
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from website_texts import (
ABOUT_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
TITLE,
VERSION_HISTORY_BUTTON_TEXT,
)
def get_model_family(model_name: str) -> str:
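    """Map a model name to its family via prefix matching.

    Families: reference pipeline, neural network, tree-based, foundational,
    baseline, or other.
    """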
prefixes_mapping = {
Constants.reference: ["AutoGluon"],
Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
Constants.baseline: ["KNN", "LR"],
}
for method_type, prefixes in prefixes_mapping.items():
for prefix in prefixes:
if prefix.lower() in model_name.lower():
return method_type
return Constants.other
@dataclass
class LBContainer:
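    """One leaderboard subset: display name, results directory, and description blurb."""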
name: str
base_path_to_results: str
blurb: str
@property
    def _base_path(self) -> Path:
return Path(__file__).parent / "data" / self.base_path_to_results
def load_df_leaderboard(self) -> pd.DataFrame:
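        """Load the pre-computed leaderboard table for this subset."""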
df = pd.read_csv(self._base_path / "website_leaderboard.csv")
df = df.rename(columns={"1#": "#"})
return df
def _handle_img_zip(self, img_name: str) -> str:
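        """Unzip `<img_name>.png.zip` and return the path to the extracted PNG."""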
_base_path = self._base_path / img_name
zip_path = _base_path.with_suffix(".png.zip")
img_path = _base_path.with_suffix(".png")
with zipfile.ZipFile(zip_path, "r") as zipf:
zipf.extractall(img_path.parent)
return str(img_path)
def get_path_to_tuning_impact_elo(self) -> str:
return self._handle_img_zip("tuning-impact-elo")
def get_path_to_pareto_front_improvability_vs_time_infer(self) -> str:
return self._handle_img_zip("pareto_front_improvability_vs_time_infer")
def get_path_to_pareto_n_configs_imp(self) -> str:
return self._handle_img_zip("pareto_n_configs_imp")
def get_path_to_winrate_matrix(self) -> str:
return self._handle_img_zip("winrate_matrix")
def make_overview_images(lb: LBContainer, subset_name: str):
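    """Render the overview figures for one leaderboard subset.

    Shows the Elo overview, the inference-time Pareto front, and the tuning
    trajectories.
    """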
# Main Figure
gr.Image(
lb.get_path_to_tuning_impact_elo(),
label=f"Leaderboard Overview [{subset_name}]",
show_label=True,
height=500,
show_share_button=True,
)
with gr.Row():
with gr.Column(scale=1):
gr.Image(
value=lb.get_path_to_pareto_front_improvability_vs_time_infer(),
label=f"Inference Time Pareto Front [{subset_name}]",
height=400,
show_label=True,
show_share_button=True,
)
with gr.Column(scale=1):
gr.Image(
value=lb.get_path_to_pareto_n_configs_imp(),
label=f"Tuning Trajectories [{subset_name}]",
height=400,
show_label=True,
show_share_button=True,
)
def make_overview_leaderboard(lbs: list[LBContainer]):
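    """Build a table ranking all models (by Elo) across the given leaderboards.

    The reference pipeline is excluded, models missing from a leaderboard get
    "--", and the top-3 ranks per column are highlighted with medal colors.
    """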
# Create column per LB
    dfs = {lb.name: lb.load_df_leaderboard() for lb in lbs}
    all_models = {
        m.split("[")[0].strip()
        for df in dfs.values()
        for m in df[~df["TypeName"].isin(["Reference Pipeline"])]["Model"]
        .unique()
        .tolist()
    }
full_df = None
for lb in lbs:
        df = dfs[lb.name].copy()
df = df[~df["TypeName"].isin(["Reference Pipeline"])]
df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int)
df = df.sort_values(by=lb.name, ascending=True)
        # Adding medal indicators does not work: converting the column to
        # strings breaks the numeric sorting.
# df[lb.name] = df[lb.name].astype(str)
# df[lb.name] = df[lb.name].replace({
# "1": "πŸ₯‡ 1",
# "2": "πŸ₯ˆ 2",
# "3": "πŸ₯‰ 3",
# }
# )
df = df[["Type", "Model", lb.name]]
        # Remove the "[X% IMPUTED]" postfix from model names.
df["Model"] = (
df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string")
)
if full_df is None:
# TODO: add support in case a model did not run on the full LB.
assert all_models.difference(set(df["Model"].unique())) == set()
full_df = df
else:
df = df[["Model", lb.name]]
df_models = set(df["Model"].unique())
missing_models = all_models.difference(df_models)
if missing_models:
missing_models_df = pd.DataFrame(
[[mm, "--"] for mm in missing_models],
columns=["Model", lb.name],
)
df = pd.concat([df, missing_models_df], ignore_index=True)
df["Model"] = df["Model"].astype("string")
# Merge
full_df = full_df.merge(df, how="left", on="Model", validate="1:1")
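    # Gold, silver, and bronze background colors for ranks 1-3.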
medal_colors = ["#998A00", "#808080", "#8C5520"]
    # Highlight the top-3 ranks in each leaderboard column with medal colors.
def highlight_top3(col):
styles = [""] * len(col)
for index_i in range(len(col)):
if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3:
styles[index_i] = (
f"background-color: {medal_colors[col.iloc[index_i] - 1]};"
)
return styles
styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs])
return gr.DataFrame(
styler,
pinned_columns=2,
interactive=False,
show_search="search",
label="The ranking of all models (with imputation) across various leaderboards.",
)
def make_leaderboard(lb: LBContainer) -> Leaderboard:
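    """Build the interactive leaderboard table for one subset.

    Adds column selection plus filters for model type, tuning mode, and
    imputation status.
    """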
df_leaderboard = lb.load_df_leaderboard()
# -- Add filters
    df_leaderboard["TypeFilter"] = df_leaderboard["TypeName"].apply(
        lambda m: f"{m} {model_type_emoji[m]}"
    )
df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)")
df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)")
df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith(
"(tuned + ensemble)"
) | df_leaderboard["Model"].str.endswith("(4h)")
filter_columns = [
        ColumnFilter("TypeFilter", type="checkboxgroup", label="πŸ€– Model Types"),
ColumnFilter("Only Default", type="boolean", default=False),
ColumnFilter("Only Tuned", type="boolean", default=False),
ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
]
    # Add an imputation filter if any results were imputed.
if any(df_leaderboard["Imputed"]):
df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
{
True: "Imputed",
False: "Not Imputed",
}
)
filter_columns.append(
ColumnFilter(
"Imputed",
type="checkboxgroup",
label="(Not) Imputed Models",
                info="We impute the performance for models that cannot run on all"
                " datasets due to task or dataset size constraints. We impute with"
                " the performance of a default RandomForest."
                " We add a postfix [X% IMPUTED] to the model name if any results"
                " were imputed. The X% shows the percentage of datasets that were"
                " imputed. In general, imputation understates the model's"
                " performance, penalizing it for not being able to run on all"
                " datasets.",
)
)
return Leaderboard(
value=df_leaderboard,
select_columns=SelectColumns(
default_selection=list(df_leaderboard.columns),
cant_deselect=["Type", "Model"],
label="Select Columns to Display:",
),
        hide_columns=[
            "TypeName",
            "TypeFilter",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensemble",
            "Imputed",
        ],
search_columns=["Model", "TypeName"],
filter_columns=filter_columns,
bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
height=800,
)
@dataclass
class LBMatrixElement:
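    """One cell of the leaderboard matrix (imputation x splits x tasks x datasets)."""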
imputation: str
splits: str
tasks: str
datasets: str
def get_path_to_results(self) -> str:
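        """Return the relative directory that holds the results for this combination."""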
return (
f"imputation_{self.imputation}/"
f"splits_{self.splits}/"
f"tasks_{self.tasks}/"
f"datasets_{self.datasets}/"
)
@dataclass
class LBMatrix:
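    """Axes of the leaderboard matrix plus helpers for display names and blurbs."""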
imputation = ["no", "yes"]
splits = ["all", "lite"]
tasks = ["all", "classification", "regression"]
datasets = ["all", "small", "medium", "tabpfn"]
# TODO: get correct numbers
blurb_map_n_datasets = {
"all": {
"all": 51,
"small": 35,
"medium": 16,
"tabpfn": 33,
},
"classification": {
"all": 30,
"small": 20,
"medium": 10,
"tabpfn": 20,
},
"regression": {
"all": 21,
"small": 15,
"medium": 6,
"tabpfn": 13,
},
}
@staticmethod
def get_name_for_lb(lb_key, lb_value):
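        """Map an internal axis value to its display name (e.g., "lite" -> "Lite")."""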
if lb_key == "imputation":
return "All Models" if lb_value == "no" else "With Imputed Models"
if lb_key == "splits":
return "All Repeats" if lb_value == "all" else "Lite"
if lb_key == "tasks":
match lb_value:
case "all":
return "All Tasks"
case "classification":
return "Classification"
case "regression":
return "Regression"
case _:
                    raise ValueError(f"Unknown tasks value: {lb_value}")
if lb_key == "datasets":
match lb_value:
case "all":
return "All Datasets"
case "small":
return "Small"
case "medium":
return "Medium"
case "tabpfn":
return "TabPFNv2-data"
case _:
                    raise ValueError(f"Unknown datasets value: {lb_value}")
        raise ValueError(f"Unknown leaderboard key: {lb_key}")
def element_to_blurb(self, element: LBMatrixElement) -> str:
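        """Build a one-sentence description of the leaderboard for this matrix element."""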
n_datasets = self.blurb_map_n_datasets[element.tasks][element.datasets]
datasets_name = (
element.datasets if element.datasets != "tabpfn" else "TabPFNv2-compatible"
)
blurb = f"Leaderboard for {n_datasets} datasets ({datasets_name} datasets, {element.tasks} tasks) "
if element.splits == "lite":
blurb += "for one split (1st fold, 1st repeat) "
blurb += "including all "
if element.imputation == "yes":
blurb += "(imputed) "
        blurb += "models."
return blurb
def main():
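    """Build and launch the TabArena Gradio app."""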
css = """
.markdown-text-box {
padding: 4px;
border-radius: 2px;
}
"""
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
demo = gr.Blocks(css=css, js=js_func, title="TabArena")
with demo:
gr.HTML(TITLE)
# -- Introduction
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Column(), gr.Accordion("πŸ“Š Datasets", open=False):
gr.Markdown(
website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box"
)
with gr.Column(), gr.Accordion("πŸ€– Models", open=False):
gr.Markdown(
website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
)
with gr.Row():
with gr.Column(), gr.Accordion("πŸ“ˆ Metrics", open=False):
gr.Markdown(
website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
)
with gr.Column(), gr.Accordion("πŸ“Š Reference Pipeline", open=False):
gr.Markdown(
website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
)
with gr.Row(), gr.Accordion("πŸ“ More Details", open=False):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
with gr.Row(), gr.Accordion("πŸ“™ Citation", open=False):
gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=7,
elem_id="citation-button",
show_copy_button=True,
)
# -- Get all LBs we need:
# all_lbs = _get_lbs()
# # -- LB Overview
# gr.Markdown("## πŸ—ΊοΈ TabArena Overview")
# ordered_lbs = [
# ta,
# ta_clf,
# ta_reg,
# ta_tabicl,
# ta_tabpfn,
# ta_tabpfn_tabicl,
# ta_lite,
# ]
# make_overview_leaderboard(lbs=ordered_lbs)
gr.Markdown("## πŸ† TabArena Leaderboards")
lb_matrix = LBMatrix()
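        # Nested tabs over the leaderboard matrix: imputation -> splits -> tasks -> datasets.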
# Imputation
with gr.Tabs(elem_classes="tab-buttons"):
for impute_id, impute_t in enumerate(lb_matrix.imputation):
impute_t_name = lb_matrix.get_name_for_lb("imputation", impute_t)
with gr.TabItem(
impute_t_name, elem_id="llm-benchmark-tab-table", id=impute_id
):
# Splits
with gr.Tabs(elem_classes="tab-buttons"):
for splits_id, splits_t in enumerate(lb_matrix.splits):
                            splits_t_name = lb_matrix.get_name_for_lb("splits", splits_t)
with gr.TabItem(
                                splits_t_name,
elem_id="llm-benchmark-tab-table",
id=f"{impute_id}_{splits_id}",
):
# Tasks
with gr.Tabs(elem_classes="tab-buttons"):
for tasks_id, tasks_t in enumerate(lb_matrix.tasks):
tasks_t_name = lb_matrix.get_name_for_lb(
"tasks", tasks_t
)
with gr.TabItem(
tasks_t_name,
elem_id="llm-benchmark-tab-table",
id=f"{impute_id}_{splits_id}_{tasks_id}",
):
# Datasets
with gr.Tabs(elem_classes="tab-buttons"):
for (
datasets_id,
datasets_t,
) in enumerate(lb_matrix.datasets):
datasets_t_name = (
lb_matrix.get_name_for_lb(
"datasets", datasets_t
)
)
with gr.TabItem(
datasets_t_name,
elem_id="llm-benchmark-tab-table",
id=f"{impute_id}_{splits_id}_{tasks_id}_{datasets_id}",
):
# Load LB
lb_element = LBMatrixElement(
imputation=lb_matrix.imputation[
impute_id
],
splits=lb_matrix.splits[
splits_id
],
tasks=lb_matrix.tasks[
tasks_id
],
datasets=lb_matrix.datasets[
datasets_id
],
)
lb = LBContainer(
                                                            name=f"{impute_t_name} | {splits_t_name} | {tasks_t_name} | {datasets_t_name}",
base_path_to_results=lb_element.get_path_to_results(),
blurb=lb_matrix.element_to_blurb(
lb_element
),
)
gr.Markdown(
lb.blurb,
elem_classes="markdown-text",
)
make_overview_images(
lb, subset_name=lb.name
)
make_leaderboard(lb)
gr.Image(
lb.get_path_to_winrate_matrix(),
                                                            label=f"Win Rate Matrix [{lb.name}]",
show_label=True,
height=800,
show_share_button=True,
)
with gr.Row(), gr.Accordion("πŸ“‚ Version History", open=False):
gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text")
scheduler = BackgroundScheduler()
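    # Periodic space restart is currently disabled (job commented out below).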
# scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
if __name__ == "__main__":
main()