from __future__ import annotations

import zipfile
from dataclasses import dataclass
from pathlib import Path

import gradio as gr
import pandas as pd
import website_texts
from apscheduler.schedulers.background import BackgroundScheduler
from constants import Constants, model_type_emoji
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from website_texts import (
    ABOUT_TEXT,
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
    VERSION_HISTORY_BUTTON_TEXT,
)
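

# Maps a model name to its family (reference pipeline, neural network, tree-based,
# foundational, or baseline) based on known model-name prefixes.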
def get_model_family(model_name: str) -> str:
    prefixes_mapping = {
        Constants.reference: ["AutoGluon"],
        Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"],
        Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"],
        Constants.foundational: ["TABDPT", "TABICL", "TABPFN"],
        Constants.baseline: ["KNN", "LR"],
    }
    for method_type, prefixes in prefixes_mapping.items():
        for prefix in prefixes:
            if prefix.lower() in model_name.lower():
                return method_type
    return Constants.other
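

# Container for one leaderboard subset: its display name, the path to its results,
# and a short description. Figures are stored as zipped PNGs and extracted on demand.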
@dataclass
class LBContainer:
    name: str
    base_path_to_results: str
    blurb: str

    @property
    def _base_path(self) -> Path:
        return Path(__file__).parent / "data" / self.base_path_to_results

    def load_df_leaderboard(self) -> pd.DataFrame:
        df = pd.read_csv(self._base_path / "website_leaderboard.csv")
        df = df.rename(columns={"1#": "#"})
        return df

    def _handle_img_zip(self, img_name: str) -> str:
        # Extract the zipped PNG next to its archive and return the image path.
        _base_path = self._base_path / img_name
        zip_path = _base_path.with_suffix(".png.zip")
        img_path = _base_path.with_suffix(".png")
        with zipfile.ZipFile(zip_path, "r") as zipf:
            zipf.extractall(img_path.parent)
        return str(img_path)

    def get_path_to_tuning_impact_elo(self) -> str:
        return self._handle_img_zip("tuning-impact-elo")

    def get_path_to_pareto_front_improvability_vs_time_infer(self) -> str:
        return self._handle_img_zip("pareto_front_improvability_vs_time_infer")

    def get_path_to_pareto_n_configs_imp(self) -> str:
        return self._handle_img_zip("pareto_n_configs_imp")

    def get_path_to_winrate_matrix(self) -> str:
        return self._handle_img_zip("winrate_matrix")
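

# Renders the overview figures for a leaderboard subset: the main tuning-impact Elo
# figure plus the inference-time Pareto front and tuning-trajectory plots.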
def make_overview_images(lb: LBContainer, subset_name):
    # Main Figure
    gr.Image(
        lb.get_path_to_tuning_impact_elo(),
        label=f"Leaderboard Overview [{subset_name}]",
        show_label=True,
        height=500,
        show_share_button=True,
    )
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                value=lb.get_path_to_pareto_front_improvability_vs_time_infer(),
                label=f"Inference Time Pareto Front [{subset_name}]",
                height=400,
                show_label=True,
                show_share_button=True,
            )
        with gr.Column(scale=1):
            gr.Image(
                value=lb.get_path_to_pareto_n_configs_imp(),
                label=f"Tuning Trajectories [{subset_name}]",
                height=400,
                show_label=True,
                show_share_button=True,
            )
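

# Builds a cross-leaderboard ranking table with one rank column per leaderboard and
# the top-3 ranks highlighted. Currently unused: the call in main() is commented out.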
def make_overview_leaderboard(lbs: list[LBContainer]):
    dfs = [lb.load_df_leaderboard() for lb in lbs]

    # All non-reference models across leaderboards, with the imputation postfix removed.
    all_models = {
        m.split("[")[0].strip()
        for df_lb in dfs
        for m in df_lb[~df_lb["TypeName"].isin(["Reference Pipeline"])]["Model"]
        .unique()
        .tolist()
    }

    # Create a rank column per leaderboard.
    full_df = None
    for lb, df in zip(lbs, dfs):
        df = df.copy()
        df = df[~df["TypeName"].isin(["Reference Pipeline"])]
        df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int)
        df = df.sort_values(by=lb.name, ascending=True)
        # Adding medal indicators does not work: it turns the column into strings,
        # which then no longer sort correctly.
        # df[lb.name] = df[lb.name].astype(str)
        # df[lb.name] = df[lb.name].replace({
        #     "1": "🥇 1",
        #     "2": "🥈 2",
        #     "3": "🥉 3",
        # })
        df = df[["Type", "Model", lb.name]]
        # Remove the imputed postfix from model names.
        df["Model"] = (
            df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string")
        )
        if full_df is None:
            # TODO: add support in case a model did not run on the full LB.
            assert all_models.difference(set(df["Model"].unique())) == set()
            full_df = df
        else:
            df = df[["Model", lb.name]]
            df_models = set(df["Model"].unique())
            missing_models = all_models.difference(df_models)
            if missing_models:
                missing_models_df = pd.DataFrame(
                    [[mm, "--"] for mm in missing_models],
                    columns=["Model", lb.name],
                )
                df = pd.concat([df, missing_models_df], ignore_index=True)
                df["Model"] = df["Model"].astype("string")
            # Merge on the model name.
            full_df = full_df.merge(df, how="left", on="Model", validate="1:1")

    medal_colors = ["#998A00", "#808080", "#8C5520"]

    # Highlight the top-3 ranks in each leaderboard column with medal colors.
    def highlight_top3(col):
        styles = [""] * len(col)
        for index_i in range(len(col)):
            if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3:
                styles[index_i] = (
                    f"background-color: {medal_colors[col.iloc[index_i] - 1]};"
                )
        return styles

    styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs])
    return gr.DataFrame(
        styler,
        pinned_columns=2,
        interactive=False,
        show_search="search",
        label="The ranking of all models (with imputation) across various leaderboards.",
    )
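

# Builds the interactive Leaderboard component for one subset, including the
# model-type, default/tuned/ensemble, and imputation filters.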
def make_leaderboard(lb: LBContainer) -> Leaderboard:
    df_leaderboard = lb.load_df_leaderboard()

    # -- Add filters
    df_leaderboard["TypeFilter"] = df_leaderboard["TypeName"].apply(
        lambda m: f"{m} {model_type_emoji[m]}"
    )
    df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)")
    df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)")
    df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith(
        "(tuned + ensemble)"
    ) | df_leaderboard["Model"].str.endswith("(4h)")

    filter_columns = [
        ColumnFilter("TypeFilter", type="checkboxgroup", label="Model Types"),
        ColumnFilter("Only Default", type="boolean", default=False),
        ColumnFilter("Only Tuned", type="boolean", default=False),
        ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False),
    ]

    # Add the imputation filter if any results were imputed.
    if any(df_leaderboard["Imputed"]):
        df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace(
            {
                True: "Imputed",
                False: "Not Imputed",
            }
        )
        filter_columns.append(
            ColumnFilter(
                "Imputed",
                type="checkboxgroup",
                label="(Not) Imputed Models",
                info="We impute the performance of models that cannot run on all"
                " datasets due to task or dataset size constraints. We impute with"
                " the performance of a default RandomForest."
                " We add a postfix [X% IMPUTED] to the model if any results were"
                " imputed, where X% is the percentage of datasets that were"
                " imputed. In general, imputation negatively represents the"
                " model's performance, punishing the model for not being able to"
                " run on all datasets.",
            )
        )

    return Leaderboard(
        value=df_leaderboard,
        select_columns=SelectColumns(
            default_selection=list(df_leaderboard.columns),
            cant_deselect=["Type", "Model"],
            label="Select Columns to Display:",
        ),
        hide_columns=[
            "TypeName",
            "TypeFilter",
            "RefModel",
            "Only Default",
            "Only Tuned",
            "Only Tuned + Ensemble",
            "Imputed",
        ],
        search_columns=["Model", "TypeName"],
        filter_columns=filter_columns,
        bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):",
        height=800,
    )
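

# The leaderboard matrix: each combination of imputation setting, split protocol,
# task type, and dataset subset maps to one results directory under data/.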
@dataclass
class LBMatrixElement:
    imputation: str
    splits: str
    tasks: str
    datasets: str

    def get_path_to_results(self) -> str:
        return (
            f"imputation_{self.imputation}/"
            f"splits_{self.splits}/"
            f"tasks_{self.tasks}/"
            f"datasets_{self.datasets}/"
        )


class LBMatrix:
    imputation = ["no", "yes"]
    splits = ["all", "lite"]
    tasks = ["all", "classification", "regression"]
    datasets = ["all", "small", "medium", "tabpfn"]

    # TODO: get correct numbers
    blurb_map_n_datasets = {
        "all": {
            "all": 51,
            "small": 35,
            "medium": 16,
            "tabpfn": 33,
        },
        "classification": {
            "all": 30,
            "small": 20,
            "medium": 10,
            "tabpfn": 20,
        },
        "regression": {
            "all": 21,
            "small": 15,
            "medium": 6,
            "tabpfn": 13,
        },
    }

    @staticmethod
    def get_name_for_lb(lb_key: str, lb_value: str) -> str:
        if lb_key == "imputation":
            return "All Models" if lb_value == "no" else "With Imputed Models"
        if lb_key == "splits":
            return "All Repeats" if lb_value == "all" else "Lite"
        if lb_key == "tasks":
            match lb_value:
                case "all":
                    return "All Tasks"
                case "classification":
                    return "Classification"
                case "regression":
                    return "Regression"
                case _:
                    raise ValueError()
        if lb_key == "datasets":
            match lb_value:
                case "all":
                    return "All Datasets"
                case "small":
                    return "Small"
                case "medium":
                    return "Medium"
                case "tabpfn":
                    return "TabPFNv2-data"
                case _:
                    raise ValueError()
        raise ValueError()

    def element_to_blurb(self, element: LBMatrixElement) -> str:
        n_datasets = self.blurb_map_n_datasets[element.tasks][element.datasets]
        datasets_name = (
            element.datasets if element.datasets != "tabpfn" else "TabPFNv2-compatible"
        )
        blurb = f"Leaderboard for {n_datasets} datasets ({datasets_name} datasets, {element.tasks} tasks) "
        if element.splits == "lite":
            blurb += "for one split (1st fold, 1st repeat) "
        blurb += "including all "
        if element.imputation == "yes":
            blurb += "(imputed) "
        blurb += "models."
        return blurb
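

# Assembles the Gradio app: introduction accordions, the nested leaderboard tabs,
# and the citation and version-history sections.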
def main():
    css = """
    .markdown-text-box {
        padding: 4px;
        border-radius: 2px;
    }
    """

    # Force the dark theme via the __theme URL parameter on page load.
    js_func = """
    function refresh() {
        const url = new URL(window.location);
        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """

    demo = gr.Blocks(css=css, js=js_func, title="TabArena")
    with demo:
        gr.HTML(TITLE)

        # -- Introduction
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column(), gr.Accordion("Datasets", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box"
                )
            with gr.Column(), gr.Accordion("Models", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box"
                )
        with gr.Row():
            with gr.Column(), gr.Accordion("Metrics", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box"
                )
            with gr.Column(), gr.Accordion("Reference Pipeline", open=False):
                gr.Markdown(
                    website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box"
                )
        with gr.Row(), gr.Accordion("More Details", open=False):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box")
        with gr.Row(), gr.Accordion("Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=7,
                elem_id="citation-button",
                show_copy_button=True,
            )

        # -- Get all LBs we need:
        # all_lbs = _get_lbs()
        # # -- LB Overview
        # gr.Markdown("## TabArena Overview")
        # ordered_lbs = [
        #     ta,
        #     ta_clf,
        #     ta_reg,
        #     ta_tabicl,
        #     ta_tabpfn,
        #     ta_tabpfn_tabicl,
        #     ta_lite,
        # ]
        # make_overview_leaderboard(lbs=ordered_lbs)

        gr.Markdown("## TabArena Leaderboards")
        lb_matrix = LBMatrix()
        # Nested tabs: imputation -> splits -> tasks -> datasets.
        with gr.Tabs(elem_classes="tab-buttons"):
            for impute_id, impute_t in enumerate(lb_matrix.imputation):
                impute_t_name = lb_matrix.get_name_for_lb("imputation", impute_t)
                with gr.TabItem(
                    impute_t_name, elem_id="llm-benchmark-tab-table", id=impute_id
                ):
                    # Splits
                    with gr.Tabs(elem_classes="tab-buttons"):
                        for splits_id, splits_t in enumerate(lb_matrix.splits):
                            splits_t_name = lb_matrix.get_name_for_lb(
                                "splits", splits_t
                            )
                            with gr.TabItem(
                                splits_t_name,
                                elem_id="llm-benchmark-tab-table",
                                id=f"{impute_id}_{splits_id}",
                            ):
                                # Tasks
                                with gr.Tabs(elem_classes="tab-buttons"):
                                    for tasks_id, tasks_t in enumerate(
                                        lb_matrix.tasks
                                    ):
                                        tasks_t_name = lb_matrix.get_name_for_lb(
                                            "tasks", tasks_t
                                        )
                                        with gr.TabItem(
                                            tasks_t_name,
                                            elem_id="llm-benchmark-tab-table",
                                            id=f"{impute_id}_{splits_id}_{tasks_id}",
                                        ):
                                            # Datasets
                                            with gr.Tabs(elem_classes="tab-buttons"):
                                                for datasets_id, datasets_t in enumerate(
                                                    lb_matrix.datasets
                                                ):
                                                    datasets_t_name = (
                                                        lb_matrix.get_name_for_lb(
                                                            "datasets", datasets_t
                                                        )
                                                    )
                                                    with gr.TabItem(
                                                        datasets_t_name,
                                                        elem_id="llm-benchmark-tab-table",
                                                        id=f"{impute_id}_{splits_id}_{tasks_id}_{datasets_id}",
                                                    ):
                                                        # Load the leaderboard for this subset.
                                                        lb_element = LBMatrixElement(
                                                            imputation=impute_t,
                                                            splits=splits_t,
                                                            tasks=tasks_t,
                                                            datasets=datasets_t,
                                                        )
                                                        lb = LBContainer(
                                                            name=f"{impute_t_name} | {splits_t_name} | {tasks_t_name} | {datasets_t_name}",
                                                            base_path_to_results=lb_element.get_path_to_results(),
                                                            blurb=lb_matrix.element_to_blurb(
                                                                lb_element
                                                            ),
                                                        )
                                                        gr.Markdown(
                                                            lb.blurb,
                                                            elem_classes="markdown-text",
                                                        )
                                                        make_overview_images(
                                                            lb, subset_name=lb.name
                                                        )
                                                        make_leaderboard(lb)
                                                        gr.Image(
                                                            lb.get_path_to_winrate_matrix(),
                                                            label=f"Win-Rate Matrix [{lb.name}]",
                                                            show_label=True,
                                                            height=800,
                                                            show_share_button=True,
                                                        )

        with gr.Row(), gr.Accordion("Version History", open=False):
            gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text")

    scheduler = BackgroundScheduler()
    # scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()
    demo.queue(default_concurrency_limit=40).launch()


if __name__ == "__main__":
    main()