import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""

DESCRIPTION = """
Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
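# Results are discovered under eval_results/; the path parsing below assumes a
# layout like (names illustrative):
#
#     eval_results/<org>/<model>/<revision>/<task>/results_<timestamp>.json
#
# path_parts[1:4] identify the model revision, path_parts[4] the task, and the
# date is recovered from the filename stem with its last three characters
# trimmed.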
					
						
def get_leaderboard_df():
    """Build the raw leaderboard from the JSON files under eval_results/."""
    filepaths = list(Path("eval_results").rglob("*.json"))

    # First pass: collect every model revision so the DataFrame index exists
    # up front. Rows that never receive a score are dropped at the end.
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    df = pd.DataFrame(index=list(models))

    # Second pass: parse each results file and fill in one column per task.
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4]

        # Skip excluded benchmarks before reading the file.
        if task.lower() in BENCHMARKS_TO_SKIP:
            continue

        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
						|  | if task.lower() in ["mixeval", "mixeval_hard"]: | 
					
						
						|  | value = data["overall score (final score)"] | 
					
						
						|  | df.loc[model_revision, f"{task}"] = value | 
					
						
						|  | else: | 
					
						
						|  | first_result_key = next(iter(data["results"])) | 
					
						
						|  |  | 
					
						
						|  | if task.lower() == "truthfulqa": | 
					
						
						|  | value = data["results"][first_result_key]["truthfulqa_mc2"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "ifeval": | 
					
						
						|  | values = 0.0 | 
					
						
						|  | for metric in [ | 
					
						
						|  | "prompt_level_loose", | 
					
						
						|  | "prompt_level_strict", | 
					
						
						|  | "inst_level_strict", | 
					
						
						|  | "inst_level_loose", | 
					
						
						|  | ]: | 
					
						
						|  | values += data["results"][first_result_key][f"{metric}_acc"] | 
					
						
						|  | value = values / 4 | 
					
						
						|  | df.loc[model_revision, f"{task}"] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "mmlu": | 
					
						
						|  | value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["hellaswag", "arc"]: | 
					
						
						|  | value = data["results"][first_result_key]["acc_norm"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "bbh": | 
					
						
						|  | if "all" in data["results"]: | 
					
						
						|  | value = data["results"]["all"]["acc"] | 
					
						
						|  | else: | 
					
						
						|  | value = -100 | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "agieval": | 
					
						
						|  | value = data["results"]["all"]["acc_norm"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]: | 
					
						
						|  | value = data["results"]["all"]["qem"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["mini_math_v2"]: | 
					
						
						|  | for k, v in data["results"].items(): | 
					
						
						|  | if k != "all": | 
					
						
						|  | level = k.split("|")[1].split(":")[-1] | 
					
						
						|  | value = v["qem"] | 
					
						
						|  | df.loc[model_revision, f"{task}_{level}"] = value | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]: | 
					
						
						|  | for k, v in data["results"].items(): | 
					
						
						|  | if k != "all" and "_average" not in k: | 
					
						
						|  | version = k.split("|")[1].split(":")[-1] | 
					
						
						|  | value = v["qem"] if "qem" in v else v["score"] | 
					
						
						|  | df.loc[model_revision, f"{task}_{version}"] = value | 
					
						
						|  |  | 
					
						
            elif task.lower() in [
                "aimo_tora_eval_kaggle_medium",
                "aimo_tora_eval_kaggle_hard",
                "aimo_kaggle_fast_eval_hard",
                "aimo_kaggle_tora_medium",
                "aimo_kaggle_tora_hard",
                "aimo_kaggle_tora_medium_extended",
                "aimo_kaggle_tora_hard_extended",
                "aimo_math_integer_lvl4",
                "aimo_math_integer_lvl5",
            ]:
                # Scores are logged on a 0-100 scale; rescale to the 0-1 range
                # used elsewhere. The assignment runs once per result entry,
                # so the last entry wins.
                for k, v in data["results"].items():
                    value = float(v["qem"]) / 100.0
                    df.loc[model_revision, task] = value
						|  | elif task.lower() == "alpaca_eval": | 
					
						
						|  | value = data["results"][first_result_key]["win_rate"] | 
					
						
						|  | df.loc[model_revision, "Alpaca_eval"] = value / 100.0 | 
					
						
						|  | value = data["results"][first_result_key]["length_controlled_winrate"] | 
					
						
						|  | df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0 | 
					
						
						|  | else: | 
					
						
						|  | first_metric_key = next( | 
					
						
						|  | iter(data["results"][first_result_key]) | 
					
						
						|  | ) | 
					
						
						|  | value = data["results"][first_result_key][first_metric_key] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"]) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df.columns = [c.replace("_level_", "_l") for c in df.columns] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df.columns = [c.replace("aimo_", "") for c in df.columns] | 
					
						
						|  |  | 
					
						
						|  | df = df.reset_index().rename(columns={"index": "Model"}).round(4) | 
					
						
						|  |  | 
					
						
						|  | df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0]) | 
					
						
						|  |  | 
					
						
						|  | return df | 
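# Illustrative example (hypothetical payload, for orientation only): an ifeval
# results file whose JSON contains
#     {"results": {"custom|ifeval|0": {"prompt_level_loose_acc": 0.52, ...}}}
# produces one row whose "ifeval" column is the mean of the four accuracy
# variants read above.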
					
						
leaderboard_df = get_leaderboard_df()
def agg_df(df, agg: str = "max"):
    """Aggregate duplicate runs of each model and convert scores to percentages."""
    df = df.copy()

    # Collapse multiple dated runs of the same model into one row.
    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()

    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert 0-1 scores (including the freshly computed average) to percentages.
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    return df
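# For example, agg_df(leaderboard_df, "mean") collapses all dated runs of each
# model into a single row of mean scores, while the default "max" keeps each
# model's best score per benchmark.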
					
						
def filter_and_search(cols: list[str], search_query: str, agg: str):
    """Aggregate, then filter rows by the search query and columns by selection."""
    df = leaderboard_df
    df = agg_df(df, agg)
    if len(search_query) > 0:
        # Split on ";" and match any term, case-insensitively, as a regex.
        search_terms = search_query.split(";")
        search_terms = [term.strip().lower() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]

    df = df.dropna(how="all", axis=1)
    if len(cols) > 0:
        # Restrict to the selected columns (plus "Model") and recompute the
        # average over the displayed benchmarks only. Selecting new_cols drops
        # the "Average" column added by agg_df, so re-inserting it below does
        # not collide with an existing column.
        index_cols = list(leaderboard_df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    return df
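# Illustrative call (benchmark names hypothetical):
#
#     filter_and_search(["ifeval", "mmlu"], "qwen; llama", "max")
#
# returns max-aggregated rows for models whose names contain "qwen" or
# "llama", restricted to the two selected benchmarks plus their recomputed
# average.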
					
						
demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                # "Date" is dropped during aggregation, so only benchmark
                # columns are offered.
                choices=[c for c in leaderboard_df.columns[1:] if c not in ("Average", "Date")],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
            )

    # Re-render the table whenever the column selection, aggregation, or
    # search query changes.
    cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    agg.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])

demo.launch()