import json
from pathlib import Path

import gradio as gr
import pandas as pd

TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for open-r1 Models</h1>"""

DESCRIPTION = """
Evaluation of open-r1 models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
"""

BENCHMARKS_TO_SKIP = ["math", "mini_math", "aimo_math_integer_lvl4-5", "mini_math_v2"]
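# Results are discovered under eval_results/; the path parsing below assumes a
# layout like (names illustrative):
#
#     eval_results/<org>/<model>/<revision>/<task>/results_<timestamp>.json
#
# path_parts[1:4] identify the model revision, path_parts[4] the task, and the
# date is recovered from the filename stem with its last three characters
# trimmed.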
					
						
def get_leaderboard_df():
    """Build the raw leaderboard from the JSON files under eval_results/."""
    filepaths = list(Path("eval_results").rglob("*.json"))

    # First pass: collect every model revision so the DataFrame index exists
    # up front. Rows that never receive a score are dropped at the end.
    models = set()
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        model_revision = "_".join(path_parts[1:4])
        models.add(model_revision)

    df = pd.DataFrame(index=list(models))

    # Second pass: parse each results file and fill in one column per task.
    for filepath in filepaths:
        path_parts = Path(filepath).parts
        date = filepath.stem.split("_")[-1][:-3]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4]

        # Skip excluded benchmarks before reading the file.
        if task.lower() in BENCHMARKS_TO_SKIP:
            continue

        df.loc[model_revision, "Date"] = date

        with open(filepath, "r") as file:
            data = json.load(file)
						|  | if task.lower() in ["mixeval", "mixeval_hard"]: | 
					
						
						|  | value = data["overall score (final score)"] | 
					
						
						|  | df.loc[model_revision, f"{task}"] = value | 
					
						
						|  | else: | 
					
						
						|  | first_result_key = next(iter(data["results"])) | 
					
						
						|  |  | 
					
						
						|  | if task.lower() == "truthfulqa": | 
					
						
						|  | value = data["results"][first_result_key]["truthfulqa_mc2"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "ifeval": | 
					
						
						|  | values = 0.0 | 
					
						
						|  | for metric in [ | 
					
						
						|  | "prompt_level_loose", | 
					
						
						|  | "prompt_level_strict", | 
					
						
						|  | "inst_level_strict", | 
					
						
						|  | "inst_level_loose", | 
					
						
						|  | ]: | 
					
						
						|  | values += data["results"][first_result_key][f"{metric}_acc"] | 
					
						
						|  | value = values / 4 | 
					
						
						|  | df.loc[model_revision, f"{task}"] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "mmlu": | 
					
						
						|  | value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["hellaswag", "arc"]: | 
					
						
						|  | value = data["results"][first_result_key]["acc_norm"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "bbh": | 
					
						
						|  | if "all" in data["results"]: | 
					
						
						|  | value = data["results"]["all"]["acc"] | 
					
						
						|  | else: | 
					
						
						|  | value = -100 | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() == "agieval": | 
					
						
						|  | value = data["results"]["all"]["acc_norm"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]: | 
					
						
						|  | value = data["results"]["all"]["qem"] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["mini_math_v2"]: | 
					
						
						|  | for k, v in data["results"].items(): | 
					
						
						|  | if k != "all": | 
					
						
						|  | level = k.split("|")[1].split(":")[-1] | 
					
						
						|  | value = v["qem"] | 
					
						
						|  | df.loc[model_revision, f"{task}_{level}"] = value | 
					
						
						|  |  | 
					
						
						|  | elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]: | 
					
						
						|  | for k, v in data["results"].items(): | 
					
						
						|  | if k != "all" and "_average" not in k: | 
					
						
						|  | version = k.split("|")[1].split(":")[-1] | 
					
						
						|  | value = v["qem"] if "qem" in v else v["score"] | 
					
						
						|  | df.loc[model_revision, f"{task}_{version}"] = value | 
					
						
						|  |  | 
					
						
            elif task.lower() in [
                "aimo_tora_eval_kaggle_medium",
                "aimo_tora_eval_kaggle_hard",
                "aimo_kaggle_fast_eval_hard",
                "aimo_kaggle_tora_medium",
                "aimo_kaggle_tora_hard",
                "aimo_kaggle_tora_medium_extended",
                "aimo_kaggle_tora_hard_extended",
                "aimo_math_integer_lvl4",
                "aimo_math_integer_lvl5",
            ]:
                # Scores are logged on a 0-100 scale; rescale to the 0-1 range
                # used elsewhere. The assignment runs once per result entry,
                # so the last entry wins.
                for k, v in data["results"].items():
                    value = float(v["qem"]) / 100.0
                    df.loc[model_revision, task] = value
						|  | elif task.lower() == "alpaca_eval": | 
					
						
						|  | value = data["results"][first_result_key]["win_rate"] | 
					
						
						|  | df.loc[model_revision, "Alpaca_eval"] = value / 100.0 | 
					
						
						|  | value = data["results"][first_result_key]["length_controlled_winrate"] | 
					
						
						|  | df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0 | 
					
						
						|  | else: | 
					
						
						|  | first_metric_key = next( | 
					
						
						|  | iter(data["results"][first_result_key]) | 
					
						
						|  | ) | 
					
						
						|  | value = data["results"][first_result_key][first_metric_key] | 
					
						
						|  | df.loc[model_revision, task] = float(value) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"]) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df.columns = [c.replace("_level_", "_l") for c in df.columns] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df.columns = [c.replace("aimo_", "") for c in df.columns] | 
					
						
						|  |  | 
					
						
						|  | df = df.reset_index().rename(columns={"index": "Model"}).round(4) | 
					
						
						|  |  | 
					
						
						|  | df["Model"] = df["Model"].apply(lambda x: x.rsplit("_", 1)[0]) | 
					
						
						|  |  | 
					
						
						|  | return df | 
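# Illustrative example (hypothetical payload, for orientation only): an ifeval
# results file whose JSON contains
#     {"results": {"custom|ifeval|0": {"prompt_level_loose_acc": 0.52, ...}}}
# produces one row whose "ifeval" column is the mean of the four accuracy
# variants read above.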
					
						
leaderboard_df = get_leaderboard_df()
def agg_df(df, agg: str = "max"):
    """Aggregate duplicate runs of each model and convert scores to percentages."""
    df = df.copy()

    # Collapse multiple dated runs of the same model into one row.
    df = df.drop("Date", axis=1).groupby("Model").agg(agg).reset_index()

    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert 0-1 scores (including the freshly computed average) to percentages.
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    return df
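# For example, agg_df(leaderboard_df, "mean") collapses all dated runs of each
# model into a single row of mean scores, while the default "max" keeps each
# model's best score per benchmark.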
					
						
def filter_and_search(cols: list[str], search_query: str, agg: str):
    """Aggregate, then filter rows by the search query and columns by selection."""
    df = leaderboard_df
    df = agg_df(df, agg)
    if len(search_query) > 0:
        # Split on ";" and match any term, case-insensitively, as a regex.
        search_terms = search_query.split(";")
        search_terms = [term.strip().lower() for term in search_terms]
        pattern = "|".join(search_terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]

    df = df.dropna(how="all", axis=1)
    if len(cols) > 0:
        # Restrict to the selected columns (plus "Model") and recompute the
        # average over the displayed benchmarks only. Selecting new_cols drops
        # the "Average" column added by agg_df, so re-inserting it below does
        # not collide with an existing column.
        index_cols = list(leaderboard_df.columns[:1])
        new_cols = index_cols + cols
        df = df.copy()[new_cols]
        df = df.copy().dropna(how="all", axis=0, subset=[c for c in df.columns if c in cols])
        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    return df
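# Illustrative call (benchmark names hypothetical):
#
#     filter_and_search(["ifeval", "mmlu"], "qwen; llama", "max")
#
# returns max-aggregated rows for models whose names contain "qwen" or
# "llama", restricted to the two selected benchmarks plus their recomputed
# average.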
					
						
demo = gr.Blocks()

with demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                # "Date" is dropped during aggregation, so only benchmark
                # columns are offered.
                choices=[c for c in leaderboard_df.columns[1:] if c not in ("Average", "Date")],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                height=1000,
                column_widths=[400, 110] + [(260 + len(c)) for c in leaderboard_df.columns[1:]],
            )

    # Re-render the table whenever the column selection, aggregation, or
    # search query changes.
    cols_bar.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    agg.change(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])
    search_bar.submit(filter_and_search, inputs=[cols_bar, search_bar, agg], outputs=[leaderboard_table])

demo.launch()