File size: 10,836 Bytes
e2e6875 25557b5 05c90f4 6679087 25557b5 7379857 38f4369 ddc25db 7379857 38f4369 25557b5 c660995 25557b5 3caeacd 05c90f4 c660995 25557b5 05c90f4 023a289 3caeacd 05c90f4 6679087 e29ab28 6679087 99aea78 6679087 81f1dd1 0d4c659 6679087 0d4c659 6679087 0d4c659 6679087 0d4c659 6679087 a0691fa 0d4c659 81f1dd1 6679087 3782698 5b4c5f8 25557b5 3caeacd 7379857 c8b695a 7379857 c8b695a 7379857 99aea78 eec78c0 7379857 eec78c0 7379857 e611814 023a289 e611814 023a289 6679087 e611814 023a289 6679087 e611814 2436603 6679087 05c90f4 3caeacd 7379857 3caeacd 7379857 3caeacd 7379857 c8b695a 7379857 25557b5 3caeacd 99aea78 6679087 5b4c5f8 3caeacd 5b4c5f8 ddc25db 6679087 7379857 3caeacd c8b695a 7379857 99aea78 7379857 eec78c0 7379857 eec78c0 7379857 e611814 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 |
import io
import json
import gradio as gr
import pandas as pd
from huggingface_hub import HfFileSystem
# Hub path of the dataset that is globbed for result JSON files.
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# Top-level keys of a results file that are left out of the results dataframe.
EXCLUDED_KEYS = {"pretty_env_info", "chat_template", "group_subtasks"}
# EXCLUDED_RESULTS_KEYS = {
#     "leaderboard",
# }
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
#     "alias",
# }

# Templates for locating the per-sample details files of a model; the
# placeholders are filled in with str.format() when details are loaded.
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
DETAILS_FILENAME = "samples_{subtask}_*.json"

# Task key -> (display name, task id). Only the values are used in this file
# (as Radio choices); every task id is also a key of SUBTASKS.
TASKS = {
    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
}

# Task id -> names of its subtasks, in display order.
SUBTASKS = {
    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
    "leaderboard_bbh": [
        f"leaderboard_bbh_{name}"
        for name in (
            "boolean_expressions",
            "causal_judgement",
            "date_understanding",
            "disambiguation_qa",
            "formal_fallacies",
            "geometric_shapes",
            "hyperbaton",
            "logical_deduction_five_objects",
            "logical_deduction_seven_objects",
            "logical_deduction_three_objects",
            "movie_recommendation",
            "navigate",
            "object_counting",
            "penguins_in_a_table",
            "reasoning_about_colored_objects",
            "ruin_names",
            "salient_translation_error_detection",
            "snarks",
            "sports_understanding",
            "temporal_sequences",
            "tracking_shuffled_objects_five_objects",
            "tracking_shuffled_objects_seven_objects",
            "tracking_shuffled_objects_three_objects",
            "web_of_lies",
        )
    ],
    "leaderboard_gpqa": [
        f"leaderboard_gpqa_{name}" for name in ("extended", "diamond", "main")
    ],
    "leaderboard_ifeval": ["leaderboard_ifeval"],
    # Keyed by the task id "leaderboard_math" (not "leaderboard_math_hard"),
    # because lookups use the second element of the TASKS tuples.
    "leaderboard_math": [
        f"leaderboard_math_{name}_hard"
        for name in (
            "algebra",
            "counting_and_prob",
            "geometry",
            "intermediate_algebra",
            "num_theory",
            "prealgebra",
            "precalculus",
        )
    ],
    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
    "leaderboard_musr": [
        f"leaderboard_musr_{name}"
        for name in ("murder_mysteries", "object_placements", "team_allocation")
    ],
}
# Module-level Hugging Face Hub filesystem client; the loader functions in this
# file use it to glob for and open dataset files.
fs = HfFileSystem()
def fetch_result_paths():
    """Return every path under the results dataset that matches ``**/**/*.json``.

    The lookup is done with the module-level ``fs`` client, rooted at
    ``RESULTS_DATASET_ID``.
    """
    return fs.glob(f"{RESULTS_DATASET_ID}/**/**/*.json")
def filter_latest_result_path_per_model(paths):
    """Keep a single result path per model.

    The model ID of a path is whatever lies between the ``RESULTS_DATASET_ID``
    prefix (plus its trailing "/") and the last "/"-separated component. For
    each model ID the lexicographically greatest path is kept — presumably the
    most recent one, assuming timestamped file names (confirm against the
    dataset layout).

    Args:
        paths: Iterable of result file paths located under ``RESULTS_DATASET_ID``.

    Returns:
        Dict mapping each model ID to its selected path.
    """
    prefix_length = len(RESULTS_DATASET_ID) + 1  # also skip the "/" after the dataset ID
    paths_by_model = {}
    for path in paths:
        model_id, _filename = path[prefix_length:].rsplit("/", 1)
        paths_by_model.setdefault(model_id, []).append(path)
    return {model_id: max(candidates) for model_id, candidates in paths_by_model.items()}
def get_result_path_from_model(model_id, result_path_per_model):
    """Return the entry stored under ``model_id`` in ``result_path_per_model``.

    This is a plain subscript lookup, so a missing key propagates the
    mapping's own lookup error (``KeyError`` for a dict).
    """
    result_path = result_path_per_model[model_id]
    return result_path
def update_load_results_component():
    """Return a "Load Results" button that is set to interactive."""
    button = gr.Button("Load Results", interactive=True)
    return button
def load_data(result_path: str) -> dict:
    """Read the file at ``result_path`` and return its parsed JSON content.

    The file is opened through the module-level ``fs`` client. The result is
    whatever ``json.load`` produces (callers in this file treat it as a dict),
    not a DataFrame.

    Args:
        result_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    # JSON text is UTF-8; do not rely on the locale's default encoding.
    with fs.open(result_path, "r", encoding="utf-8") as f:
        return json.load(f)
def load_results_dataframe(model_id):
    """Load the selected results file of ``model_id`` as a one-row dataframe.

    The results JSON is flattened with ``pd.json_normalize`` after dropping the
    keys listed in ``EXCLUDED_KEYS``. The row is labelled with the file's
    "model_name" entry ("Model" when absent), and that label ends up in an
    "index" column after ``reset_index()``.

    Returns:
        The dataframe, or ``None`` when ``model_id`` is falsy.
    """
    if not model_id:
        return None
    data = load_data(get_result_path_from_model(model_id, latest_result_path_per_model))
    kept = {key: value for key, value in data.items() if key not in EXCLUDED_KEYS}
    row_label = data.get("model_name", "Model")
    flattened = pd.json_normalize([kept])
    return flattened.set_index(pd.Index([row_label])).reset_index()
def load_results_dataframes(*model_ids):
    """Return a list with one ``load_results_dataframe`` result per model ID, in order."""
    return list(map(load_results_dataframe, model_ids))
def display_results(df_1, df_2, task):
    """Render the loaded results of up to two models as two HTML tables.

    Frames without an "index" column (nothing loaded for that model) are
    skipped. The remaining frames are stacked, then transposed so that each
    model becomes a column and each flattened result key a row.

    Args:
        df_1: Results dataframe of the first model.
        df_2: Results dataframe of the second model.
        task: Task to restrict the rows to, or "All".

    Returns:
        A ``(results_html, configs_html)`` tuple, or ``(None, None)`` when
        neither frame holds loaded results.
    """
    loaded = [frame.set_index("index") for frame in (df_1, df_2) if "index" in frame.columns]
    if not loaded:
        # pd.concat raises ValueError on an empty list; show nothing instead.
        return None, None
    df = pd.concat(loaded)
    df = df.T.rename_axis(columns=None)
    return display_tab("results", df, task), display_tab("configs", df, task)
def display_tab(tab, df, task):
df = df.style.format(na_rep="")
df.hide(
[
row
for row in df.index
if (
not row.startswith(f"{tab}.")
or row.startswith(f"{tab}.leaderboard.")
or row.endswith(".alias")
or (not row.startswith(f"{tab}.{task}") if task != "All" else False)
)
],
axis="index",
)
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
return df.to_html()
def update_tasks_component():
    """Return the "Tasks" radio with "All" selected and interaction enabled.

    The choices are "All" followed by the values of ``TASKS``.
    """
    choices = ["All", *TASKS.values()]
    return gr.Radio(
        choices,
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        interactive=True,
    )
def update_subtasks_component(task):
    """Return a radio listing the subtasks of ``task`` with nothing selected.

    The choices come from ``SUBTASKS.get(task)``, which is ``None`` for a task
    that has no entry (for example "All").
    """
    choices = SUBTASKS.get(task)
    return gr.Radio(choices, info="Evaluation subtasks to be displayed", value=None)
def update_load_details_component(model_id_1, model_id_2, subtask):
    """Return the "Load Details" button, interactive only when loading is possible.

    Loading is possible once at least one model and a subtask are selected.
    """
    can_load = bool((model_id_1 or model_id_2) and subtask)
    return gr.Button("Load Details", interactive=can_load)
def load_details_dataframe(model_id, subtask):
    """Load the per-sample details of ``subtask`` for ``model_id``.

    The details dataset name is derived from ``model_id`` with "/" replaced by
    "__". Among the files matching ``DETAILS_FILENAME`` for the subtask, the
    lexicographically greatest path is read, one JSON document per line.

    Args:
        model_id: Model identifier, e.g. "org/name".
        subtask: Name of the subtask whose samples are loaded.

    Returns:
        A dataframe with one row per sample plus a "model_name" column holding
        ``model_id``, or ``None`` when an argument is falsy or no file matches.
    """
    if not model_id or not subtask:
        return None
    # The f-string only joins the two templates; their placeholders are filled by .format().
    pattern = f"{DETAILS_DATASET_ID}/**/{DETAILS_FILENAME}".format(
        model_name_sanitized=model_id.replace("/", "__"),
        subtask=subtask,
    )
    paths = fs.glob(pattern)
    if not paths:
        return None
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with fs.open(max(paths), "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        data = [json.loads(line) for line in f if line.strip()]
    df = pd.json_normalize(data)
    df["model_name"] = model_id  # Keep model_name
    return df
def load_details_dataframes(subtask, *model_ids):
    """Return a list with the details dataframe of ``subtask`` for each model ID, in order."""
    dataframes = []
    for model_id in model_ids:
        dataframes.append(load_details_dataframe(model_id, subtask))
    return dataframes
def display_details(sample_idx, *dfs):
    """Render one sample of each loaded details dataframe side by side as HTML.

    Frames without a "model_name" column (nothing loaded) or with too few rows
    for ``sample_idx`` are skipped. Each remaining row becomes one column of
    the output, named after its model.

    Args:
        sample_idx: Position of the sample to show. It comes from a numeric UI
            field, so it may be ``None`` (empty field) or a float.
        *dfs: Details dataframes, one per model.

    Returns:
        The table as an HTML string, or ``None`` when there is nothing to show.
    """
    if sample_idx is None:
        return None
    position = int(sample_idx)  # .iloc rejects non-integer positions such as 1.0
    rows = [df.iloc[position] for df in dfs if "model_name" in df.columns and position < len(df)]
    if not rows:
        return None
    # Drop model_name from the values and use it as the column name instead.
    columns = [row.drop("model_name").rename(row["model_name"]) for row in rows]
    df = pd.concat(columns, axis="columns")
    return (
        df.style
        .format(na_rep="")
        # .hide(axis="index")
        .to_html()
    )
# if __name__ == "__main__":
# Resolve once, at start-up, which result file to use for each model.
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())

# NOTE(review): the nesting of rows/tabs below was reconstructed from a
# flattened copy of this file — confirm it against the intended layout.
with gr.Blocks(fill_height=True) as demo:
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")

    # Model selection: one dropdown per model, each paired with a hidden
    # dataframe that holds the loaded results.
    with gr.Row():
        with gr.Column():
            model_id_1 = gr.Dropdown(choices=list(latest_result_path_per_model.keys()), label="Models")
            dataframe_1 = gr.Dataframe(visible=False)
        with gr.Column():
            model_id_2 = gr.Dropdown(choices=list(latest_result_path_per_model.keys()), label="Models")
            dataframe_2 = gr.Dataframe(visible=False)

    with gr.Row():
        # with gr.Tab("All"):
        #     pass
        with gr.Tab("Results"):
            # The task filter stays disabled until results have been loaded.
            task = gr.Radio(
                ["All"] + list(TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                interactive=False,
            )
            load_results_btn = gr.Button("Load Results", interactive=False)

            with gr.Tab("Results"):
                results = gr.HTML()
            with gr.Tab("Configs"):
                configs = gr.HTML()
        with gr.Tab("Details"):
            details_task = gr.Radio(
                ["All"] + list(TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                interactive=True,
            )
            subtask = gr.Radio(
                SUBTASKS.get(details_task.value),
                label="Subtasks",
                info="Evaluation subtasks to be displayed (choose one of the Tasks above)",
            )
            sample_idx = gr.Number(value=0, label="Sample Index", info="Index of the sample to be displayed", minimum=0)
            load_details_btn = gr.Button("Load Details", interactive=False)
            details = gr.HTML()
            # Hidden dataframes holding the loaded per-sample details of each model.
            details_dataframe_1 = gr.Dataframe(visible=False)
            details_dataframe_2 = gr.Dataframe(visible=False)
            details_dataframe = gr.DataFrame(visible=False)

    # Enable "Load Results" as soon as either model is chosen: the loading and
    # display functions already cope with a single selected model.
    gr.on(
        triggers=[model_id_1.change, model_id_2.change],
        fn=update_load_results_component,
        outputs=load_results_btn,
    )
    # Load both result frames, render them, then unlock the task filter.
    load_results_btn.click(
        fn=load_results_dataframes,
        inputs=[model_id_1, model_id_2],
        outputs=[dataframe_1, dataframe_2],
    ).then(
        fn=display_results,
        inputs=[dataframe_1, dataframe_2, task],
        outputs=[results, configs],
    ).then(
        fn=update_tasks_component,
        outputs=task,
    )
    # Re-render the already loaded results when another task is picked.
    task.change(
        fn=display_results,
        inputs=[dataframe_1, dataframe_2, task],
        outputs=[results, configs],
    )
    # Picking a task in the Details tab refreshes the list of its subtasks.
    details_task.change(
        fn=update_subtasks_component,
        inputs=details_task,
        outputs=subtask,
    )
    gr.on(
        triggers=[model_id_1.change, model_id_2.change, subtask.change, details_task.change],
        fn=update_load_details_component,
        inputs=[model_id_1, model_id_2, subtask],
        outputs=load_details_btn,
    )
    # Load the details of both models, then render the selected sample.
    load_details_btn.click(
        fn=load_details_dataframes,
        inputs=[subtask, model_id_1, model_id_2],
        outputs=[details_dataframe_1, details_dataframe_2],
    ).then(
        fn=display_details,
        inputs=[sample_idx, details_dataframe_1, details_dataframe_2],
        outputs=details,
    )
    sample_idx.change(
        fn=display_details,
        inputs=[sample_idx, details_dataframe_1, details_dataframe_2],
        outputs=details,
    )

demo.launch()
|