Fix DS MATH
app.py CHANGED
@@ -62,36 +62,23 @@ def get_leaderboard_df():
         elif task.lower() == "agieval":
             value = data["results"]["all"]["acc_norm"]
         # MATH reports qem
-        elif task.lower() in ["math", "math_v2", "aimo_kaggle"]:
+        elif task.lower() in ["math", "math_v2", "aimo_kaggle", "math_deepseek_cot", "math_deepseek_rl_cot"]:
             value = data["results"]["all"]["qem"]
-        else:
-            first_metric_key = next(
-                iter(data["results"][first_result_key])
-            )  # gets the first key in the first result
-            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
-
         # For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
-
+        elif task.lower() in ["mini_math_v2"]:
             for k, v in data["results"].items():
                 if k != "all":
                     level = k.split("|")[1].split(":")[-1]
                     value = v["qem"]
                     df.loc[model_revision, f"{task}_{level}"] = value
-        # For
-        elif task.lower() in ["aimo_kaggle_medium_pot"]:
-            for k, v in data["results"].items():
-                if k != "all" and "_average" not in k:
-                    version = k.split("|")[1].split(":")[-1]
-                    value = v["qem"] if "qem" in v else v["score"]
-                    df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_pot we report N metrics, one for each prompt and store each one as a separate row in the dataframe
-        elif task.lower() in ["aimo_kaggle_hard_pot"]:
+        # For PoT we report N metrics, one for each prompt and store each one as a separate row in the dataframe
+        elif task.lower() in ["aimo_kaggle_medium_pot", "aimo_kaggle_hard_pot"]:
             for k, v in data["results"].items():
                 if k != "all" and "_average" not in k:
                     version = k.split("|")[1].split(":")[-1]
                     value = v["qem"] if "qem" in v else v["score"]
                     df.loc[model_revision, f"{task}_{version}"] = value
-        # For kaggle_tora we report accuracy, so need to divide by 100
+        # For kaggle_tora we report accuracy as a percentage, so need to divide by 100
         elif task.lower() in [
             "aimo_tora_eval_kaggle_medium",
             "aimo_tora_eval_kaggle_hard",
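The per-level and per-version rows in this hunk hinge on slicing result keys with k.split("|")[1].split(":")[-1]. A minimal sketch of what that yields, assuming lighteval-style keys shaped like suite|task:variant|shots (the sample keys below are hypothetical, inferred from the split chain, not taken from this Space):

# Sample results dict; the keys are made up but follow the
# suite|task:variant|shots shape the split("|")/split(":") chain implies.
results = {
    "all": {"qem": 0.42},
    "custom|mini_math:level_1|0": {"qem": 0.61},
    "custom|mini_math:level_2|0": {"qem": 0.48},
}

for k, v in results.items():
    if k != "all":
        # "custom|mini_math:level_1|0" -> "mini_math:level_1" -> "level_1"
        level = k.split("|")[1].split(":")[-1]
        print(level, v["qem"])  # prints: level_1 0.61, then level_2 0.48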
@@ -113,6 +100,10 @@ def get_leaderboard_df():
             value = data["results"][first_result_key]["length_controlled_winrate"]
             df.loc[model_revision, "Alpaca_eval_lc"] = value / 100.0
         else:
+            first_metric_key = next(
+                iter(data["results"][first_result_key])
+            )  # gets the first key in the first result
+            value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
             df.loc[model_revision, task] = float(value)
 
         # Drop rows where every entry is NaN
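The fallback moved into this final else grabs the first metric of the first result block. A standalone illustration of the next(iter(...)) idiom it relies on (the sample payload is made up):

# next(iter(d)) returns a dict's first key (insertion order, Python 3.7+);
# applying it twice walks to the first metric of the first result.
data = {"results": {"custom|gsm8k|0": {"acc": 0.81, "acc_stderr": 0.01}}}

first_result_key = next(iter(data["results"]))  # "custom|gsm8k|0"
first_metric_key = next(iter(data["results"][first_result_key]))  # "acc"
value = data["results"][first_result_key][first_metric_key]  # 0.81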
@@ -130,8 +121,10 @@ def get_leaderboard_df():
 
     return df
 
+
 leaderboard_df = get_leaderboard_df()
 
+
 def agg_df(df, agg: str = "max"):
     df = df.copy()
     # Drop date and aggregate results by model name
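Only agg_df's first and last lines are visible in this diff. A rough sketch of the drop-date / group-by-model / sort-by-average flow those lines imply (the Date and Model column names are assumptions, as is the exact aggregation; metric columns are assumed numeric):

import pandas as pd

def agg_df_sketch(df: pd.DataFrame, agg: str = "max") -> pd.DataFrame:
    # Hypothetical reconstruction: drop the run date, collapse repeated
    # evaluations of the same model with max/mean, then rank by mean score.
    df = df.copy()
    df = df.drop(columns=["Date"], errors="ignore").groupby("Model").agg(agg)
    df["Average"] = df.mean(axis=1)
    return df.sort_values(by=["Average"], ascending=False)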
@@ -144,6 +137,7 @@ def agg_df(df, agg: str = "max"):
     df = df.sort_values(by=["Average"], ascending=False)
     return df
 
+
 # Function to update the table based on search query
 def filter_and_search(cols: list[str], search_query: str, agg: str):
     df = leaderboard_df
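filter_and_search is cut off after its first line. A sketch of the search-then-select pattern its signature suggests, written as a standalone helper (everything beyond the visible df = leaderboard_df line is assumed):

import pandas as pd

def filter_and_search_sketch(df: pd.DataFrame, cols: list[str], search_query: str) -> pd.DataFrame:
    # Assumed behavior: keep rows whose model name (the index) contains
    # the query, then keep only the requested metric columns.
    if search_query:
        df = df[df.index.str.contains(search_query, case=False)]
    return df[[c for c in cols if c in df.columns]]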