Spaces:
Build error
Build error
Hide math and mini_math
Browse files
app.py
CHANGED
|
@@ -10,6 +10,8 @@ DESCRIPTION = f"""
|
|
| 10 |
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
|
| 11 |
"""
|
| 12 |
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def get_leaderboard_df(merge_values: bool = True):
|
| 15 |
filepaths = list(Path("eval_results").rglob("*.json"))
|
|
@@ -35,6 +37,9 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
| 35 |
with open(filepath, "r") as file:
|
| 36 |
data = json.load(file)
|
| 37 |
first_result_key = next(iter(data["results"])) # gets the first key in 'results'
|
|
|
|
|
|
|
|
|
|
| 38 |
# TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
|
| 39 |
if task.lower() == "truthfulqa":
|
| 40 |
value = data["results"][first_result_key]["truthfulqa_mc2"]
|
|
@@ -66,7 +71,7 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
| 66 |
value = data["results"][first_result_key][first_metric_key] # gets the value of the first metric
|
| 67 |
|
| 68 |
# For mini_math we report 5 metrics, one for each level and store each one as a separate row in the dataframe
|
| 69 |
-
if task.lower() in ["
|
| 70 |
for k, v in data["results"].items():
|
| 71 |
if k != "all":
|
| 72 |
level = k.split("|")[1].split(":")[-1]
|
|
@@ -98,6 +103,9 @@ def get_leaderboard_df(merge_values: bool = True):
|
|
| 98 |
df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
|
| 99 |
df.drop_duplicates(subset=["Model"], inplace=True)
|
| 100 |
df = df.sort_values(by=["Average"], ascending=False).round(2)
|
|
|
|
|
|
|
|
|
|
| 101 |
return df
|
| 102 |
|
| 103 |
|
|
@@ -137,7 +145,7 @@ with demo:
|
|
| 137 |
value=leaderboard_df,
|
| 138 |
wrap=True,
|
| 139 |
height=1000,
|
| 140 |
-
column_widths=[400, 110] + [
|
| 141 |
)
|
| 142 |
with gr.Row():
|
| 143 |
refresh_button = gr.Button("Refresh")
|
|
|
|
| 10 |
Evaluation of H4 and community models across a diverse range of benchmarks from [LightEval](https://github.com/huggingface/lighteval). All scores are reported as accuracy.
|
| 11 |
"""
|
| 12 |
|
| 13 |
+
BENCHMARKS_TO_SKIP = ["math", "mini_math"]
|
| 14 |
+
|
| 15 |
|
| 16 |
def get_leaderboard_df(merge_values: bool = True):
|
| 17 |
filepaths = list(Path("eval_results").rglob("*.json"))
|
|
|
|
| 37 |
with open(filepath, "r") as file:
|
| 38 |
data = json.load(file)
|
| 39 |
first_result_key = next(iter(data["results"])) # gets the first key in 'results'
|
| 40 |
+
# Skip benchmarks that we don't want to include in the leaderboard
|
| 41 |
+
if task.lower() in BENCHMARKS_TO_SKIP:
|
| 42 |
+
continue
|
| 43 |
# TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
|
| 44 |
if task.lower() == "truthfulqa":
|
| 45 |
value = data["results"][first_result_key]["truthfulqa_mc2"]
|
|
|
|
| 71 |
value = data["results"][first_result_key][first_metric_key] # gets the value of the first metric
|
| 72 |
|
| 73 |
# For mini_math_v2 we report 5 metrics, one for each level, and store each one as a separate row in the dataframe
|
| 74 |
+
if task.lower() in ["mini_math_v2"]:
|
| 75 |
for k, v in data["results"].items():
|
| 76 |
if k != "all":
|
| 77 |
level = k.split("|")[1].split(":")[-1]
|
|
|
|
| 103 |
df = df[["Model", "Date"]].merge(merged_df, on="Model", how="left")
|
| 104 |
df.drop_duplicates(subset=["Model"], inplace=True)
|
| 105 |
df = df.sort_values(by=["Average"], ascending=False).round(2)
|
| 106 |
+
|
| 107 |
+
# Shorten the mini_math per-level column names by abbreviating "_level_" to "_l"
|
| 108 |
+
df.columns = [c.replace("_level_", "_l") for c in df.columns]
|
| 109 |
return df
|
| 110 |
|
| 111 |
|
|
|
|
| 145 |
value=leaderboard_df,
|
| 146 |
wrap=True,
|
| 147 |
height=1000,
|
| 148 |
+
column_widths=[400, 110] + [(150 + len(c)) for c in leaderboard_df.columns[2:]],
|
| 149 |
)
|
| 150 |
with gr.Row():
|
| 151 |
refresh_button = gr.Button("Refresh")
|