Report correct IFEval score
app.py
@@ -39,6 +39,9 @@ def get_leaderboard_df():
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
         if task == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
+        # IFEval has several metrics but we report just the prompt-loose-acc one
+        elif task == "ifeval":
+            value = data["results"][first_result_key]["prompt_level_loose_acc"]
         else:
             first_metric_key = next(
                 iter(data["results"][first_result_key])
@@ -46,6 +49,9 @@ def get_leaderboard_df():
             value = data["results"][first_result_key][first_metric_key]  # gets the value of the first metric
         df.loc[model_revision, task] = value
 
+    # Put IFEval in first column
+    ifeval_col = df.pop("Ifeval")
+    df.insert(1, "Ifeval", ifeval_col)
     df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
     df = df.sort_values(by=["Average"], ascending=False)
     df = df.reset_index().rename(columns={"index": "Model"}).round(3)
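For context, the two additions do one thing each: pick the `prompt_level_loose_acc` metric for IFEval instead of whatever metric happens to come first in the results file, and move the resulting column to the front with the pandas `pop()`/`insert()` idiom. The sketch below illustrates both with a made-up results payload and toy column names; it is not the app's real data, just an example of the pattern under those assumptions.

import pandas as pd

# Illustrative only: a fake payload shaped like the eval-results JSON the app
# reads (real files contain more tasks and metrics).
fake_results = {
    "results": {
        "ifeval": {
            "prompt_level_loose_acc": 0.61,
            "prompt_level_strict_acc": 0.55,
            "inst_level_loose_acc": 0.70,
        }
    }
}

first_result_key = next(iter(fake_results["results"]))  # "ifeval"
# Mirrors the new elif branch: select the prompt-level loose accuracy
# explicitly rather than the first metric in the dict.
value = fake_results["results"][first_result_key]["prompt_level_loose_acc"]

# Column reordering: pop() removes "Ifeval" and returns it as a Series,
# then insert() places it back at the requested position.
df = pd.DataFrame({"Model": ["m1"], "Truthfulqa": [0.42], "Ifeval": [value]})
ifeval_col = df.pop("Ifeval")
df.insert(1, "Ifeval", ifeval_col)
print(df.columns.tolist())  # ['Model', 'Ifeval', 'Truthfulqa']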
|