Commit a69553b by eduagarcia
Parent(s): 79aba72

support hf leaderboard format and my format
src/leaderboard/read_evals.py
CHANGED
@@ -75,7 +75,6 @@ class EvalResult:
         tasks = ORIGINAL_TASKS
         for task in tasks:
             benchmark, metric = task
-            metric = metric + ',all'
 
             # We skip old mmlu entries
             wrong_mmlu_version = False
@@ -92,12 +91,21 @@
             if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][metric])):
                 results[benchmark] = 0.0
                 continue
-
+
+            def get_metric(v):
+                res = v.get(metric, None)
+                if res is None:
+                    res = v.get(metric + ',all', None)
+                if res is None:
+                    res = v.get(metric + ',None', None)
+                if res is None:
+                    res = v.get('main_score', None)
+                return res
+
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
+            accs = np.array([get_metric(v) for k, v in data["results"].items() if benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
 
             mean_acc = np.mean(accs) * 100.0
             results[benchmark] = mean_acc
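
For context, a minimal, self-contained sketch of the fallback chain that get_metric introduces. The sample task names and scores below are hypothetical, but the probed keys (metric, metric + ',all', metric + ',None', 'main_score') are exactly the ones this commit adds, covering both the HF leaderboard format and the repo's own format named in the commit message:

import numpy as np

# Hypothetical result payloads (task names and scores are made up for
# illustration): one entry per result-file format get_metric has to handle.
samples = {
    "harness|taskA|0": {"acc": 0.61},          # plain metric key
    "harness|taskB|0": {"acc,all": 0.58},      # "metric,filter"-style key
    "harness|taskC|0": {"main_score": 0.73},   # the repo's own format
}

metric = "acc"

def get_metric(v):
    # Same fallback chain as the commit:
    # metric -> metric,all -> metric,None -> main_score
    res = v.get(metric, None)
    if res is None:
        res = v.get(metric + ',all', None)
    if res is None:
        res = v.get(metric + ',None', None)
    if res is None:
        res = v.get('main_score', None)
    return res

accs = np.array([get_metric(v) for v in samples.values()])
print(np.mean(accs) * 100.0)  # -> ~64.0

This also shows why the hard-coded metric = metric + ',all' suffix could be dropped in the first hunk: the suffix is now tried as one fallback among several, so result files in either format resolve to a score instead of returning None and being skipped by the accs check.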