eduagarcia committed
Commit a69553b
1 Parent(s): 79aba72

support hf leaderboard format and my format

Files changed (1):
  src/leaderboard/read_evals.py (+12 −4)
src/leaderboard/read_evals.py CHANGED

@@ -75,7 +75,6 @@ class EvalResult:
         tasks = ORIGINAL_TASKS
         for task in tasks:
             benchmark, metric = task
-            metric = metric + ',all'
 
             # We skip old mmlu entries
             wrong_mmlu_version = False
@@ -92,12 +91,21 @@ class EvalResult:
             if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][metric])):
                 results[benchmark] = 0.0
                 continue
-
+
+            def get_metric(v):
+                res = v.get(metric, None)
+                if res is None:
+                    res = v.get(metric + ',all', None)
+                if res is None:
+                    res = v.get(metric + ',None', None)
+                if res is None:
+                    res = v.get('main_score', None)
+                return res
+
             # We average all scores of a given metric (mostly for mmlu)
-            accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
+            accs = np.array([get_metric(v) for k, v in data["results"].items() if benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
 
             mean_acc = np.mean(accs) * 100.0
             results[benchmark] = mean_acc
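
For reference, a minimal standalone sketch of the fallback this commit introduces (not part of the patch): get_metric first tries the bare metric key (the commit author's own results format), then the suffixed key that the old code hard-coded via metric + ',all' (the HF leaderboard format), then a ',None' variant, and finally a generic main_score field. In read_evals.py, metric is captured from the enclosing loop; here it is passed explicitly, and the sample result entries are made up for illustration.

```python
# Illustrative sketch of the metric-lookup fallback; sample dicts are made-up data.

def get_metric(v: dict, metric: str):
    # Bare metric key (custom results format)
    res = v.get(metric, None)
    if res is None:
        # Suffixed key the old code assumed unconditionally, e.g. "acc,all"
        res = v.get(metric + ',all', None)
    if res is None:
        # Another suffixed variant, e.g. "acc,None"
        res = v.get(metric + ',None', None)
    if res is None:
        # Last resort: a generic aggregate score field
        res = v.get('main_score', None)
    return res

# Hypothetical result entries, one per supported layout
entries = {
    "custom":     {"acc": 0.5871},
    "hf_all":     {"acc,all": 0.6123},
    "hf_none":    {"acc,None": 0.6010},
    "main_score": {"main_score": 0.4420},
}

for name, entry in entries.items():
    print(name, get_metric(entry, "acc"))
```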