pminervini committed on
Commit
d354e12
1 Parent(s): 7de3b23
src/display/utils.py CHANGED
@@ -40,7 +40,7 @@ class Tasks(Enum):
40
  cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
41
 
42
  race = Task("race", "acc", "RACE/Acc")
43
- squadv2 = Task("squadv2", "exact", "SQUaDv2/EM")
44
 
45
  memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
46
  ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
 
40
  cnndm_b = Task("cnndm_v2", "bertscore_precision", "CNN-DM/BERT-P")
41
 
42
  race = Task("race", "acc", "RACE/Acc")
43
+ # squadv2 = Task("squadv2", "exact_normalised", "SQUaDv2/EM")
44
 
45
  memotrap = Task("memo-trap_v2", "acc", "MemoTrap/Acc")
46
  ifeval = Task("ifeval", "prompt_level_strict_acc", "IFEval/Acc")
src/leaderboard/read_evals.py CHANGED
@@ -11,6 +11,14 @@ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, Weigh
11
  from src.submission.check_validity import is_model_on_hub
12
 
13
 
 
 
 
 
 
 
 
 
14
  @dataclass
15
  class EvalResult:
16
  # Also see src.display.utils.AutoEvalColumn for what will be displayed.
@@ -84,6 +92,12 @@ class EvalResult:
84
  for k, v in entry_copy.items():
85
  if "exact_match" in k:
86
  results[task_name][k.replace("exact_match", "em")] = v
 
 
 
 
 
 
87
 
88
  entry_copy = results[task_name].copy()
89
 
@@ -99,6 +113,8 @@ class EvalResult:
99
  if accs.size == 0 or any([acc is None for acc in accs]):
100
  continue
101
 
 
 
102
  mean_acc = np.mean(accs) * 100.0
103
  results[task.benchmark] = mean_acc
104
 
 
11
  from src.submission.check_validity import is_model_on_hub
12
 
13
 
14
def is_float(string):
    """Return True if *string* can be converted with ``float()``.

    Args:
        string: The candidate value. Despite the name, any object is
            accepted: numeric strings, ints, floats, or arbitrary objects.

    Returns:
        True when ``float(string)`` succeeds, False otherwise.

    Note:
        A True result only means the value is *convertible*; a numeric
        string such as ``"1.5"`` still has to be passed through ``float()``
        before it can be used in arithmetic.
    """
    try:
        float(string)
    except (TypeError, ValueError):
        # ValueError: malformed numeric text such as "abc".
        # TypeError: non-numeric objects such as None, lists or dicts,
        # which float() rejects outright instead of trying to parse.
        return False
    return True
20
+
21
+
22
  @dataclass
23
  class EvalResult:
24
  # Also see src.display.utils.AutoEvalColumn for what will be displayed.
 
92
  for k, v in entry_copy.items():
93
  if "exact_match" in k:
94
  results[task_name][k.replace("exact_match", "em")] = v
95
+ if "squadv2" in task_name:
96
+ value = results[task_name][k]
97
+ if is_float(value) and 'normalised' not in k:
98
+ results[task_name][f"{k}_normalised"] = value / 100.0
99
+ else:
100
+ del results[task_name][k]
101
 
102
  entry_copy = results[task_name].copy()
103
 
 
113
  if accs.size == 0 or any([acc is None for acc in accs]):
114
  continue
115
 
116
+ # print(accs)
117
+
118
  mean_acc = np.mean(accs) * 100.0
119
  results[task.benchmark] = mean_acc
120