Sean Cho committed on
Commit
bd9a9ad
1 Parent(s): 6f030e8

revert logic

Browse files
src/leaderboard/read_evals.py CHANGED
@@ -103,6 +103,13 @@ class EvalResult:
103
  results[task.benchmark] = 0.0
104
  continue
105
 
 
 
 
 
 
 
 
106
  # We average all scores of a given metric (mostly for mmlu)
107
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
108
  if accs.size == 0 or any([acc is None for acc in accs]):
@@ -144,7 +151,28 @@ class EvalResult:
144
  def to_dict(self):
145
  """Converts the Eval Result to a dict compatible with our dataframe display"""
146
 
147
- average = sum([v for v in self.results.values() if v is not None]) / sum([1 for v in self.results.values() if v is not None])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  data_dict = {
149
  "eval_name": self.eval_name, # not a column, just a save name,
150
  AutoEvalColumn.precision.name: self.precision.value.name,
 
103
  results[task.benchmark] = 0.0
104
  continue
105
 
106
+ # New tasks have been added, we need to skip them if not exists
107
+ if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
108
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
109
+ if accs.size == 0 or any([acc is None for acc in accs]):
110
+ results[task.benchmark] = 0.0
111
+ continue
112
+
113
  # We average all scores of a given metric (mostly for mmlu)
114
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
115
  if accs.size == 0 or any([acc is None for acc in accs]):
 
151
  def to_dict(self):
152
  """Converts the Eval Result to a dict compatible with our dataframe display"""
153
 
154
+ # Skip the newly added tasks (eight benchmarks below) for now
155
+ # TODO: safely remove this code when the task results are all added
156
+ skip_avg_len = 0
157
+ if self.results['ko_winogrande'] == 0.0:
158
+ skip_avg_len += 1
159
+ if self.results['ko_gsm8k'] == 0.0:
160
+ skip_avg_len += 1
161
+ if self.results['ko_eq_bench'] == 0.0:
162
+ skip_avg_len += 1
163
+ if self.results['ko_inst_follow'] == 0.0:
164
+ skip_avg_len += 1
165
+ if self.results['kor_nat_cka'] == 0.0:
166
+ skip_avg_len += 1
167
+ if self.results['kor_nat_sva'] == 0.0:
168
+ skip_avg_len += 1
169
+ if self.results['ko_harmlessness'] == 0.0:
170
+ skip_avg_len += 1
171
+ if self.results['ko_helpfulness'] == 0.0:
172
+ skip_avg_len += 1
173
+
174
+ average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
175
+
176
  data_dict = {
177
  "eval_name": self.eval_name, # not a column, just a save name,
178
  AutoEvalColumn.precision.name: self.precision.value.name,
src/populate.py CHANGED
@@ -16,7 +16,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
16
  filter_models(all_data_json)
17
 
18
  df = pd.DataFrame.from_records(all_data_json)
19
- print(df.to_string())
20
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
21
  df = df[cols].round(decimals=2)
22
 
 
16
  filter_models(all_data_json)
17
 
18
  df = pd.DataFrame.from_records(all_data_json)
 
19
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
20
  df = df[cols].round(decimals=2)
21
 
src/tools/plots.py CHANGED
@@ -36,7 +36,25 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
36
 
37
  current_date = row["date"]
38
  if task.benchmark == "Average":
39
- current_score = np.mean(list(row["results"].values()))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  else:
41
  current_score = row["results"][task.benchmark]
42
 
 
36
 
37
  current_date = row["date"]
38
  if task.benchmark == "Average":
39
+ avg_skip_len = 0
40
+ if row["results"]["ko_winogrande"] == 0.0:
41
+ avg_skip_len += 1
42
+ if row["results"]["ko_gsm8k"] == 0.0:
43
+ avg_skip_len += 1
44
+ if row["results"]["ko_eq_bench"] == 0.0:
45
+ avg_skip_len += 1
46
+ if row["results"]["ko_inst_follow"] == 0.0:
47
+ avg_skip_len += 1
48
+ if row["results"]["kor_nat_cka"] == 0.0:
49
+ avg_skip_len += 1
50
+ if row["results"]["kor_nat_sva"] == 0.0:
51
+ avg_skip_len += 1
52
+ if row["results"]["ko_harmlessness"] == 0.0:
53
+ avg_skip_len += 1
54
+ if row["results"]["ko_helpfulness"] == 0.0:
55
+ avg_skip_len += 1
56
+
57
+ current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
58
  else:
59
  current_score = row["results"][task.benchmark]
60