djstrong committed on
Commit
b7ba21a
1 Parent(s): 2542dfb

simplify API model names

Browse files
Files changed (1) hide show
  1. src/leaderboard/read_evals.py +3 -12
src/leaderboard/read_evals.py CHANGED
@@ -70,6 +70,9 @@ class EvalResult:
70
  org_and_model = re.sub(r"^pretrained=", "", org_and_model)
71
  org_and_model = org_and_model.replace(",trust_remote_code=True", "")
72
  org_and_model = org_and_model.replace(",parallelize=True", "")
 
 
 
73
  org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
74
  org_and_model = re.sub("/$", "", org_and_model)
75
 
@@ -193,18 +196,6 @@ class EvalResult:
193
  baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
194
 
195
  average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
196
- # average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
197
- # average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
198
- # print('XXXXXXXXXXXX')
199
- # print(self.eval_name)
200
- # print(all_tasks)
201
- # print(baselines)
202
- # print(self.results)
203
- # print('XXXXXXXXXXXX')
204
-
205
- # average = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in all_tasks]) / len(all_tasks)
206
- # average_g = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in g_tasks]) / len(g_tasks)
207
- # average_mc = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in mc_tasks]) / len(mc_tasks)
208
 
209
  average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
210
  average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
 
70
  org_and_model = re.sub(r"^pretrained=", "", org_and_model)
71
  org_and_model = org_and_model.replace(",trust_remote_code=True", "")
72
  org_and_model = org_and_model.replace(",parallelize=True", "")
73
+ org_and_model = org_and_model.replace(",tokenizer_backend=huggingface", "")
74
+ org_and_model = re.sub(",base_url=[^,]+", ",API", org_and_model)
75
+
76
  org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
77
  org_and_model = re.sub("/$", "", org_and_model)
78
 
 
196
  baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
197
 
198
  average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
201
  average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)