Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
simplify API model names
Browse files
src/leaderboard/read_evals.py
CHANGED
@@ -70,6 +70,9 @@ class EvalResult:
|
|
70 |
org_and_model = re.sub(r"^pretrained=", "", org_and_model)
|
71 |
org_and_model = org_and_model.replace(",trust_remote_code=True", "")
|
72 |
org_and_model = org_and_model.replace(",parallelize=True", "")
|
|
|
|
|
|
|
73 |
org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
|
74 |
org_and_model = re.sub("/$", "", org_and_model)
|
75 |
|
@@ -193,18 +196,6 @@ class EvalResult:
|
|
193 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
194 |
|
195 |
average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
|
196 |
-
# average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
|
197 |
-
# average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
|
198 |
-
# print('XXXXXXXXXXXX')
|
199 |
-
# print(self.eval_name)
|
200 |
-
# print(all_tasks)
|
201 |
-
# print(baselines)
|
202 |
-
# print(self.results)
|
203 |
-
# print('XXXXXXXXXXXX')
|
204 |
-
|
205 |
-
# average = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in all_tasks]) / len(all_tasks)
|
206 |
-
# average_g = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in g_tasks]) / len(g_tasks)
|
207 |
-
# average_mc = sum([((v if v is not None else 0)-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if task in mc_tasks]) / len(mc_tasks)
|
208 |
|
209 |
average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
|
210 |
average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
|
|
|
70 |
org_and_model = re.sub(r"^pretrained=", "", org_and_model)
|
71 |
org_and_model = org_and_model.replace(",trust_remote_code=True", "")
|
72 |
org_and_model = org_and_model.replace(",parallelize=True", "")
|
73 |
+
org_and_model = org_and_model.replace(",tokenizer_backend=huggingface", "")
|
74 |
+
org_and_model = re.sub(",base_url=[^,]+", ",API", org_and_model)
|
75 |
+
|
76 |
org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
|
77 |
org_and_model = re.sub("/$", "", org_and_model)
|
78 |
|
|
|
196 |
baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
|
197 |
|
198 |
average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
|
201 |
average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
|