facat committed on
Commit
f2c1a54
1 Parent(s): 3a8c0d0
Files changed (2) hide show
  1. tasks.py +3 -0
  2. tlem.py +31 -13
tasks.py CHANGED
@@ -85,6 +85,9 @@ class Task:
85
  }
86
  self.label_column = self.label_column or self.input_column
87
 
 
 
 
88
  @cached_property
89
  def samples(self):
90
  return self.dataset[self.input_column]
 
85
  }
86
  self.label_column = self.label_column or self.input_column
87
 
88
def __eq__(self, __value: object) -> bool:
    """Return whether ``__value`` carries the same ``name`` as this object.

    Equality is decided solely by the ``name`` attribute. When ``__value``
    has no ``name`` attribute (``None``, a string, a number, ...) the
    method returns ``NotImplemented`` so that Python falls back to its
    default handling and ``obj == other`` evaluates to ``False`` instead
    of raising ``AttributeError``.
    """
    # NOTE(review): a class that defines __eq__ without __hash__ becomes
    # unhashable unless __hash__ is supplied elsewhere -- confirm that
    # instances are never used as dict keys or set members.
    try:
        other_name = __value.name
    except AttributeError:
        return NotImplemented
    return self.name == other_name
90
+
91
  @cached_property
92
  def samples(self):
93
  return self.dataset[self.input_column]
tlem.py CHANGED
@@ -13,6 +13,7 @@ import pandas as pd
13
  from .tasks import *
14
  from .utils import *
15
  from itertools import chain
 
16
 
17
 
18
  class ReasoningMetric(evaluate.Metric):
@@ -78,26 +79,29 @@ class Suite(EvaluationSuite):
78
  # case _:
79
  # return list(chain(*self.suite.values()))[key]
80
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def run(
82
  self,
83
  model_or_pipeline: Any,
84
- suite=None,
85
  ) -> dict[str, float]:
86
  self.assert_suite_nonempty()
87
- if suite is None:
88
- suite = self.suite
89
 
90
  self.suite: dict[str, list[Task]]
91
- results = defaultdict(dict)
92
- for category, tasks in (bar := tqdm(self.suite.items())):
93
- bar.desc = f"complete {category}."
94
- if isinstance(tasks, dict):
95
- results[category] = self.run(model_or_pipeline, tasks)
96
- else:
97
- for task in tasks:
98
- results[category].update(task.run(model_or_pipeline))
99
- results[category] = np.mean(list(results[category].values()))
100
- return results
101
 
102
  def get_suite(self, name) -> dict[str, Task]:
103
  chat = False
@@ -144,6 +148,20 @@ class Suite(EvaluationSuite):
144
  input_column="problem",
145
  label_column="solution",
146
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  if isinstance(suite, Task):
148
  suite = [suite]
149
  if isinstance(suite, list):
 
13
  from .tasks import *
14
  from .utils import *
15
  from itertools import chain
16
+ from copy import deepcopy
17
 
18
 
19
  class ReasoningMetric(evaluate.Metric):
 
79
  # case _:
80
  # return list(chain(*self.suite.values()))[key]
81
 
82
def aggregate(self, suite):
    """Collapse every task list in ``suite`` into one mean score, in place.

    ``suite`` maps a category name either to a nested dict of the same
    shape (handled recursively) or to an iterable of tasks. For a task
    iterable, all values of each task's ``result`` mapping are pooled and
    replaced by their ``np.mean``.

    The mapping passed in is mutated and also returned, so pass a copy if
    the original structure must survive.
    """
    for category in suite:
        entry = suite[category]
        if isinstance(entry, dict):
            # Nested sub-suite: aggregate it the same way.
            suite[category] = self.aggregate(entry)
            continue
        scores = [score for task in entry for score in task.result.values()]
        suite[category] = np.mean(scores)
    return suite
93
+
94
  def run(
95
  self,
96
  model_or_pipeline: Any,
 
97
  ) -> dict[str, float]:
98
  self.assert_suite_nonempty()
 
 
99
 
100
  self.suite: dict[str, list[Task]]
101
+ for task in (bar := tqdm(self.tasks)):
102
+ bar.desc = f"complete {task.name}."
103
+ _ = task.run(model_or_pipeline)
104
+ return self.aggregate(deepcopy(self.suite))
 
 
 
 
 
 
105
 
106
  def get_suite(self, name) -> dict[str, Task]:
107
  chat = False
 
148
  input_column="problem",
149
  label_column="solution",
150
  )
151
+
152
+ case "open-leaderboard":
153
+ suite = {}
154
+ for name in [
155
+ "arc",
156
+ "hellaswag",
157
+ "mmlu-chat",
158
+ "winogrande",
159
+ "gsm8k",
160
+ # "truthful_qa",
161
+ "drop",
162
+ ]:
163
+ suite[name] = self.get_suite(name)
164
+
165
  if isinstance(suite, Task):
166
  suite = [suite]
167
  if isinstance(suite, list):