meg-huggingface
committed
Commit 5bbe032 • 1 Parent(s): 7dd365e
Add handling for when some metrics aren't calculated
src/leaderboard/read_evals.py
CHANGED
@@ -34,6 +34,8 @@ class EvalResult:
     @classmethod
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
+        print("Looking at json_filepath:")
+        print(json_filepath)
         with open(json_filepath) as fp:
             data = json.load(fp)
 
@@ -68,7 +70,12 @@ class EvalResult:
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
-            task = task.value
+            print("Looking at task:")
+            print(task)
+            try:
+                task = task.value
+            except Exception as e:
+                print(e)
 
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
@@ -108,7 +115,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values()
+        average = sum([v for v in self.results.values()]) / len(self.results.values())
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -127,7 +134,10 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            try:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            except KeyError:
+                data_dict[task.value.col_name] = None
 
         return data_dict
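Taken together, these changes let a result row build even when some benchmarks were never scored: the per-task prints and try/except surface which task fails, the average in to_dict is taken over only the results that exist, and a missing benchmark falls back to a None cell instead of raising KeyError. Below is a minimal, self-contained sketch of that pattern; the Tasks entries and the standalone to_dict signature are simplified stand-ins for illustration, not this Space's actual definitions.

from enum import Enum

class Tasks(Enum):
    # (benchmark key, display column name): hypothetical examples,
    # not this Space's real task list
    task0 = ("anli", "ANLI")
    task1 = ("logiqa", "LogiQA")

def to_dict(results):
    """Builds a display row that tolerates benchmarks that were never calculated."""
    # Average only over the metrics that were actually computed
    # (assumes at least one result exists, as the committed code does)
    average = sum([v for v in results.values()]) / len(results.values())
    data_dict = {"average": average}
    for task in Tasks:
        benchmark, col_name = task.value
        try:
            data_dict[col_name] = results[benchmark]
        except KeyError:
            # Metric was never calculated: leave the cell empty instead of crashing
            data_dict[col_name] = None
    return data_dict

print(to_dict({"anli": 0.6}))
# -> {'average': 0.6, 'ANLI': 0.6, 'LogiQA': None}

Defaulting to None keeps every task column present in the row, so the dataframe display can presumably render an empty cell; note the average still assumes results is non-empty, just as the committed code does.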