meg-huggingface
committed
Commit 5bbe032 • 1 Parent(s): 7dd365e
Add handling for when some metrics aren't calculated
src/leaderboard/read_evals.py
CHANGED
@@ -34,6 +34,8 @@ class EvalResult:
     @classmethod
     def init_from_json_file(self, json_filepath):
         """Inits the result from the specific model result file"""
+        print("Looking at json_filepath:")
+        print(json_filepath)
         with open(json_filepath) as fp:
             data = json.load(fp)
 
@@ -68,7 +70,12 @@ class EvalResult:
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
-            task = task.value
+            print("Looking at task:")
+            print(task)
+            try:
+                task = task.value
+            except Exception as e:
+                print(e)
 
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
@@ -108,7 +115,7 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values()
+        average = sum([v for v in self.results.values()]) / len(self.results.values())
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -127,7 +134,10 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            try:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            except KeyError:
+                data_dict[task.value.col_name] = None
 
         return data_dict
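Taken together, these changes let a result row build even when some benchmarks were never scored: the per-task prints and try/except surface which task fails, the average in to_dict is taken over only the results that exist, and a missing benchmark falls back to a None cell instead of raising KeyError. Below is a minimal, self-contained sketch of that pattern; the Tasks entries and the standalone to_dict signature are simplified stand-ins for illustration, not this Space's actual definitions.

from enum import Enum

class Tasks(Enum):
    # (benchmark key, display column name): hypothetical examples,
    # not this Space's real task list
    task0 = ("anli", "ANLI")
    task1 = ("logiqa", "LogiQA")

def to_dict(results):
    """Builds a display row that tolerates benchmarks that were never calculated."""
    # Average only over the metrics that were actually computed
    # (assumes at least one result exists, as the committed code does)
    average = sum([v for v in results.values()]) / len(results.values())
    data_dict = {"average": average}
    for task in Tasks:
        benchmark, col_name = task.value
        try:
            data_dict[col_name] = results[benchmark]
        except KeyError:
            # Metric was never calculated: leave the cell empty instead of crashing
            data_dict[col_name] = None
    return data_dict

print(to_dict({"anli": 0.6}))
# -> {'average': 0.6, 'ANLI': 0.6, 'LogiQA': None}

Defaulting to None keeps every task column present in the row, so the dataframe display can presumably render an empty cell; note the average still assumes results is non-empty, just as the committed code does.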