Update space

Files changed:
- app.py  +2 -2
- src/leaderboard/read_evals.py  +32 -10
- src/populate.py  +3 -0
app.py CHANGED

@@ -96,7 +96,7 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
-
+model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 def overall_leaderboard(dataframe):
@@ -129,7 +129,7 @@ with demo:
 
 
     with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
-        leaderboard =
+        leaderboard = overall_leaderboard(model_leaderboard_df)
 
     with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
 
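Taken together, the two app.py hunks pin the leaderboard to a fixed results snapshot: model_result_path now points at the 2024-10-07 JSONL file, get_model_leaderboard_df turns it into a DataFrame once at startup, and the 🎯 Overall tab renders that DataFrame through overall_leaderboard. A minimal sketch of that wiring, assuming a plain gr.Blocks layout and a gr.Dataframe-based overall_leaderboard (the real app defines more tabs and its own component styling):

    import gradio as gr

    from src.populate import get_model_leaderboard_df  # import path assumed from the repo layout

    model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
    model_leaderboard_df = get_model_leaderboard_df(model_result_path)

    def overall_leaderboard(dataframe):
        # assumption: the Overall tab shows the precomputed frame as a read-only table
        return gr.Dataframe(value=dataframe, interactive=False)

    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
                leaderboard = overall_leaderboard(model_leaderboard_df)
            with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
                pass  # populated elsewhere in the real app

    demo.launch()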
src/leaderboard/read_evals.py CHANGED

@@ -30,10 +30,10 @@ class ModelResult:
         config = data.get("config")
         # Get model and org
         model = config.get("model_name")
-        org = config.get("
+        org = config.get("organization")
         license = config.get("license")
         knowledge_cutoff = config.get("knowledge_cutoff")
-
+
         # Extract results available in this file (some results are split in several files)
         results = {}
         for domain in Domains:
@@ -75,8 +75,8 @@ class ModelResult:
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        # for task in Tasks:
+        #     data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         for domain in Domains:
             data_dict[domain.value.col_name] = self.results[domain.value.dimension]
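The first two hunks fix the organization lookup (the config key is "organization") and switch to_dict() from per-Task columns to per-Domain columns: each member of Domains contributes one leaderboard column, col_name, filled from self.results[dimension]. The sketch below shows the shape this assumes for Domains; the Domain dataclass and the example column names are hypothetical, only col_name, dimension, and the result keys visible in the debug comments further down come from the diff. The remaining hunk, which follows, reworks get_raw_model_results().

    from dataclasses import dataclass
    from enum import Enum

    @dataclass(frozen=True)
    class Domain:
        dimension: str  # key into the per-model results dict
        col_name: str   # column header shown on the leaderboard

    class Domains(Enum):
        math_algebra = Domain("math-algebra", "Math (Algebra)")  # column names are placeholders
        math_probability = Domain("math-probability", "Math (Probability)")
        reasoning_logical = Domain("reasoning-logical", "Reasoning (Logical)")
        overall = Domain("overall", "Overall")

    # inside ModelResult.to_dict(): one column per domain, taken from self.results
    results = {"math-algebra": 99.2, "math-probability": 100, "reasoning-logical": None, "overall": 98.5}  # toy values
    data_dict = {d.value.col_name: results[d.value.dimension] for d in Domains}
    # a domain missing from `results` raises KeyError, which the caller treats as "incomplete" (see the try/except below)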
@@ -277,26 +277,48 @@ def get_raw_model_results(results_path: str) -> list[EvalResult]:
     except:
         data = eval(open(results_path).read()) # a list of dicts
 
+    # print("data", len(data))
+    # print(data[0])
+    # {'config': {'model_name': 'ChatGPT-4o-latest (2024-09-03)',
+    # 'organization': 'OpenAI', 'license': 'Proprietary',
+    # 'knowledge_cutoff': '2023/10'},
+    # 'results': {'math-algebra':
+    # {'Score': 99.19484702, 'Avg Rank': 1.666666667, 'Min Rank': 1, 'Max Rank': 3},
+    # 'math-probability': {'Score': 100, 'Avg Rank': 1, 'Min Rank': 1, 'Max Rank': 1},
+    # 'reasoning-logical': {'Avg Rank': 1, 'Min Rank': 1, 'Max Rank': 1},
+    # 'overall': {'Avg Rank': 2, 'Min Rank': 2, 'Max Rank': 2}}}
     eval_results = {}
 
     for result in data:
         # Creation of result
         eval_result = ModelResult.init_from_json_dict(result)
+        # print(eval_result)
+        # ModelResult(eval_name='OpenAI_ChatGPT-4o-latest (2024-09-03)',
+        # full_model='OpenAI/ChatGPT-4o-latest (2024-09-03)',
+        # org='OpenAI', model='ChatGPT-4o-latest (2024-09-03)',
+        # results={'overall': None}, license='Proprietary', knowledge_cutoff='2023/10')
 
-        # Store results of same eval together
         eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        eval_results[eval_name] = eval_result
+
+        # # Store results of same eval together
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
+        #     eval_results[eval_name] = eval_result
 
     results = []
     for v in eval_results.values():
+        # print(v.to_dict())
+        # {'eval_name': 'OpenAI_ChatGPT-4o-latest (2024-09-03)',
+        # 'Model': '<a target="_blank" href="https://huggingface.co/OpenAI/ChatGPT-4o-latest (2024-09-03)"
+        # style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">OpenAI/ChatGPT-4o-latest (2024-09-03)</a>',
+        # 'Hub License': 'Proprietary', 'Organization': 'OpenAI', 'Knowledge cutoff': '2023/10', 'Overall': None}
         try:
             v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
-
+
     return results
 
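The substantive change in this last hunk is how duplicate eval_names are handled in get_raw_model_results(): the previous logic (kept as comments) merged non-None values from several result files into one ModelResult, while the new line simply overwrites, so the last record for a model wins. A toy contrast of the two behaviours, using plain dicts in place of ModelResult.results:

    eval_results = {}

    def store_last_wins(eval_name, new_results):
        # new behaviour: each eval_name keeps only the most recent record
        eval_results[eval_name] = new_results

    def store_merged(eval_name, new_results):
        # previous (now commented-out) behaviour: fold non-None values across files
        if eval_name in eval_results:
            eval_results[eval_name].update({k: v for k, v in new_results.items() if v is not None})
        else:
            eval_results[eval_name] = new_results

    store_merged("OpenAI_ChatGPT-4o-latest (2024-09-03)", {"math-algebra": 99.2, "overall": None})  # toy scores
    store_merged("OpenAI_ChatGPT-4o-latest (2024-09-03)", {"overall": 98.5})
    # merged result keeps both keys: {'math-algebra': 99.2, 'overall': 98.5}
    # with store_last_wins, the first call's 'math-algebra' score would be discarded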
src/populate.py CHANGED

@@ -15,6 +15,9 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list
 
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # print(cols) # []
+    # print(df.columns) # ['eval_name', 'Model', 'Hub License', 'Organization', 'Knowledge cutoff', 'Overall']
+    # exit()
    for col in cols:
        if col not in df.columns:
            df[col] = None
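For context, get_model_leaderboard_df() builds the frame from the records produced by read_evals and backfills any requested column that the records do not contain; the new debug comments record that cols arrives empty and that the frame already carries eval_name, Model, Hub License, Organization, Knowledge cutoff and Overall. A small sketch of that backfill, with hypothetical cols entries and abbreviated record values:

    import pandas as pd

    all_data_json = [
        {"eval_name": "OpenAI_ChatGPT-4o-latest (2024-09-03)", "Model": "...", "Hub License": "Proprietary",
         "Organization": "OpenAI", "Knowledge cutoff": "2023/10", "Overall": None},
    ]

    cols = ["Overall", "Math"]  # hypothetical requested columns
    df = pd.DataFrame.from_records(all_data_json)
    for col in cols:
        if col not in df.columns:  # "Math" is not in the records...
            df[col] = None         # ...so it is added as an all-None column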