yzabc007 committed
Commit 4106f16
1 Parent(s): c18feb5

Update space

Files changed (3)
  1. app.py +2 -2
  2. src/leaderboard/read_evals.py +32 -10
  3. src/populate.py +3 -0
app.py CHANGED
@@ -96,7 +96,7 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
-# model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
+model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 def overall_leaderboard(dataframe):
@@ -129,7 +129,7 @@ with demo:
 
 
     with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
-        leaderboard = init_leaderboard(LEADERBOARD_DF)
+        leaderboard = overall_leaderboard(model_leaderboard_df)
 
     with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
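For context, a minimal sketch of the wiring this change enables. The bodies of `get_model_leaderboard_df` and `overall_leaderboard` below are stand-ins (assumptions, not the repo's code); only the call sites and the results path mirror the diff:

```python
import gradio as gr
import pandas as pd

def get_model_leaderboard_df(results_path: str) -> pd.DataFrame:
    # Stand-in for the loader in src/populate.py: the real one parses the
    # JSONL results file at results_path; here we fabricate a single row.
    return pd.DataFrame([{"Model": "example-model", "Overall": None}])

def overall_leaderboard(dataframe: pd.DataFrame):
    # Stand-in: render the leaderboard as a plain read-only table.
    return gr.Dataframe(value=dataframe, interactive=False)

model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
model_leaderboard_df = get_model_leaderboard_df(model_result_path)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
            leaderboard = overall_leaderboard(model_leaderboard_df)
```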
src/leaderboard/read_evals.py CHANGED
@@ -30,10 +30,10 @@ class ModelResult:
         config = data.get("config")
         # Get model and org
         model = config.get("model_name")
-        org = config.get("org_name")
+        org = config.get("organization")
         license = config.get("license")
         knowledge_cutoff = config.get("knowledge_cutoff")
-
+
         # Extract results available in this file (some results are split in several files)
         results = {}
         for domain in Domains:
@@ -75,8 +75,8 @@ class ModelResult:
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        # for task in Tasks:
+        #     data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
         for domain in Domains:
             data_dict[domain.value.col_name] = self.results[domain.value.dimension]
@@ -277,26 +277,48 @@ def get_raw_model_results(results_path: str) -> list[EvalResult]:
     except:
         data = eval(open(results_path).read()) # a list of dicts
 
+    # print("data", len(data))
+    # print(data[0])
+    # {'config': {'model_name': 'ChatGPT-4o-latest (2024-09-03)',
+    #  'organization': 'OpenAI', 'license': 'Proprietary',
+    #  'knowledge_cutoff': '2023/10'},
+    #  'results': {'math-algebra':
+    #   {'Score': 99.19484702, 'Avg Rank': 1.666666667, 'Min Rank': 1, 'Max Rank': 3},
+    #  'math-probability': {'Score': 100, 'Avg Rank': 1, 'Min Rank': 1, 'Max Rank': 1},
+    #  'reasoning-logical': {'Avg Rank': 1, 'Min Rank': 1, 'Max Rank': 1},
+    #  'overall': {'Avg Rank': 2, 'Min Rank': 2, 'Max Rank': 2}}}
     eval_results = {}
 
     for result in data:
         # Creation of result
         eval_result = ModelResult.init_from_json_dict(result)
+        # print(eval_result)
+        # ModelResult(eval_name='OpenAI_ChatGPT-4o-latest (2024-09-03)',
+        #   full_model='OpenAI/ChatGPT-4o-latest (2024-09-03)',
+        #   org='OpenAI', model='ChatGPT-4o-latest (2024-09-03)',
+        #   results={'overall': None}, license='Proprietary', knowledge_cutoff='2023/10')
 
-        # Store results of same eval together
         eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        eval_results[eval_name] = eval_result
+
+        # # Store results of same eval together
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
+        #     eval_results[eval_name] = eval_result
 
     results = []
    for v in eval_results.values():
+        # print(v.to_dict())
+        # {'eval_name': 'OpenAI_ChatGPT-4o-latest (2024-09-03)',
+        #  'Model': '<a target="_blank" href="https://huggingface.co/OpenAI/ChatGPT-4o-latest (2024-09-03)"
+        #   style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">OpenAI/ChatGPT-4o-latest (2024-09-03)</a>',
+        #  'Hub License': 'Proprietary', 'Organization': 'OpenAI', 'Knowledge cutoff': '2023/10', 'Overall': None}
        try:
            v.to_dict() # we test if the dict version is complete
            results.append(v)
        except KeyError: # not all eval values present
            continue
-
+
    return results

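The functional change here, beyond the key rename (`org_name` → `organization`), is how repeated `eval_name` entries are handled: the old field-by-field merge is commented out in favor of a plain last-write-wins overwrite. A toy comparison of the two behaviors (dict contents invented for illustration):

```python
# Two hypothetical result payloads sharing one eval_name.
first = {"overall": None, "math-algebra": 99.2}
second = {"overall": 1.5}

# Old behavior: merge, keeping only non-None values from the newer payload.
merged = dict(first)
merged.update({k: v for k, v in second.items() if v is not None})
print(merged)   # {'overall': 1.5, 'math-algebra': 99.2}

# New behavior: the newer payload replaces the older one wholesale,
# discarding fields seen only in earlier payloads.
print(second)   # {'overall': 1.5}
```

The two behaviors coincide only when each model appears once in the data; with results split across several entries (as the code's own comment anticipates), the overwrite drops earlier fields.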
src/populate.py CHANGED
@@ -15,6 +15,9 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
 
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # print(cols) # []
+    # print(df.columns) # ['eval_name', 'Model', 'Hub License', 'Organization', 'Knowledge cutoff', 'Overall']
+    # exit()
     for col in cols:
         if col not in df.columns:
             df[col] = None
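The added debug comments record that `cols` arrives empty here, so the loop below them is currently a no-op; when a caller does pass expected columns, the loop backfills any that are missing with None. A small self-contained illustration (column names assumed):

```python
import pandas as pd

df = pd.DataFrame.from_records([{"Model": "example-model", "Overall": None}])

cols = ["Model", "Overall", "Hub License"]  # hypothetical expected columns
for col in cols:
    if col not in df.columns:
        df[col] = None  # pad missing columns so downstream display code sees a uniform schema

print(df.columns.tolist())  # ['Model', 'Overall', 'Hub License']
```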