djstrong committed
Commit 28627fa
1 Parent(s): 6b50f19
Files changed (1)
  1. src/leaderboard/read_evals.py +13 -2
src/leaderboard/read_evals.py CHANGED
@@ -456,15 +456,26 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
                 if task_name in missing_results_for_task:
                     missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
                     if v.still_on_hub and task.value.benchmark in all_tasks:
-                        print(f'batch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
+                        for_run.append([r["n_shot"], task.value.benchmark, v.full_model])
+                        # print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
                 else:
                     missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
                     if v.still_on_hub and task.value.benchmark in all_tasks:
-                        print(f'batch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
+                        for_run.append([r["n_shot"], task.value.benchmark, v.full_model])
+                        # print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r["n_shot"]} {task.value.benchmark} {v.full_model}"')
             if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
                 missing_metadata.append(f"{v.full_model}")
             all_models.append((v.full_model, v.num_params, v.still_on_hub))
 
+    print(f"Missing sbatch results:")
+    for r in for_run:
+        fm=r[2].replace(',multiturn','')
+        if ',chat' in fm:
+            fm=fm.replace(',chat','')
+            print(f'sbatch start.sh "bash eval_model_task_bs1_chat.sh {r[0]} {r[1]} {fm}"')
+        else:
+            print(f'sbatch start.sh "bash eval_model_task_bs1.sh {r[0]} {r[1]} {fm}"')
+
     # print('missing_results_for_task', missing_results_for_task)
     for task, models in missing_results_for_task.items():
         print(f"Missing results for {task} for {len(models)} models")