pminervini committed on
Commit
c0e342c
1 Parent(s): d9f893d
Files changed (3)
  1. beta-cli.py +6 -1
  2. src/leaderboard/read_evals.py +6 -0
  3. src/populate.py +1 -1
beta-cli.py CHANGED
@@ -44,7 +44,7 @@ eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished
   # Sort the evals by priority (first submitted first run)
   eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)

-  eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
+  eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)

   result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
   result_name_to_result = {r.eval_name: r for r in eval_results}
@@ -64,3 +64,8 @@ for eval_request in eval_requests:

   if task_name not in eval_result.results:
   print('RUN THIS ONE!', result_name, task_name)
+
+ raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+ all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
+
+ breakpoint()
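The unchanged context in this hunk shows how the CLI pairs pending requests with existing results and prints the task runs that are still missing. A minimal sketch of that matching pattern, using plain dicts and invented names in place of the project's EvalRequest/EvalResult objects and helpers:

```python
# Hypothetical stand-ins: the real script builds these maps from EvalRequest and
# EvalResult objects via request_to_result_name(); plain dicts illustrate the idea.
result_name_to_request = {"org_model_float16": {"model": "org/model"}}
result_name_to_result = {"org_model_float16": {"results": {"arc:challenge": 31.1}}}
task_names = ["arc:challenge", "gsm8k"]  # assumed task list

for result_name, request in result_name_to_request.items():
    eval_result = result_name_to_result.get(result_name)
    for task_name in task_names:
        # Same check as in the diff: a task with no recorded score still needs a run.
        if eval_result is None or task_name not in eval_result["results"]:
            print('RUN THIS ONE!', result_name, task_name)
```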
src/leaderboard/read_evals.py CHANGED
@@ -123,6 +123,12 @@ class EvalResult:
   except Exception:
   print(f"Could not find request file for {self.org}/{self.model}")

+ def is_complete(self) -> bool:
+     for task in Tasks:
+         if task.value.benchmark not in self.results:
+             return False
+     return True
+
   def to_dict(self):
   """Converts the Eval Result to a dict compatible with our dataframe display"""
   average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
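The new is_complete() method simply verifies that every benchmark configured in Tasks has a score recorded in self.results. A minimal, self-contained sketch of the same check, with stand-in Task/Tasks definitions (the real ones live elsewhere in this repo, so the names and benchmark strings here are illustrative only):

```python
from dataclasses import dataclass, field
from enum import Enum


@dataclass
class Task:
    # Mirrors the task.value.benchmark access used by is_complete()
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    arc = Task("arc:challenge", "acc_norm", "ARC")
    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
    gsm8k = Task("gsm8k", "acc", "GSM8K")


@dataclass
class FakeEvalResult:
    # results maps benchmark name -> score, like EvalResult.results
    results: dict = field(default_factory=dict)

    def is_complete(self) -> bool:
        # Same logic as the added method: every configured benchmark must be present.
        for task in Tasks:
            if task.value.benchmark not in self.results:
                return False
        return True


full = FakeEvalResult({"arc:challenge": 31.1, "hellaswag": 51.4, "gsm8k": 0.99})
partial = FakeEvalResult({"arc:challenge": 31.1})
print(full.is_complete(), partial.is_complete())  # True False
```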
src/populate.py CHANGED
@@ -15,7 +15,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
   # EvalResult(eval_name='EleutherAI_pythia-1.3b_torch.float16', full_model='EleutherAI/pythia-1.3b', org='EleutherAI', model='pythia-1.3b', revision='34b668ff0acfe56f2d541aa46b385557ee39eb3f', results={'arc:challenge': 31.14334470989761, 'hellaswag': 51.43397729535949, 'hendrycksTest': 26.55151159544371, 'truthfulqa:mc': 39.24322830092449, 'winogrande': 57.37963693764798, 'gsm8k': 0.9855951478392722, 'drop': 4.056312919463095}, precision='torch.float16', model_type=<ModelType.PT: ModelTypeDetails(name='pretrained', symbol='🟢')>, weight_type='Original', architecture='GPTNeoXForCausalLM', license='apache-2.0', likes=7, num_params=1.312, date='2023-09-09T10:52:17Z', still_on_hub=True)
   # EvalResult and get_raw_eval_results are defined in ./src/leaderboard/read_evals.py, the results slots are not hardcoded
   raw_data = get_raw_eval_results(results_path, requests_path)
- all_data_json = [v.to_dict() for v in raw_data]
+ all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
   # all_data_json.append(baseline_row)
   filter_models(all_data_json)
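The only functional change in populate.py is the if v.is_complete() guard, so evaluations with missing benchmarks are dropped before the leaderboard rows are built. A hedged sketch of the effect, with a hypothetical StubResult standing in for the real EvalResult:

```python
import pandas as pd


class StubResult:
    """Hypothetical stand-in for EvalResult, just enough to show the filter."""

    def __init__(self, name: str, results: dict, complete: bool):
        self.name = name
        self.results = results
        self._complete = complete

    def is_complete(self) -> bool:
        return self._complete

    def to_dict(self) -> dict:
        return {"model": self.name, **self.results}


raw_data = [
    StubResult("model-a", {"arc:challenge": 31.1, "gsm8k": 1.0}, complete=True),
    StubResult("model-b", {"arc:challenge": 40.2}, complete=False),  # missing tasks
]

# Same pattern as the patched get_leaderboard_df(): incomplete evals never
# reach the leaderboard rows.
all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
df = pd.DataFrame.from_records(all_data_json)
print(df)  # only model-a survives the filter
```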