pminervini committed on
Commit
717e20e
1 Parent(s): b0b4782
Files changed (1) hide show
  1. src/populate.py +4 -3
src/populate.py CHANGED
@@ -6,10 +6,10 @@ import pandas as pd
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.filter_models import filter_models
9
- from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[dict, pd.DataFrame]:
13
  # Returns a list of EvalResult
14
  # raw_data[0]:
15
  # EvalResult(eval_name='EleutherAI_pythia-1.3b_torch.float16', full_model='EleutherAI/pythia-1.3b', org='EleutherAI', model='pythia-1.3b', revision='34b668ff0acfe56f2d541aa46b385557ee39eb3f', results={'arc:challenge': 31.14334470989761, 'hellaswag': 51.43397729535949, 'hendrycksTest': 26.55151159544371, 'truthfulqa:mc': 39.24322830092449, 'winogrande': 57.37963693764798, 'gsm8k': 0.9855951478392722, 'drop': 4.056312919463095}, precision='torch.float16', model_type=<ModelType.PT: ModelTypeDetails(name='pretrained', symbol='🟢')>, weight_type='Original', architecture='GPTNeoXForCausalLM', license='apache-2.0', likes=7, num_params=1.312, date='2023-09-09T10:52:17Z', still_on_hub=True)
@@ -20,7 +20,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
20
  filter_models(all_data_json)
21
 
22
  df = pd.DataFrame.from_records(all_data_json)
23
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
24
  df = df[cols].round(decimals=2)
25
 
26
  # filter out if any of the benchmarks have not been produced
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.filter_models import filter_models
9
+ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
10
 
11
 
12
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list[EvalResult], pd.DataFrame]:
13
  # Returns a list of EvalResult
14
  # raw_data[0]:
15
  # EvalResult(eval_name='EleutherAI_pythia-1.3b_torch.float16', full_model='EleutherAI/pythia-1.3b', org='EleutherAI', model='pythia-1.3b', revision='34b668ff0acfe56f2d541aa46b385557ee39eb3f', results={'arc:challenge': 31.14334470989761, 'hellaswag': 51.43397729535949, 'hendrycksTest': 26.55151159544371, 'truthfulqa:mc': 39.24322830092449, 'winogrande': 57.37963693764798, 'gsm8k': 0.9855951478392722, 'drop': 4.056312919463095}, precision='torch.float16', model_type=<ModelType.PT: ModelTypeDetails(name='pretrained', symbol='🟢')>, weight_type='Original', architecture='GPTNeoXForCausalLM', license='apache-2.0', likes=7, num_params=1.312, date='2023-09-09T10:52:17Z', still_on_hub=True)
 
20
  filter_models(all_data_json)
21
 
22
  df = pd.DataFrame.from_records(all_data_json)
23
+ if AutoEvalColumn.average.name in df:
24
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
25
  df = df[cols].round(decimals=2)
26
 
27
  # filter out if any of the benchmarks have not been produced