T145 committed
Commit 5ef372e · 1 Parent(s): 5af06fb

Set constant keys to make future changes easy

Files changed (1):
  1. functions.py +36 -31
functions.py CHANGED
@@ -21,6 +21,12 @@ The purpose of this PR is to add evaluation results from the Open LLM Leaderboard

 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""

+KEY_IFEVAL = "IFEval"
+KEY_BBH = "BBH"
+KEY_MATH = "MATH Lvl 5"
+KEY_GPQA = "GPQA"
+KEY_MUSR = "MuSR"
+KEY_MMLU = "MMLU-Pro"

 def normalize_within_range(value, lower_bound=0, higher_bound=1):
     return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
@@ -54,18 +60,22 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):

     # Average BBH score
     bbh_score = sum(bbh_scores) / len(bbh_scores)
+    bbh_score = float(round(bbh_score, 2))

     # Calculate the MATH score
     math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
     math_score = normalize_within_range(math_raw_score, 0, 1.0)
+    math_score = float(round(math_score, 2))

     # Normalize GPQA scores
     gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
     gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+    gpqa_score = float(round(gpqa_score, 2))

     # Normalize MMLU PRO scores
-    mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
-    mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+    mmlu_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+    mmlu_score = normalize_within_range(mmlu_raw_score, 0.1, 1.0)
+    mmlu_score = float(round(mmlu_score, 2))

     # Compute IFEval
     ifeval_inst_score = (
@@ -77,6 +87,7 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):

     # Average IFEval scores
     ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+    ifeval_score = float(round(ifeval_score, 2))

     # Normalize MUSR scores
     musr_scores = []
@@ -92,31 +103,25 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
         del dataset

     musr_score = sum(musr_scores) / len(musr_scores)
+    musr_score = float(round(musr_score, 2))

     # Calculate overall score
-    overall_score = (
-        bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+    average_score = (
+        bbh_score + math_score + gpqa_score + mmlu_score + musr_score + ifeval_score
     ) / 6
+    average_score = float(round(average_score, 2))

-    # Round all scores to 2 decimal places
-    bbh_score = float(round(bbh_score, 2))
-    math_score = float(round(math_score, 2))
-    gpqa_score = float(round(gpqa_score, 2))
-    mmlu_pro_score = float(round(mmlu_pro_score, 2))
-    musr_score = float(round(musr_score, 2))
-    ifeval_score = float(round(ifeval_score, 2))
-    overall_score = float(round(overall_score, 2))
     results = {
         "Model": repo,
         "Precision": precision,
         "Revision": revision,
-        "Average": overall_score,
-        "IFEval": ifeval_score,
-        "BBH": bbh_score,
-        "MATH Lvl 5": math_score,
-        "GPQA": gpqa_score,
-        "MUSR": musr_score,
-        "MMLU-PRO": mmlu_pro_score,
+        "Average": average_score,
+        KEY_IFEVAL: ifeval_score,
+        KEY_BBH: bbh_score,
+        KEY_MATH: math_score,
+        KEY_GPQA: gpqa_score,
+        KEY_MUSR: musr_score,
+        KEY_MMLU: mmlu_score,
     }
     # pprint(results, sort_dicts=False)
     return results
@@ -141,61 +146,61 @@ def get_query_url(repo: str):

 def get_task_summary(results):
     return {
-        "IFEval": {
+        KEY_IFEVAL: {
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value": results["IFEval"],
+            "metric_value": results[KEY_IFEVAL],
             "dataset_config": None,
             "dataset_split": "train",
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
-        "BBH": {
+        KEY_BBH: {
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["BBH"],
+            "metric_value": results[KEY_BBH],
             "dataset_config": None,
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
-        "MATH Lvl 5": {
+        KEY_MATH: {
             "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
-            "metric_value": results["MATH Lvl 5"],
+            "metric_value": results[KEY_MATH],
             "dataset_config": None,
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
         },
-        "GPQA": {
+        KEY_GPQA: {
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["GPQA"],
+            "metric_value": results[KEY_GPQA],
             "dataset_config": None,
             "dataset_split": "train",
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
-        "MuSR": {
+        KEY_MUSR: {
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["MUSR"],
+            "metric_value": results[KEY_MUSR],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
-        "MMLU-PRO": {
+        KEY_MMLU: {
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value": results["MMLU-PRO"],
+            "metric_value": results[KEY_MMLU],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},