Set constant keys to make future changes easy
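The diff below replaces the leaderboard key strings that were previously repeated as literals in calculate_results() and get_task_summary() with module-level constants, so both places are guaranteed to stay in sync. A minimal sketch of the pattern, separate from the PR's actual code (build_results and read_results are hypothetical helpers used only for illustration):

KEY_IFEVAL = "IFEval"  # single source of truth for the key name


def build_results(ifeval_score: float) -> dict:
    # The writer side uses the constant instead of a string literal...
    return {KEY_IFEVAL: ifeval_score}


def read_results(results: dict) -> float:
    # ...and so does the reader side, so renaming the key later means
    # editing KEY_IFEVAL once instead of hunting down every literal.
    return results[KEY_IFEVAL]


print(read_results(build_results(75.5)))  # 75.5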
functions.py CHANGED (+36 -31)
@@ -21,6 +21,12 @@ The purpose of this PR is to add evaluation results from the Open LLM Leaderboar
 
 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""
 
+KEY_IFEVAL = "IFEval"
+KEY_BBH = "BBH"
+KEY_MATH = "MATH Lvl 5"
+KEY_GPQA = "GPQA"
+KEY_MUSR = "MuSR"
+KEY_MMLU = "MMLU-Pro"
 
 def normalize_within_range(value, lower_bound=0, higher_bound=1):
     return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
@@ -54,18 +60,22 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
 
     # Average BBH score
     bbh_score = sum(bbh_scores) / len(bbh_scores)
+    bbh_score = float(round(bbh_score, 2))
 
     # Calculate the MATH score
     math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
     math_score = normalize_within_range(math_raw_score, 0, 1.0)
+    math_score = float(round(math_score, 2))
 
     # Normalize GPQA scores
     gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
     gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+    gpqa_score = float(round(gpqa_score, 2))
 
     # Normalize MMLU PRO scores
-    mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
-    mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+    mmlu_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+    mmlu_score = normalize_within_range(mmlu_raw_score, 0.1, 1.0)
+    mmlu_score = float(round(mmlu_score, 2))
 
     # Compute IFEval
     ifeval_inst_score = (
@@ -77,6 +87,7 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
 
     # Average IFEval scores
     ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+    ifeval_score = float(round(ifeval_score, 2))
 
     # Normalize MUSR scores
     musr_scores = []
@@ -92,31 +103,25 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
     del dataset
 
     musr_score = sum(musr_scores) / len(musr_scores)
+    musr_score = float(round(musr_score, 2))
 
     # Calculate overall score
-    overall_score = (
-        bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+    average_score = (
+        bbh_score + math_score + gpqa_score + mmlu_score + musr_score + ifeval_score
     ) / 6
+    average_score = float(round(average_score, 2))
 
-    # Round all scores to 2 decimal places
-    bbh_score = float(round(bbh_score, 2))
-    math_score = float(round(math_score, 2))
-    gpqa_score = float(round(gpqa_score, 2))
-    mmlu_pro_score = float(round(mmlu_pro_score, 2))
-    musr_score = float(round(musr_score, 2))
-    ifeval_score = float(round(ifeval_score, 2))
-    overall_score = float(round(overall_score, 2))
     results = {
         "Model": repo,
         "Precision": precision,
         "Revision": revision,
-        "Average": overall_score,
-        "IFEval": ifeval_score,
-        "BBH": bbh_score,
-        "MATH Lvl 5": math_score,
-        "GPQA": gpqa_score,
-        "MuSR": musr_score,
-        "MMLU-Pro": mmlu_pro_score,
+        "Average": average_score,
+        KEY_IFEVAL: ifeval_score,
+        KEY_BBH: bbh_score,
+        KEY_MATH: math_score,
+        KEY_GPQA: gpqa_score,
+        KEY_MUSR: musr_score,
+        KEY_MMLU: mmlu_score,
     }
     # pprint(results, sort_dicts=False)
     return results
@@ -141,61 +146,61 @@ def get_query_url(repo: str):
 
 def get_task_summary(results):
     return {
-        "IFEval": {
+        KEY_IFEVAL: {
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value": results["IFEval"],
+            "metric_value": results[KEY_IFEVAL],
             "dataset_config": None,
             "dataset_split": "train",
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
-        "BBH": {
+        KEY_BBH: {
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["BBH"],
+            "metric_value": results[KEY_BBH],
             "dataset_config": None,
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
-        "MATH Lvl 5": {
+        KEY_MATH: {
             "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
-            "metric_value": results["MATH Lvl 5"],
+            "metric_value": results[KEY_MATH],
             "dataset_config": None,
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
         },
-        "GPQA": {
+        KEY_GPQA: {
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["GPQA"],
+            "metric_value": results[KEY_GPQA],
             "dataset_config": None,
             "dataset_split": "train",
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
-        "MuSR": {
+        KEY_MUSR: {
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["MuSR"],
+            "metric_value": results[KEY_MUSR],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
        },
-        "MMLU-Pro": {
+        KEY_MMLU: {
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value": results["MMLU-Pro"],
+            "metric_value": results[KEY_MMLU],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
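For reference, normalize_within_range() (unchanged by this PR) rescales a raw accuracy so that the benchmark's random-guess floor maps to 0 and a perfect score maps to 100; the new float(round(..., 2)) lines round each rescaled score before it enters the six-way average, so the reported "Average" is now the mean of the already-rounded values. A small self-contained check of that behaviour, assuming only numpy; the sample scores at the end are made up for illustration:

import numpy as np


def normalize_within_range(value, lower_bound=0, higher_bound=1):
    # Same formula as in functions.py: clip below the floor, rescale to 0-100.
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100


# GPQA uses a 0.25 lower bound (the random-guess floor for 4-option multiple choice).
assert normalize_within_range(0.25, 0.25, 1.0) == 0.0
assert normalize_within_range(1.0, 0.25, 1.0) == 100.0
print(float(round(normalize_within_range(0.40, 0.25, 1.0), 2)))  # 20.0

# The PR rounds each benchmark score first, then averages the rounded values.
scores = [45.12, 30.55, 12.08, 40.0, 18.5, 70.25]  # hypothetical rounded scores
average_score = float(round(sum(scores) / 6, 2))
print(average_score)  # 36.08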