Set constant keys to make future changes easy
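The diff below replaces the leaderboard key strings that were previously repeated as literals in calculate_results() and get_task_summary() with module-level constants, so both places are guaranteed to stay in sync. A minimal sketch of the pattern, separate from the PR's actual code (build_results and read_results are hypothetical helpers used only for illustration):

KEY_IFEVAL = "IFEval"  # single source of truth for the key name


def build_results(ifeval_score: float) -> dict:
    # The writer side uses the constant instead of a string literal...
    return {KEY_IFEVAL: ifeval_score}


def read_results(results: dict) -> float:
    # ...and so does the reader side, so renaming the key later means
    # editing KEY_IFEVAL once instead of hunting down every literal.
    return results[KEY_IFEVAL]


print(read_results(build_results(75.5)))  # 75.5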
functions.py CHANGED (+36 -31)
@@ -21,6 +21,12 @@ The purpose of this PR is to add evaluation results from the Open LLM Leaderboar
 
 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""
 
+KEY_IFEVAL = "IFEval"
+KEY_BBH = "BBH"
+KEY_MATH = "MATH Lvl 5"
+KEY_GPQA = "GPQA"
+KEY_MUSR = "MuSR"
+KEY_MMLU = "MMLU-Pro"
 
 def normalize_within_range(value, lower_bound=0, higher_bound=1):
     return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
@@ -54,18 +60,22 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
 
     # Average BBH score
     bbh_score = sum(bbh_scores) / len(bbh_scores)
+    bbh_score = float(round(bbh_score, 2))
 
     # Calculate the MATH score
     math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
     math_score = normalize_within_range(math_raw_score, 0, 1.0)
+    math_score = float(round(math_score, 2))
 
     # Normalize GPQA scores
     gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
     gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+    gpqa_score = float(round(gpqa_score, 2))
 
     # Normalize MMLU PRO scores
-    mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
-    mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+    mmlu_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+    mmlu_score = normalize_within_range(mmlu_raw_score, 0.1, 1.0)
+    mmlu_score = float(round(mmlu_score, 2))
 
     # Compute IFEval
     ifeval_inst_score = (
@@ -77,6 +87,7 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
 
     # Average IFEval scores
     ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+    ifeval_score = float(round(ifeval_score, 2))
 
     # Normalize MUSR scores
     musr_scores = []
@@ -92,31 +103,25 @@ def calculate_results(repo: str, pool: urllib3.PoolManager):
     del dataset
 
     musr_score = sum(musr_scores) / len(musr_scores)
+    musr_score = float(round(musr_score, 2))
 
     # Calculate overall score
-    overall_score = (
-        bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+    average_score = (
+        bbh_score + math_score + gpqa_score + mmlu_score + musr_score + ifeval_score
     ) / 6
+    average_score = float(round(average_score, 2))
 
-    # Round all scores to 2 decimal places
-    bbh_score = float(round(bbh_score, 2))
-    math_score = float(round(math_score, 2))
-    gpqa_score = float(round(gpqa_score, 2))
-    mmlu_pro_score = float(round(mmlu_pro_score, 2))
-    musr_score = float(round(musr_score, 2))
-    ifeval_score = float(round(ifeval_score, 2))
-    overall_score = float(round(overall_score, 2))
     results = {
         "Model": repo,
         "Precision": precision,
         "Revision": revision,
-        "Average": overall_score,
-        "IFEval": ifeval_score,
-        "BBH": bbh_score,
-        "MATH Lvl 5": math_score,
-        "GPQA": gpqa_score,
-        "MuSR": musr_score,
-        "MMLU-Pro": mmlu_pro_score,
+        "Average": average_score,
+        KEY_IFEVAL: ifeval_score,
+        KEY_BBH: bbh_score,
+        KEY_MATH: math_score,
+        KEY_GPQA: gpqa_score,
+        KEY_MUSR: musr_score,
+        KEY_MMLU: mmlu_score,
     }
     # pprint(results, sort_dicts=False)
     return results
@@ -141,61 +146,61 @@ def get_query_url(repo: str):
 
 def get_task_summary(results):
     return {
-        "IFEval": {
+        KEY_IFEVAL: {
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value": results["IFEval"],
+            "metric_value": results[KEY_IFEVAL],
             "dataset_config": None,
             "dataset_split": "train",
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
-        "BBH": {
+        KEY_BBH: {
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["BBH"],
+            "metric_value": results[KEY_BBH],
             "dataset_config": None,
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
-        "MATH Lvl 5": {
+        KEY_MATH: {
             "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
-            "metric_value": results["MATH Lvl 5"],
+            "metric_value": results[KEY_MATH],
             "dataset_config": None,
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
         },
-        "GPQA": {
+        KEY_GPQA: {
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["GPQA"],
+            "metric_value": results[KEY_GPQA],
             "dataset_config": None,
             "dataset_split": "train",
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
-        "MuSR": {
+        KEY_MUSR: {
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": results["MuSR"],
+            "metric_value": results[KEY_MUSR],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
        },
-        "MMLU-Pro": {
+        KEY_MMLU: {
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value": results["MMLU-Pro"],
+            "metric_value": results[KEY_MMLU],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
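For reference, normalize_within_range() (unchanged by this PR) rescales a raw accuracy so that the benchmark's random-guess floor maps to 0 and a perfect score maps to 100; the new float(round(..., 2)) lines round each rescaled score before it enters the six-way average, so the reported "Average" is now the mean of the already-rounded values. A small self-contained check of that behaviour, assuming only numpy; the sample scores at the end are made up for illustration:

import numpy as np


def normalize_within_range(value, lower_bound=0, higher_bound=1):
    # Same formula as in functions.py: clip below the floor, rescale to 0-100.
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100


# GPQA uses a 0.25 lower bound (the random-guess floor for 4-option multiple choice).
assert normalize_within_range(0.25, 0.25, 1.0) == 0.0
assert normalize_within_range(1.0, 0.25, 1.0) == 100.0
print(float(round(normalize_within_range(0.40, 0.25, 1.0), 2)))  # 20.0

# The PR rounds each benchmark score first, then averages the rounded values.
scores = [45.12, 30.55, 12.08, 40.0, 18.5, 70.25]  # hypothetical rounded scores
average_score = float(round(sum(scores) / 6, 2))
print(average_score)  # 36.08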