alozowski committed
Commit f86eaae
1 Parent(s): 87e47c2

Fixing WIP
src/display/utils.py CHANGED
@@ -1,9 +1,30 @@
from dataclasses import dataclass, make_dataclass
from enum import Enum
import json
+ import logging
+ from datetime import datetime
import pandas as pd


+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def parse_datetime(datetime_str):
+     formats = [
+         "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
+         "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
+         "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
+     ]
+
+     for fmt in formats:
+         try:
+             return datetime.strptime(datetime_str, fmt)
+         except ValueError:
+             continue
+     # in rare cases set unix start time for files with incorrect time (legacy files)
+     logging.error(f"No valid date format found for: {datetime_str}")
+     return datetime(1970, 1, 1)
+
def load_json_data(file_path):
    """Safely load JSON data from a file."""
    try:
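For context, a minimal sketch (not part of the commit) of how the relocated parse_datetime helper behaves; the timestamp strings below are invented examples:

from src.display.utils import parse_datetime

# Each accepted layout parses into a regular datetime object
print(parse_datetime("2023-11-21T18-10-08.138000"))  # dashes in the time part
print(parse_datetime("2023-11-21T18:10:08.138000"))  # standard colons
# If no format matches, an error is logged and the Unix epoch is returned as a fallback
print(parse_datetime("not-a-timestamp"))             # 1970-01-01 00:00:00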
src/leaderboard/filter_models.py CHANGED
@@ -1,8 +1,6 @@
- import logging
from src.display.formatting import model_hyperlink
from src.display.utils import AutoEvalColumn

- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Models which have been flagged by users as being problematic for a reason or another
# (Model name to forum discussion link)
@@ -141,7 +139,6 @@ def flag_models(leaderboard_data: list[dict]):
        else:
            flag_key = model_data[AutoEvalColumn.fullname.name]
        if flag_key in FLAGGED_MODELS:
-             # logging.info(f"Flagged model: {flag_key}")  # Do we need to print out the list of flagged models?
            issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
            issue_link = model_hyperlink(
                FLAGGED_MODELS[flag_key],
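As an aside, a small sketch of why the module-level basicConfig call could be dropped here, assuming src.display.utils (which now calls logging.basicConfig at import time) is imported before any logging happens:

import logging

import src.display.utils  # importing this module configures the root logger once

# Log calls from any other module propagate to the already-configured root logger
logging.getLogger(__name__).info("flagged-model checks reuse the shared logging setup")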
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
import json
from pathlib import Path
- from datetime import datetime
from json import JSONDecodeError
import logging
import math
@@ -14,7 +13,7 @@ from tqdm.contrib.logging import logging_redirect_tqdm
import numpy as np

from src.display.formatting import make_clickable_model
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -54,7 +53,14 @@ class EvalResult:
        org_and_model = config.get("model_name", "").split("/", 1)
        org = org_and_model[0] if len(org_and_model) > 1 else None
        model = org_and_model[-1]
-         result_key = "_".join(filter(None, [*org_and_model, precision.value.name]))
+         if len(org_and_model) == 1:
+             org = None
+             model = org_and_model[0]
+             result_key = f"{model}_{precision.value.name}"
+         else:
+             org = org_and_model[0]
+             model = org_and_model[1]
+             result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        results = cls.extract_results(data)  # Properly call the method to extract results
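For illustration, a standalone sketch of the new result_key construction; the model names and the precision value are invented examples rather than real leaderboard entries:

precision_name = "bfloat16"  # stands in for precision.value.name

for model_name in ["mistralai/Mistral-7B-v0.1", "gpt2"]:
    org_and_model = model_name.split("/", 1)
    if len(org_and_model) == 1:
        # no org prefix: key is "<model>_<precision>", e.g. "gpt2_bfloat16"
        result_key = f"{org_and_model[0]}_{precision_name}"
    else:
        # org/model pair: key is "<org>_<model>_<precision>", e.g. "mistralai_Mistral-7B-v0.1_bfloat16"
        result_key = f"{org_and_model[0]}_{org_and_model[1]}_{precision_name}"
    print(result_key)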
@@ -71,26 +77,39 @@ class EvalResult:

    @staticmethod
    def extract_results(data: Dict) -> Dict[str, float]:
+         """
+         Extracts and computes average scores from test result data for different benchmarks.
+         Skips entries based on specific conditions and handles NaN values appropriately.
+         Returns a dictionary mapping each benchmark to its averaged score, expressed as a percentage.
+
+         Parameters:
+         - data (Dict): Input data with 'versions' and 'results'.
+
+         Returns:
+         - Dict[str, float]: A dictionary with benchmark names and their computed average scores.
+         """
        results = {}
        for task in Tasks:
-             task_value = task.value
+             task = task.value

-             if task_value.benchmark == "hendrycksTest":
-                 if any(data.get("versions", {}).get(mmlu_k, 1) == 0 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]):
-                     continue
+             # We skip old mmlu entries
+             if task.benchmark == "hendrycksTest":
+                 for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
+                     if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
+                         continue

-             if task_value.benchmark == "truthfulqa:mc":
-                 task_key = "harness|truthfulqa:mc|0"
-                 if task_key in data["results"]:
-                     task_metric_value = data["results"][task_key][task_value.metric]
-                     if math.isnan(float(task_metric_value)):
-                         results[task_value.benchmark] = 0.0
-                         continue
+             # Some truthfulQA values are NaNs
+             if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
+                 if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
+                     results[task.benchmark] = 0.0
+                     continue

-             accs = [float(v.get(task_value.metric, 0)) for k, v in data["results"].items() if task_value.benchmark in k and v.get(task_value.metric, None) is not None]
-             if accs:
-                 mean_acc = np.mean(accs) * 100.0
-                 results[task_value.benchmark] = mean_acc
+             # We average all scores of a given metric (mostly for mmlu)
+             accs = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
+             if not accs or any(acc is None for acc in accs):
+                 continue
+
+             results[task.benchmark] = np.mean(accs) * 100.0

        return results
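To make the averaging step concrete, a self-contained sketch with made-up subtask scores (the benchmark and metric names mirror the code above; the numbers are invented):

import numpy as np

metric = "acc_norm"
benchmark = "hendrycksTest"
sample_results = {
    "harness|hendrycksTest-abstract_algebra|5": {"acc_norm": 0.50},
    "harness|hendrycksTest-anatomy|5": {"acc_norm": 0.70},
}

# Collect every subtask score for the benchmark, then average and scale to percent
accs = [v.get(metric) for k, v in sample_results.items() if benchmark in k]
if accs and all(acc is not None for acc in accs):
    print(np.mean(accs) * 100.0)  # 60.0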
 
@@ -192,23 +211,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
    return request_file


- def parse_datetime(datetime_str):
-     formats = [
-         "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
-         "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
-         "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
-     ]
-
-     for fmt in formats:
-         try:
-             return datetime.strptime(datetime_str, fmt)
-         except ValueError:
-             continue
-     # in rare cases set unix start time for files with incorrect time (legacy files)
-     logging.error(f"No valid date format found for: {datetime_str}")
-     return datetime(1970, 1, 1)
-
-
def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    with open(dynamic_path) as f:
@@ -246,7 +248,8 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError as e:
-             logging.error(f"Error while checking model {k} dict, no key: {e}")  # not all eval values present
+             logging.error(f"Error while checking model {k} {v.date} json, no key: {e}")  # not all eval values present
            continue

-     return results
+     return results
+