clefourrier and alozowski (HF staff) committed
Commit a5d34d3
Parent: b7d036c

performance-improvement (#705)


- read_evals initial change (705a80cbc41d99ade1f153597b6a9615e9e49a6e)
- improved logging (dadbd309a2806d85f67d888071f2f462a8631573)
- wip improvement (79b2cd565d40f76388770b0703b07431d41efe2a)
- more read_evals.py improvement (9b133aab61075d213546baa519cd392206ea5d05)
- Updated app.py download_dataset function (87e47c26a99aa08208c7aca46842ef9a3f2b078d)
- Fixing WIP (f86eaae89ef990a5d0066fb92946b8d8648adfa4)
- Changes as per comments (c74b7d7ce23fd9f7df60deddf8789e51288d1821)


Co-authored-by: Alina Lozovskaya <alozowski@users.noreply.huggingface.co>

app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import time
 import logging
 import gradio as gr
 import pandas as pd
@@ -49,6 +50,9 @@ from src.tools.collections import update_collections
 from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()
 
@@ -57,12 +61,24 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
-    """Attempt to download dataset with retries."""
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+    return wrapper
+
+
+@time_diff_wrapper
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
     attempt = 0
     while attempt < max_attempts:
         try:
-            print(f"Downloading {repo_id} to {local_dir}")
+            logging.info(f"Downloading {repo_id} to {local_dir}")
             snapshot_download(
                 repo_id=repo_id,
                 local_dir=local_dir,
@@ -71,21 +87,25 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3):
                 etag_timeout=30,
                 max_workers=8,
             )
+            logging.info("Download successful")
             return
         except Exception as e:
-            logging.error(f"Error downloading {repo_id}: {e}")
+            wait_time = backoff_factor**attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
             attempt += 1
-            if attempt == max_attempts:
-                restart_space()
-
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def init_space(full_init: bool = True):
     """Initializes the application space, loading only necessary data."""
     if full_init:
         # These downloads only occur on full initialization
-        download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-        download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
+        try:
+            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+            download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
+            download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
+        except Exception:
+            restart_space()
 
     # Always retrieve the leaderboard DataFrame
     raw_data, original_df = get_leaderboard_df(
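
For reference, the retry schedule in the new download_dataset grows geometrically: attempt n waits backoff_factor ** n seconds (1.0 s, 1.5 s, 2.25 s with the default factor of 1.5) before retrying, and the function raises once all attempts are exhausted so init_space can fall back to restart_space(). A minimal standalone sketch of the same pattern, not part of the commit; the retry_with_backoff name and the fetch callable are hypothetical:

import time
import logging

def retry_with_backoff(fetch, max_attempts=3, backoff_factor=1.5):
    """Call `fetch` until it succeeds, sleeping backoff_factor**attempt between failures."""
    for attempt in range(max_attempts):
        try:
            return fetch()
        except Exception as e:
            wait_time = backoff_factor**attempt  # 1.0, 1.5, 2.25, ...
            logging.error(f"Attempt {attempt + 1} failed: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
    raise Exception(f"Failed after {max_attempts} attempts")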
pyproject.toml CHANGED
@@ -1,9 +1,15 @@
 [tool.ruff]
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-lint.select = ["E", "F"]
-lint.ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
-lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+line-length = 120
+target-version = "py312"
+include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]
+ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004","D107","FA102"]
+fixable=["ALL"]
+select=["ALL"]
+
+[tool.ruff.lint]
+select = ["E", "F"]
+fixable = ["ALL"]
+ignore = ["E501"] # line too long (black is taking care of this)
 
 [tool.isort]
 profile = "black"
src/display/utils.py CHANGED
@@ -1,9 +1,30 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 import json
+import logging
+from datetime import datetime
 import pandas as pd
 
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+def parse_datetime(datetime_str):
+    formats = [
+        "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
+        "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
+        "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
+    ]
+
+    for fmt in formats:
+        try:
+            return datetime.strptime(datetime_str, fmt)
+        except ValueError:
+            continue
+    # in rare cases set unix start time for files with incorrect time (legacy files)
+    logging.error(f"No valid date format found for: {datetime_str}")
+    return datetime(1970, 1, 1)
+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
    try:
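
A quick usage sketch of the new helper (the timestamp values below are hypothetical): parse_datetime accepts the three timestamp spellings that appear in result filenames and falls back to the Unix epoch when none match.

from src.display.utils import parse_datetime

for ts in ["2024-04-15T10-30-00.123456", "2024-04-15T10:30:00.123456", "garbage"]:
    print(ts, "->", parse_datetime(ts))
# The first two parse to the same instant; "garbage" logs an error and returns datetime(1970, 1, 1).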
src/envs.py CHANGED
@@ -26,7 +26,7 @@ if not os.access(HF_HOME, os.W_OK):
     HF_HOME = "."
     os.environ["HF_HOME"] = HF_HOME
 else:
-    print(f"Write access confirmed for HF_HOME")
+    print("Write access confirmed for HF_HOME")
 
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
src/leaderboard/filter_models.py CHANGED
@@ -1,6 +1,7 @@
 from src.display.formatting import model_hyperlink
 from src.display.utils import AutoEvalColumn
 
+
 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
 FLAGGED_MODELS = {
@@ -137,10 +138,7 @@ def flag_models(leaderboard_data: list[dict]):
             flag_key = "merged"
         else:
            flag_key = model_data[AutoEvalColumn.fullname.name]
-
-        print(f"model check: {flag_key}")
         if flag_key in FLAGGED_MODELS:
-            print(f"Flagged model: {flag_key}")
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
             issue_link = model_hyperlink(
                 FLAGGED_MODELS[flag_key],
src/leaderboard/read_evals.py CHANGED
@@ -1,55 +1,58 @@
-import glob
 import json
+from pathlib import Path
+from json import JSONDecodeError
+import logging
 import math
-import os
-from dataclasses import dataclass
 
-import dateutil
+from dataclasses import dataclass, field
+from typing import Optional, Dict, List
+
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
+
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
 
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: Optional[str]
     model: str
-    revision: str  # commit hash, "" if main
-    results: dict
+    revision: str  # commit hash, "" if main
+    results: Dict[str, float]
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"  # From config file
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""  # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     flagged: bool = False
     status: str = "FINISHED"
-    tags: list = None
-
+    # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
+    tags: List[str] = field(default_factory=list)
+
+
     @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
+    def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
+        with open(json_filepath, 'r') as fp:
             data = json.load(fp)
 
-        # We manage the legacy config format
-        config = data.get("config_general")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name")
-        org_and_model = org_and_model.split("/", 1)
-
+        config = data.get("config_general", {})
+        precision = Precision.from_str(config.get("model_dtype", "unknown"))
+        org_and_model = config.get("model_name", "").split("/", 1)
+        org = org_and_model[0] if len(org_and_model) > 1 else None
+        model = org_and_model[-1]
         if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
@@ -60,25 +63,53 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
-        # Extract results available in this file (some results are split in several files)
+        results = cls.extract_results(data)  # Properly call the method to extract results
+
+        return cls(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            precision=precision,
+            revision=config.get("model_sha", "")
+        )
+
+    @staticmethod
+    def extract_results(data: Dict) -> Dict[str, float]:
+        """
+        Extract and process benchmark results from a given dict.
+
+        Parameters:
+        - data (Dict): A dictionary containing benchmark data. This dictionary must
+        include 'versions' and 'results' keys with respective sub-data.
+
+        Returns:
+        - Dict[str, float]: A dictionary where keys are benchmark names and values
+        are the processed average scores as percentages.
+
+        Notes:
+        - The method specifically checks for certain benchmark names to skip outdated entries.
+        - Handles NaN values by setting the corresponding benchmark result to 0.0.
+        - Averages scores across metrics for benchmarks found in the data, in a percentage format.
+        """
         results = {}
         for task in Tasks:
            task = task.value
            # We skip old mmlu entries
-            wrong_mmlu_version = False
            if task.benchmark == "hendrycksTest":
                for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
                    if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
-                        wrong_mmlu_version = True
+                        continue
 
-            if wrong_mmlu_version:
-                continue
-
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa:mc" and "harness|truthfulqa:mc|0" in data["results"]:
-                if math.isnan(float(data["results"]["harness|truthfulqa:mc|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
+            # Some benchamrk values are NaNs, mostly truthfulQA
+            # Would be more optimal (without the whole dict itertion) if benchmark name was same as key in results
+            # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
+            for k, v in data["results"].items():
+                if task.benchmark in k:
+                    if math.isnan(float(v[task.metric])):
+                        results[task.benchmark] = 0.0
+                        continue
 
            # We average all scores of a given metric (mostly for mmlu)
            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
@@ -87,40 +118,54 @@ class EvalResult:
 
            mean_acc = np.mean(accs) * 100.0
            results[task.benchmark] = mean_acc
+
+        return results
 
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
-        )
 
     def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
+        """Finds the relevant request file for the current model and updates info with it."""
         try:
+            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+            if request_file is None:
+                logging.warning(f"No request file for {self.org}/{self.model}")
+                self.status = "FAILED"
+                return
+
            with open(request_file, "r") as f:
                request = json.load(f)
+
            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.num_params = request.get("params", 0)
+            self.num_params = int(request.get("params", 0))  # Ensuring type safety
            self.date = request.get("submitted_time", "")
            self.architecture = request.get("architectures", "Unknown")
            self.status = request.get("status", "FAILED")
-        except Exception:
+
+        except FileNotFoundError:
+            self.status = "FAILED"
+            logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
+        except JSONDecodeError:
+            self.status = "FAILED"
+            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
+        except KeyError as e:
            self.status = "FAILED"
-            print(f"Could not find request file for {self.org}/{self.model}")
+            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
+        except Exception as e:  # Catch-all for any other unexpected exceptions
+            self.status = "FAILED"
+            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
+
 
     def update_with_dynamic_file_dict(self, file_dict):
+        """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
+        # Default values set for optional or potentially missing keys.
        self.license = file_dict.get("license", "?")
-        self.likes = file_dict.get("likes", 0)
-        self.still_on_hub = file_dict["still_on_hub"]
+        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
+        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
        self.tags = file_dict.get("tags", [])
-        self.flagged = any("flagged" in tag for tag in self.tags)
+
+        # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
+        self.flagged = "flagged" in self.tags
+
 
     def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -149,55 +194,48 @@ class EvalResult:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
        return data_dict
-
+
 
 def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
+    requests_path = Path(requests_path)
+    pattern = f"{model_name}_eval_request_*.json"
+
+    # Using pathlib to find files matching the pattern
+    request_files = list(requests_path.glob(pattern))
+
+    # Sort the files by name in descending order to mimic 'reverse=True'
+    request_files.sort(reverse=True)
+
+    # Select the correct request file based on 'status' and 'precision'
+    request_file = None
+    for request_file in request_files:
+        with request_file.open("r") as f:
            req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
+            if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
+                request_file = str(request_file)
+
+    # Return empty string if no file found that matches criteria
    return request_file
 
 
 def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
    with open(dynamic_path) as f:
        dynamic_data = json.load(f)
+
+    results_path = Path(results_path)
+    model_files = list(results_path.rglob('results_*.json'))
+    model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
 
    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
+    # Wrap model_files iteration with tqdm for progress display
+    for model_result_filepath in tqdm(model_files, desc="Processing model files"):
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-        if eval_result.full_model == "databricks/dbrx-base":
-            print("WE HERE")
+        with logging_redirect_tqdm():
+            eval_result.update_with_request_file(requests_path)
+
        if eval_result.full_model in dynamic_data:
            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
            # Hardcoding because of gating problem
@@ -212,12 +250,14 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
        eval_results[eval_name] = eval_result
 
    results = []
-    for v in eval_results.values():
+    for k, v in eval_results.items():
        try:
            if v.status == "FINISHED":
                v.to_dict()  # we test if the dict version is complete
                results.append(v)
-        except KeyError:  # not all eval values present
+        except KeyError as e:
+            logging.error(f"Error while checking model {k} {v.date} json, no key: {e}")  # not all eval values present
            continue
 
    return results
+
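
The file-discovery step in get_raw_eval_results now combines pathlib's rglob with the parse_datetime helper so result files are processed in chronological order under a tqdm progress bar. A minimal sketch of that step in isolation (the local folder name is hypothetical):

from pathlib import Path

from src.display.utils import parse_datetime

results_path = Path("./eval-results")  # hypothetical local results folder
model_files = list(results_path.rglob("results_*.json"))
# Filenames look like results_2024-04-15T10-30-00.123456.json; sort by the embedded timestamp
model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))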
src/populate.py CHANGED
@@ -52,4 +52,3 @@ def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
-
src/tools/collections.py CHANGED
@@ -73,4 +73,4 @@ def update_collections(df: DataFrame):
            try:
                delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
            except HfHubHTTPError:
-                continue
+                continue