Clémentine committed on
Commit
4fc3864
·
1 Parent(s): 3ac217c

init - cleaning the code base, plus adding the new system to load from contents

Browse files
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import logging
3
  import time
4
  import gradio as gr
 
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
  from huggingface_hub import snapshot_download
7
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
@@ -30,21 +31,14 @@ from src.display.utils import (
30
  )
31
  from src.envs import (
32
  API,
33
- DYNAMIC_INFO_FILE_PATH,
34
- DYNAMIC_INFO_PATH,
35
- DYNAMIC_INFO_REPO,
36
  EVAL_REQUESTS_PATH,
37
- EVAL_RESULTS_PATH,
38
  H4_TOKEN,
39
- IS_PUBLIC,
40
  QUEUE_REPO,
41
  REPO_ID,
42
- RESULTS_REPO,
43
  )
44
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
45
- from src.scripts.update_all_request_files import update_dynamic_files
46
  from src.submission.submit import add_new_eval
47
- from src.tools.collections import update_collections
48
  from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
49
 
50
  # Configure logging
@@ -101,30 +95,21 @@ def init_space(full_init: bool = True):
101
  # These downloads only occur on full initialization
102
  try:
103
  download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
104
- download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
105
- download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
106
  except Exception:
107
  restart_space()
108
 
109
  # Always retrieve the leaderboard DataFrame
110
- raw_data, original_df = get_leaderboard_df(
111
- results_path=EVAL_RESULTS_PATH,
112
- requests_path=EVAL_REQUESTS_PATH,
113
- dynamic_path=DYNAMIC_INFO_FILE_PATH,
114
  cols=COLS,
115
  benchmark_cols=BENCHMARK_COLS,
116
  )
117
 
118
- if full_init:
119
- # Collection update only happens on full initialization
120
- update_collections(original_df)
121
-
122
- leaderboard_df = original_df.copy()
123
-
124
  # Evaluation queue DataFrame retrieval is independent of initialization detail level
125
  eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
126
 
127
- return leaderboard_df, raw_data, original_df, eval_queue_dfs
128
 
129
 
130
  # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
@@ -133,14 +118,14 @@ do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
133
 
134
  # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
135
  # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
136
- leaderboard_df, raw_data, original_df, eval_queue_dfs = init_space(full_init=do_full_init)
137
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
138
 
139
 
140
  # Data processing for plots now only on demand in the respective Gradio tab
141
- def load_and_create_plots():
142
- plot_df = create_plot_df(create_scores_df(raw_data))
143
- return plot_df
144
 
145
 
146
  demo = gr.Blocks(css=custom_css)
@@ -182,24 +167,24 @@ with demo:
182
  bool_checkboxgroup_label="Hide models",
183
  )
184
 
185
- with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
186
- with gr.Row():
187
- with gr.Column():
188
- plot_df = load_and_create_plots()
189
- chart = create_metric_plot_obj(
190
- plot_df,
191
- [AutoEvalColumn.average.name],
192
- title="Average of Top Scores and Human Baseline Over Time (from last update)",
193
- )
194
- gr.Plot(value=chart, min_width=500)
195
- with gr.Column():
196
- plot_df = load_and_create_plots()
197
- chart = create_metric_plot_obj(
198
- plot_df,
199
- BENCHMARK_COLS,
200
- title="Top Scores and Human Baseline Over Time (from last update)",
201
- )
202
- gr.Plot(value=chart, min_width=500)
203
 
204
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
205
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -219,7 +204,6 @@ with demo:
219
  with gr.Column():
220
  model_name_textbox = gr.Textbox(label="Model name")
221
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
222
- private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
223
  model_type = gr.Dropdown(
224
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
225
  label="Model type",
@@ -290,7 +274,6 @@ with demo:
290
  base_model_name_textbox,
291
  revision_name_textbox,
292
  precision,
293
- private,
294
  weight_type,
295
  model_type,
296
  ],
@@ -307,9 +290,4 @@ with demo:
307
  show_copy_button=True,
308
  )
309
 
310
- scheduler = BackgroundScheduler()
311
- scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
312
- scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
313
- scheduler.start()
314
-
315
  demo.queue(default_concurrency_limit=40).launch()
 
2
  import logging
3
  import time
4
  import gradio as gr
5
+ import datasets
6
  from apscheduler.schedulers.background import BackgroundScheduler
7
  from huggingface_hub import snapshot_download
8
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 
31
  )
32
  from src.envs import (
33
  API,
 
 
 
34
  EVAL_REQUESTS_PATH,
35
+ AGGREGATED_REPO,
36
  H4_TOKEN,
 
37
  QUEUE_REPO,
38
  REPO_ID,
 
39
  )
40
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
 
41
  from src.submission.submit import add_new_eval
 
42
  from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
43
 
44
  # Configure logging
 
95
  # These downloads only occur on full initialization
96
  try:
97
  download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
 
 
98
  except Exception:
99
  restart_space()
100
 
101
  # Always retrieve the leaderboard DataFrame
102
+ leaderboard_dataset = datasets.load_dataset(AGGREGATED_REPO, "default", split="train")
103
+ leaderboard_df = get_leaderboard_df(
104
+ leaderboard_dataset=leaderboard_dataset,
 
105
  cols=COLS,
106
  benchmark_cols=BENCHMARK_COLS,
107
  )
108
 
 
 
 
 
 
 
109
  # Evaluation queue DataFrame retrieval is independent of initialization detail level
110
  eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
111
 
112
+ return leaderboard_df, eval_queue_dfs
113
 
114
 
115
  # Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
 
118
 
119
  # Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
120
  # This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
121
+ leaderboard_df, eval_queue_dfs = init_space(full_init=do_full_init)
122
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
123
 
124
 
125
  # Data processing for plots now only on demand in the respective Gradio tab
126
+ #def load_and_create_plots():
127
+ # plot_df = create_plot_df(create_scores_df(leaderboard_df))
128
+ # return plot_df
129
 
130
 
131
  demo = gr.Blocks(css=custom_css)
 
167
  bool_checkboxgroup_label="Hide models",
168
  )
169
 
170
+ #with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
171
+ # with gr.Row():
172
+ # with gr.Column():
173
+ # plot_df = load_and_create_plots()
174
+ # chart = create_metric_plot_obj(
175
+ # plot_df,
176
+ # [AutoEvalColumn.average.name],
177
+ # title="Average of Top Scores and Human Baseline Over Time (from last update)",
178
+ # )
179
+ # gr.Plot(value=chart, min_width=500)
180
+ # with gr.Column():
181
+ # plot_df = load_and_create_plots()
182
+ # chart = create_metric_plot_obj(
183
+ # plot_df,
184
+ # BENCHMARK_COLS,
185
+ # title="Top Scores and Human Baseline Over Time (from last update)",
186
+ # )
187
+ # gr.Plot(value=chart, min_width=500)
188
 
189
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
190
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
204
  with gr.Column():
205
  model_name_textbox = gr.Textbox(label="Model name")
206
  revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
 
207
  model_type = gr.Dropdown(
208
  choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
209
  label="Model type",
 
274
  base_model_name_textbox,
275
  revision_name_textbox,
276
  precision,
 
277
  weight_type,
278
  model_type,
279
  ],
 
290
  show_copy_button=True,
291
  )
292
 
 
 
 
 
 
293
  demo.queue(default_concurrency_limit=40).launch()
src/envs.py CHANGED
@@ -6,13 +6,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
6
 
7
  REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
8
  QUEUE_REPO = "open-llm-leaderboard/requests"
9
- DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
10
- RESULTS_REPO = "open-llm-leaderboard/results"
11
-
12
- PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
13
- PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
14
-
15
- IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
16
 
17
  HF_HOME = os.getenv("HF_HOME", ".")
18
 
@@ -27,12 +21,6 @@ else:
27
  print("Write access confirmed for HF_HOME")
28
 
29
  EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
30
- EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
31
- DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
32
- DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
33
-
34
- EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
35
- EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
36
 
37
  PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
38
 
 
6
 
7
  REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
8
  QUEUE_REPO = "open-llm-leaderboard/requests"
9
+ AGGREGATED_REPO = "open-llm-leaderboard/contents"
 
 
 
 
 
 
10
 
11
  HF_HOME = os.getenv("HF_HOME", ".")
12
 
 
21
  print("Write access confirmed for HF_HOME")
22
 
23
  EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
 
 
 
 
 
 
24
 
25
  PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
26
 
src/leaderboard/filter_models.py CHANGED
@@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
167
  leaderboard_data.pop(ix)
168
  return leaderboard_data
169
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  def filter_models_flags(leaderboard_data: list[dict]):
172
  leaderboard_data = remove_forbidden_models(leaderboard_data)
 
167
  leaderboard_data.pop(ix)
168
  return leaderboard_data
169
 
170
+ """
171
+ def remove_forbidden_models(leaderboard_data):
172
+ #Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
173
+ indices_to_remove = []
174
+ for ix, row in leaderboard_data.iterrows():
175
+ if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
176
+ indices_to_remove.append(ix)
177
+
178
+ # Remove the models from the list
179
+ return leaderboard_data.drop(indices_to_remove)
180
+ """
181
+
182
 
183
  def filter_models_flags(leaderboard_data: list[dict]):
184
  leaderboard_data = remove_forbidden_models(leaderboard_data)
src/leaderboard/read_evals.py DELETED
@@ -1,261 +0,0 @@
1
- import json
2
- from pathlib import Path
3
- from json import JSONDecodeError
4
- import logging
5
- import math
6
-
7
- from dataclasses import dataclass, field
8
- from typing import Optional, Dict, List
9
-
10
- from tqdm import tqdm
11
- from tqdm.contrib.logging import logging_redirect_tqdm
12
-
13
- import numpy as np
14
-
15
- from src.display.formatting import make_clickable_model
16
- from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
17
-
18
- # Configure logging
19
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
-
21
-
22
- @dataclass
23
- class EvalResult:
24
- # Also see src.display.utils.AutoEvalColumn for what will be displayed.
25
- eval_name: str # org_model_precision (uid)
26
- full_model: str # org/model (path on hub)
27
- org: Optional[str]
28
- model: str
29
- revision: str # commit hash, "" if main
30
- results: Dict[str, float]
31
- precision: Precision = Precision.Unknown
32
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
33
- weight_type: WeightType = WeightType.Original
34
- architecture: str = "Unknown" # From config file
35
- license: str = "?"
36
- likes: int = 0
37
- num_params: int = 0
38
- date: str = "" # submission date of request file
39
- still_on_hub: bool = True
40
- is_merge: bool = False
41
- not_flagged: bool = False
42
- status: str = "FINISHED"
43
- # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
44
- tags: List[str] = field(default_factory=list)
45
-
46
- @classmethod
47
- def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
48
- with open(json_filepath, "r") as fp:
49
- data = json.load(fp)
50
-
51
- config = data.get("config_general", {})
52
- precision = Precision.from_str(config.get("model_dtype", "unknown"))
53
- org_and_model = config.get("model_name", "").split("/", 1)
54
- org = org_and_model[0] if len(org_and_model) > 1 else None
55
- model = org_and_model[-1]
56
- if len(org_and_model) == 1:
57
- org = None
58
- model = org_and_model[0]
59
- result_key = f"{model}_{precision.value.name}"
60
- else:
61
- org = org_and_model[0]
62
- model = org_and_model[1]
63
- result_key = f"{org}_{model}_{precision.value.name}"
64
- full_model = "/".join(org_and_model)
65
-
66
- results = cls.extract_results(data) # Properly call the method to extract results
67
-
68
- return cls(
69
- eval_name=result_key,
70
- full_model=full_model,
71
- org=org,
72
- model=model,
73
- results=results,
74
- precision=precision,
75
- revision=config.get("model_sha", ""),
76
- )
77
-
78
- @staticmethod
79
- def extract_results(data: Dict) -> Dict[str, float]:
80
- """
81
- Extract and process benchmark results from a given dict.
82
-
83
- Parameters:
84
- - data (Dict): A dictionary containing benchmark data. This dictionary must
85
- include 'versions' and 'results' keys with respective sub-data.
86
-
87
- Returns:
88
- - Dict[str, float]: A dictionary where keys are benchmark names and values
89
- are the processed average scores as percentages.
90
-
91
- Notes:
92
- - The method specifically checks for certain benchmark names to skip outdated entries.
93
- - Handles NaN values by setting the corresponding benchmark result to 0.0.
94
- - Averages scores across metrics for benchmarks found in the data, in a percentage format.
95
- """
96
- results = {}
97
- for task in Tasks:
98
- task = task.value
99
- # We skip old mmlu entries
100
- if task.benchmark == "hendrycksTest":
101
- for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
102
- if mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0:
103
- continue
104
-
105
- # Some benchmark values are NaNs, mostly truthfulQA
106
- # Would be more optimal (without the whole dict iteration) if benchmark name was same as key in results
107
- # e.g. not harness|truthfulqa:mc|0 but truthfulqa:mc
108
- for k, v in data["results"].items():
109
- if task.benchmark in k:
110
- if math.isnan(float(v[task.metric])):
111
- results[task.benchmark] = 0.0
112
- continue
113
-
114
- # We average all scores of a given metric (mostly for mmlu)
115
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
116
- if accs.size == 0 or any([acc is None for acc in accs]):
117
- continue
118
-
119
- mean_acc = np.mean(accs) * 100.0
120
- results[task.benchmark] = mean_acc
121
-
122
- return results
123
-
124
- def update_with_request_file(self, requests_path):
125
- """Finds the relevant request file for the current model and updates info with it."""
126
- try:
127
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
128
- if request_file is None:
129
- logging.warning(f"No request file for {self.org}/{self.model}")
130
- self.status = "FAILED"
131
- return
132
-
133
- with open(request_file, "r") as f:
134
- request = json.load(f)
135
-
136
- self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
137
- self.weight_type = WeightType[request.get("weight_type", "Original")]
138
- self.num_params = int(request.get("params", 0)) # Ensuring type safety
139
- self.date = request.get("submitted_time", "")
140
- self.architecture = request.get("architectures", "Unknown")
141
- self.status = request.get("status", "FAILED")
142
-
143
- except FileNotFoundError:
144
- self.status = "FAILED"
145
- logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
146
- except JSONDecodeError:
147
- self.status = "FAILED"
148
- logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
149
- except KeyError as e:
150
- self.status = "FAILED"
151
- logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
152
- except Exception as e: # Catch-all for any other unexpected exceptions
153
- self.status = "FAILED"
154
- logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
155
-
156
- def update_with_dynamic_file_dict(self, file_dict):
157
- """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
158
- # Default values set for optional or potentially missing keys.
159
- self.license = file_dict.get("license", "?")
160
- self.likes = int(file_dict.get("likes", 0)) # Ensure likes is treated as an integer
161
- self.still_on_hub = file_dict.get("still_on_hub", False) # Default to False if key is missing
162
- self.tags = file_dict.get("tags", [])
163
-
164
- # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
165
- self.not_flagged = not (any("flagged" in tag for tag in self.tags))
166
-
167
- def to_dict(self):
168
- """Converts the Eval Result to a dict compatible with our dataframe display"""
169
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
170
- data_dict = {
171
- "eval_name": self.eval_name, # not a column, just a save name,
172
- AutoEvalColumn.precision.name: self.precision.value.name,
173
- AutoEvalColumn.model_type.name: self.model_type.value.name,
174
- AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
175
- AutoEvalColumn.weight_type.name: self.weight_type.value.name,
176
- AutoEvalColumn.architecture.name: self.architecture,
177
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
178
- AutoEvalColumn.fullname.name: self.full_model,
179
- AutoEvalColumn.revision.name: self.revision,
180
- AutoEvalColumn.average.name: average,
181
- AutoEvalColumn.license.name: self.license,
182
- AutoEvalColumn.likes.name: self.likes,
183
- AutoEvalColumn.params.name: self.num_params,
184
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
185
- AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
186
- AutoEvalColumn.moe.name: not (
187
- ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
188
- ),
189
- AutoEvalColumn.not_flagged.name: self.not_flagged,
190
- }
191
-
192
- for task in Tasks:
193
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
194
-
195
- return data_dict
196
-
197
-
198
- def get_request_file_for_model(requests_path, model_name, precision):
199
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
200
- requests_path = Path(requests_path)
201
- pattern = f"{model_name}_eval_request_*.json"
202
-
203
- # Using pathlib to find files matching the pattern
204
- request_files = list(requests_path.glob(pattern))
205
-
206
- # Sort the files by name in descending order to mimic 'reverse=True'
207
- request_files.sort(reverse=True)
208
-
209
- # Select the correct request file based on 'status' and 'precision'
210
- request_file = None
211
- for request_file in request_files:
212
- with request_file.open("r") as f:
213
- req_content = json.load(f)
214
- if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
215
- request_file = str(request_file)
216
-
217
- # Return empty string if no file found that matches criteria
218
- return request_file
219
-
220
-
221
- def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
222
- """From the path of the results folder root, extract all needed info for results"""
223
- with open(dynamic_path) as f:
224
- dynamic_data = json.load(f)
225
-
226
- results_path = Path(results_path)
227
- model_files = list(results_path.rglob("results_*.json"))
228
- model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
229
-
230
- eval_results = {}
231
- # Wrap model_files iteration with tqdm for progress display
232
- for model_result_filepath in tqdm(model_files, desc="Processing model files"):
233
- # Creation of result
234
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
235
- with logging_redirect_tqdm():
236
- eval_result.update_with_request_file(requests_path)
237
-
238
- if eval_result.full_model in dynamic_data:
239
- eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
240
- # Hardcoding because of gating problem
241
- if any([org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
242
- eval_result.still_on_hub = True
243
-
244
- # Store results of same eval together
245
- eval_name = eval_result.eval_name
246
- if eval_name in eval_results.keys():
247
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
248
- else:
249
- eval_results[eval_name] = eval_result
250
-
251
- results = []
252
- for k, v in eval_results.items():
253
- try:
254
- if v.status == "FINISHED":
255
- v.to_dict() # we test if the dict version is complete
256
- results.append(v)
257
- except KeyError as e:
258
- logging.error(f"Error while checking model {k} {v.date} json, no key: {e}") # not all eval values present
259
- continue
260
-
261
- return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/populate.py CHANGED
@@ -1,9 +1,9 @@
1
  import pathlib
2
  import pandas as pd
 
3
  from src.display.formatting import has_no_nan_values, make_clickable_model
4
  from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
5
  from src.leaderboard.filter_models import filter_models_flags
6
- from src.leaderboard.read_evals import get_raw_eval_results
7
  from src.display.utils import load_json_data
8
 
9
 
@@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols):
39
  return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
40
 
41
 
42
- def get_leaderboard_df(results_path, requests_path, dynamic_path, cols, benchmark_cols):
43
  """Retrieve and process leaderboard data."""
44
- raw_data = get_raw_eval_results(results_path, requests_path, dynamic_path)
45
- all_data_json = [model.to_dict() for model in raw_data] + [baseline_row]
 
46
  filter_models_flags(all_data_json)
47
 
48
  df = pd.DataFrame.from_records(all_data_json)
49
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
50
  df = df[cols].round(decimals=2)
51
  df = df[has_no_nan_values(df, benchmark_cols)]
52
- return raw_data, df
 
1
  import pathlib
2
  import pandas as pd
3
+ from datasets import Dataset
4
  from src.display.formatting import has_no_nan_values, make_clickable_model
5
  from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
6
  from src.leaderboard.filter_models import filter_models_flags
 
7
  from src.display.utils import load_json_data
8
 
9
 
 
39
  return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
40
 
41
 
42
+ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
43
  """Retrieve and process leaderboard data."""
44
+ all_data_json = leaderboard_dataset.to_dict()
45
+ num_items = leaderboard_dataset.num_rows
46
+ all_data_json = [{k: all_data_json[k][ix] for k in all_data_json.keys()} for ix in range(num_items)]
47
  filter_models_flags(all_data_json)
48
 
49
  df = pd.DataFrame.from_records(all_data_json)
50
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
51
  df = df[cols].round(decimals=2)
52
  df = df[has_no_nan_values(df, benchmark_cols)]
53
+ return df
src/scripts/update_all_request_files.py DELETED
@@ -1,129 +0,0 @@
1
- import json
2
- import os
3
- import time
4
-
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
8
- from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
9
-
10
-
11
- def update_one_model(model_id, data, models_on_the_hub):
12
- # Model no longer on the hub at all
13
- if model_id not in models_on_the_hub:
14
- data["still_on_hub"] = False
15
- data["likes"] = 0
16
- data["downloads"] = 0
17
- data["created_at"] = ""
18
- data["tags"] = []
19
- return data
20
-
21
- # Grabbing model parameters
22
- model_cfg = models_on_the_hub[model_id]
23
- data["likes"] = model_cfg.likes
24
- data["downloads"] = model_cfg.downloads
25
- data["created_at"] = str(model_cfg.created_at)
26
- data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
27
-
28
- # Grabbing model details
29
- model_name = model_id
30
- if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
31
- if isinstance(model_cfg.card_data.base_model, str):
32
- model_name = model_cfg.card_data.base_model # for adapters, we look at the parent model
33
- still_on_hub, _, _ = is_model_on_hub(
34
- model_name=model_name,
35
- revision=data.get("revision"),
36
- trust_remote_code=True,
37
- test_tokenizer=False,
38
- token=H4_TOKEN,
39
- )
40
- # If the model doesn't have a model card or a license, we consider it's deleted
41
- if still_on_hub:
42
- try:
43
- status, _, model_card = check_model_card(model_id)
44
- if status is False:
45
- still_on_hub = False
46
- except Exception:
47
- model_card = None
48
- still_on_hub = False
49
- data["still_on_hub"] = still_on_hub
50
-
51
- tags = get_model_tags(model_card, model_id) if still_on_hub else []
52
-
53
- data["tags"] = tags
54
- return data
55
-
56
-
57
- def update_models(file_path, models_on_the_hub):
58
- """
59
- Search through all JSON files in the specified root folder and its subfolders,
60
- and update the likes key in JSON dict from value of input dict
61
- """
62
- seen_models = []
63
- with open(file_path, "r") as f:
64
- model_infos = json.load(f)
65
- for model_id in model_infos.keys():
66
- seen_models.append(model_id)
67
- model_infos[model_id] = update_one_model(
68
- model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
69
- )
70
-
71
- # If new requests files have been created since we started all this
72
- # we grab them
73
- all_models = []
74
- try:
75
- for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
76
- if ix == 0:
77
- continue
78
- for file in files:
79
- if "eval_request" in file:
80
- path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
81
- all_models.append(path)
82
- except Exception as e:
83
- print(e)
84
- pass
85
-
86
- for model_id in all_models:
87
- if model_id not in seen_models:
88
- model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
89
-
90
- with open(file_path, "w") as f:
91
- json.dump(model_infos, f, indent=2)
92
-
93
-
94
- def update_dynamic_files():
95
- """This will only update metadata for models already linked in the repo, not add missing ones."""
96
- snapshot_download(
97
- repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
98
- )
99
-
100
- print("UPDATE_DYNAMIC: Loaded snapshot")
101
- # Get models
102
- start = time.time()
103
-
104
- models = list(
105
- API.list_models(
106
- # filter=ModelFilter(task="text-generation"),
107
- full=False,
108
- cardData=True,
109
- fetch_config=True,
110
- )
111
- )
112
- id_to_model = {model.id: model for model in models}
113
-
114
- print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
115
-
116
- start = time.time()
117
-
118
- update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
119
-
120
- print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
121
-
122
- API.upload_file(
123
- path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
124
- path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
125
- repo_id=DYNAMIC_INFO_REPO,
126
- repo_type="dataset",
127
- commit_message="Daily request file update.",
128
- )
129
- print("UPDATE_DYNAMIC: pushed to hub")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/submission/submit.py CHANGED
@@ -7,9 +7,6 @@ from huggingface_hub import snapshot_download
7
  from src.display.formatting import styled_error, styled_message, styled_warning
8
  from src.envs import (
9
  API,
10
- DYNAMIC_INFO_FILE_PATH,
11
- DYNAMIC_INFO_PATH,
12
- DYNAMIC_INFO_REPO,
13
  EVAL_REQUESTS_PATH,
14
  H4_TOKEN,
15
  QUEUE_REPO,
@@ -35,7 +32,6 @@ def add_new_eval(
35
  base_model: str,
36
  revision: str,
37
  precision: str,
38
- private: bool,
39
  weight_type: str,
40
  model_type: str,
41
  ):
@@ -126,7 +122,6 @@ def add_new_eval(
126
  "model": model,
127
  "base_model": base_model,
128
  "revision": model_info.sha, # force to use the exact model commit
129
- "private": private,
130
  "precision": precision,
131
  "params": model_size,
132
  "architectures": architecture,
@@ -154,7 +149,7 @@ def add_new_eval(
154
  print("Creating eval file")
155
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
156
  os.makedirs(OUT_DIR, exist_ok=True)
157
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
158
 
159
  with open(out_path, "w") as f:
160
  f.write(json.dumps(eval_entry))
 
7
  from src.display.formatting import styled_error, styled_message, styled_warning
8
  from src.envs import (
9
  API,
 
 
 
10
  EVAL_REQUESTS_PATH,
11
  H4_TOKEN,
12
  QUEUE_REPO,
 
32
  base_model: str,
33
  revision: str,
34
  precision: str,
 
35
  weight_type: str,
36
  model_type: str,
37
  ):
 
122
  "model": model,
123
  "base_model": base_model,
124
  "revision": model_info.sha, # force to use the exact model commit
 
125
  "precision": precision,
126
  "params": model_size,
127
  "architectures": architecture,
 
149
  print("Creating eval file")
150
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
151
  os.makedirs(OUT_DIR, exist_ok=True)
152
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
153
 
154
  with open(out_path, "w") as f:
155
  f.write(json.dumps(eval_entry))
src/tools/collections.py DELETED
@@ -1,76 +0,0 @@
1
- import pandas as pd
2
- from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
3
- from huggingface_hub.utils._errors import HfHubHTTPError
4
- from pandas import DataFrame
5
-
6
- from src.display.utils import AutoEvalColumn, ModelType
7
- from src.envs import H4_TOKEN, PATH_TO_COLLECTION
8
-
9
- # Specific intervals for the collections
10
- intervals = {
11
- "1B": pd.Interval(0, 1.5, closed="right"),
12
- "3B": pd.Interval(2.5, 3.5, closed="neither"),
13
- "7B": pd.Interval(6, 8, closed="neither"),
14
- "13B": pd.Interval(10, 14, closed="neither"),
15
- "30B": pd.Interval(25, 35, closed="neither"),
16
- "65B": pd.Interval(60, 70, closed="neither"),
17
- }
18
-
19
-
20
- def _filter_by_type_and_size(df, model_type, size_interval):
21
- """Filter DataFrame by model type and parameter size interval."""
22
- type_emoji = model_type.value.symbol[0]
23
- filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
24
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
25
- mask = params_column.apply(lambda x: x in size_interval)
26
- return filtered_df.loc[mask]
27
-
28
-
29
- def _add_models_to_collection(collection, models, model_type, size):
30
- """Add best models to the collection and update positions."""
31
- cur_len_collection = len(collection.items)
32
- for ix, model in enumerate(models, start=1):
33
- try:
34
- collection = add_collection_item(
35
- PATH_TO_COLLECTION,
36
- item_id=model,
37
- item_type="model",
38
- exists_ok=True,
39
- note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
40
- token=H4_TOKEN,
41
- )
42
- # Ensure position is correct if item was added
43
- if len(collection.items) > cur_len_collection:
44
- item_object_id = collection.items[-1].item_object_id
45
- update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
46
- cur_len_collection = len(collection.items)
47
- break # assuming we only add the top model
48
- except HfHubHTTPError:
49
- continue
50
-
51
-
52
- def update_collections(df: DataFrame):
53
- """Update collections by filtering and adding the best models."""
54
- collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
55
- cur_best_models = []
56
-
57
- for model_type in ModelType:
58
- if not model_type.value.name:
59
- continue
60
- for size, interval in intervals.items():
61
- filtered_df = _filter_by_type_and_size(df, model_type, interval)
62
- best_models = list(
63
- filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
64
- )
65
- print(model_type.value.symbol, size, best_models)
66
- _add_models_to_collection(collection, best_models, model_type, size)
67
- cur_best_models.extend(best_models)
68
-
69
- # Cleanup
70
- existing_models = {item.item_id for item in collection.items}
71
- to_remove = existing_models - set(cur_best_models)
72
- for item_id in to_remove:
73
- try:
74
- delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
75
- except HfHubHTTPError:
76
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{scripts → tools}/create_request_file.py RENAMED
File without changes
src/tools/plots.py CHANGED
@@ -6,10 +6,9 @@ from plotly.graph_objs import Figure
6
  from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
7
  from src.display.utils import human_baseline_row as HUMAN_BASELINE
8
  from src.leaderboard.filter_models import FLAGGED_MODELS
9
- from src.leaderboard.read_evals import EvalResult
10
 
11
 
12
- def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
13
  """
14
  Generates a DataFrame containing the maximum scores until each date.
15
 
@@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
17
  :return: A new DataFrame containing the maximum scores until each date for every metric.
18
  """
19
  # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
20
- results_df = pd.DataFrame(raw_data)
21
- # results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
22
  results_df.sort_values(by="date", inplace=True)
23
 
24
  # Step 2: Initialize the scores dictionary
 
6
  from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
7
  from src.display.utils import human_baseline_row as HUMAN_BASELINE
8
  from src.leaderboard.filter_models import FLAGGED_MODELS
 
9
 
10
 
11
+ def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
12
  """
13
  Generates a DataFrame containing the maximum scores until each date.
14
 
 
16
  :return: A new DataFrame containing the maximum scores until each date for every metric.
17
  """
18
  # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
19
+ results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
 
20
  results_df.sort_values(by="date", inplace=True)
21
 
22
  # Step 2: Initialize the scores dictionary