Clémentine
committed on
Commit
·
4fc3864
1
Parent(s):
3ac217c
init - cleaning the code base, plus adding the new system to load from contents
Browse files- app.py +28 -50
- src/envs.py +1 -13
- src/leaderboard/filter_models.py +12 -0
- src/leaderboard/read_evals.py +0 -261
- src/populate.py +6 -5
- src/scripts/update_all_request_files.py +0 -129
- src/submission/submit.py +1 -6
- src/tools/collections.py +0 -76
- src/{scripts → tools}/create_request_file.py +0 -0
- src/tools/plots.py +2 -4
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
|
|
2 |
import logging
|
3 |
import time
|
4 |
import gradio as gr
|
|
|
5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
from huggingface_hub import snapshot_download
|
7 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
@@ -30,21 +31,14 @@ from src.display.utils import (
|
|
30 |
)
|
31 |
from src.envs import (
|
32 |
API,
|
33 |
-
DYNAMIC_INFO_FILE_PATH,
|
34 |
-
DYNAMIC_INFO_PATH,
|
35 |
-
DYNAMIC_INFO_REPO,
|
36 |
EVAL_REQUESTS_PATH,
|
37 |
-
|
38 |
H4_TOKEN,
|
39 |
-
IS_PUBLIC,
|
40 |
QUEUE_REPO,
|
41 |
REPO_ID,
|
42 |
-
RESULTS_REPO,
|
43 |
)
|
44 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
45 |
-
from src.scripts.update_all_request_files import update_dynamic_files
|
46 |
from src.submission.submit import add_new_eval
|
47 |
-
from src.tools.collections import update_collections
|
48 |
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
|
49 |
|
50 |
# Configure logging
|
@@ -101,30 +95,21 @@ def init_space(full_init: bool = True):
|
|
101 |
# These downloads only occur on full initialization
|
102 |
try:
|
103 |
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
104 |
-
download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
|
105 |
-
download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
|
106 |
except Exception:
|
107 |
restart_space()
|
108 |
|
109 |
# Always retrieve the leaderboard DataFrame
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
dynamic_path=DYNAMIC_INFO_FILE_PATH,
|
114 |
cols=COLS,
|
115 |
benchmark_cols=BENCHMARK_COLS,
|
116 |
)
|
117 |
|
118 |
-
if full_init:
|
119 |
-
# Collection update only happens on full initialization
|
120 |
-
update_collections(original_df)
|
121 |
-
|
122 |
-
leaderboard_df = original_df.copy()
|
123 |
-
|
124 |
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
125 |
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
126 |
|
127 |
-
return leaderboard_df,
|
128 |
|
129 |
|
130 |
# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
|
@@ -133,14 +118,14 @@ do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
|
|
133 |
|
134 |
# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
|
135 |
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
|
136 |
-
leaderboard_df,
|
137 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
138 |
|
139 |
|
140 |
# Data processing for plots now only on demand in the respective Gradio tab
|
141 |
-
def load_and_create_plots():
|
142 |
-
plot_df = create_plot_df(create_scores_df(
|
143 |
-
return plot_df
|
144 |
|
145 |
|
146 |
demo = gr.Blocks(css=custom_css)
|
@@ -182,24 +167,24 @@ with demo:
|
|
182 |
bool_checkboxgroup_label="Hide models",
|
183 |
)
|
184 |
|
185 |
-
with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
|
204 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
205 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
@@ -219,7 +204,6 @@ with demo:
|
|
219 |
with gr.Column():
|
220 |
model_name_textbox = gr.Textbox(label="Model name")
|
221 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
222 |
-
private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
|
223 |
model_type = gr.Dropdown(
|
224 |
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
225 |
label="Model type",
|
@@ -290,7 +274,6 @@ with demo:
|
|
290 |
base_model_name_textbox,
|
291 |
revision_name_textbox,
|
292 |
precision,
|
293 |
-
private,
|
294 |
weight_type,
|
295 |
model_type,
|
296 |
],
|
@@ -307,9 +290,4 @@ with demo:
|
|
307 |
show_copy_button=True,
|
308 |
)
|
309 |
|
310 |
-
scheduler = BackgroundScheduler()
|
311 |
-
scheduler.add_job(restart_space, "interval", hours=3) # restarted every 3h
|
312 |
-
scheduler.add_job(update_dynamic_files, "interval", hours=2) # launched every 2 hour
|
313 |
-
scheduler.start()
|
314 |
-
|
315 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
2 |
import logging
|
3 |
import time
|
4 |
import gradio as gr
|
5 |
+
import datasets
|
6 |
from apscheduler.schedulers.background import BackgroundScheduler
|
7 |
from huggingface_hub import snapshot_download
|
8 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
|
|
31 |
)
|
32 |
from src.envs import (
|
33 |
API,
|
|
|
|
|
|
|
34 |
EVAL_REQUESTS_PATH,
|
35 |
+
AGGREGATED_REPO,
|
36 |
H4_TOKEN,
|
|
|
37 |
QUEUE_REPO,
|
38 |
REPO_ID,
|
|
|
39 |
)
|
40 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
|
41 |
from src.submission.submit import add_new_eval
|
|
|
42 |
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
|
43 |
|
44 |
# Configure logging
|
|
|
95 |
# These downloads only occur on full initialization
|
96 |
try:
|
97 |
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
|
|
|
|
98 |
except Exception:
|
99 |
restart_space()
|
100 |
|
101 |
# Always retrieve the leaderboard DataFrame
|
102 |
+
leaderboard_dataset = datasets.load_dataset(AGGREGATED_REPO, "default", split="train")
|
103 |
+
leaderboard_df = get_leaderboard_df(
|
104 |
+
leaderboard_dataset=leaderboard_dataset,
|
|
|
105 |
cols=COLS,
|
106 |
benchmark_cols=BENCHMARK_COLS,
|
107 |
)
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
110 |
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
111 |
|
112 |
+
return leaderboard_df, eval_queue_dfs
|
113 |
|
114 |
|
115 |
# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
|
|
|
118 |
|
119 |
# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
|
120 |
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
|
121 |
+
leaderboard_df, eval_queue_dfs = init_space(full_init=do_full_init)
|
122 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
123 |
|
124 |
|
125 |
# Data processing for plots now only on demand in the respective Gradio tab
|
126 |
+
#def load_and_create_plots():
|
127 |
+
# plot_df = create_plot_df(create_scores_df(leaderboard_df))
|
128 |
+
# return plot_df
|
129 |
|
130 |
|
131 |
demo = gr.Blocks(css=custom_css)
|
|
|
167 |
bool_checkboxgroup_label="Hide models",
|
168 |
)
|
169 |
|
170 |
+
#with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
|
171 |
+
# with gr.Row():
|
172 |
+
# with gr.Column():
|
173 |
+
# plot_df = load_and_create_plots()
|
174 |
+
# chart = create_metric_plot_obj(
|
175 |
+
# plot_df,
|
176 |
+
# [AutoEvalColumn.average.name],
|
177 |
+
# title="Average of Top Scores and Human Baseline Over Time (from last update)",
|
178 |
+
# )
|
179 |
+
# gr.Plot(value=chart, min_width=500)
|
180 |
+
# with gr.Column():
|
181 |
+
# plot_df = load_and_create_plots()
|
182 |
+
# chart = create_metric_plot_obj(
|
183 |
+
# plot_df,
|
184 |
+
# BENCHMARK_COLS,
|
185 |
+
# title="Top Scores and Human Baseline Over Time (from last update)",
|
186 |
+
# )
|
187 |
+
# gr.Plot(value=chart, min_width=500)
|
188 |
|
189 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
190 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
|
|
204 |
with gr.Column():
|
205 |
model_name_textbox = gr.Textbox(label="Model name")
|
206 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
|
|
207 |
model_type = gr.Dropdown(
|
208 |
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
209 |
label="Model type",
|
|
|
274 |
base_model_name_textbox,
|
275 |
revision_name_textbox,
|
276 |
precision,
|
|
|
277 |
weight_type,
|
278 |
model_type,
|
279 |
],
|
|
|
290 |
show_copy_button=True,
|
291 |
)
|
292 |
|
|
|
|
|
|
|
|
|
|
|
293 |
demo.queue(default_concurrency_limit=40).launch()
|
src/envs.py
CHANGED
@@ -6,13 +6,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
|
6 |
|
7 |
REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
|
8 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
9 |
-
|
10 |
-
RESULTS_REPO = "open-llm-leaderboard/results"
|
11 |
-
|
12 |
-
PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
|
13 |
-
PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
|
14 |
-
|
15 |
-
IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
|
16 |
|
17 |
HF_HOME = os.getenv("HF_HOME", ".")
|
18 |
|
@@ -27,12 +21,6 @@ else:
|
|
27 |
print("Write access confirmed for HF_HOME")
|
28 |
|
29 |
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
|
30 |
-
EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
|
31 |
-
DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
|
32 |
-
DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
|
33 |
-
|
34 |
-
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
35 |
-
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
36 |
|
37 |
PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
|
38 |
|
|
|
6 |
|
7 |
REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
|
8 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
9 |
+
AGGREGATED_REPO = "open-llm-leaderboard/contents"
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
HF_HOME = os.getenv("HF_HOME", ".")
|
12 |
|
|
|
21 |
print("Write access confirmed for HF_HOME")
|
22 |
|
23 |
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
|
26 |
|
src/leaderboard/filter_models.py
CHANGED
@@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
|
|
167 |
leaderboard_data.pop(ix)
|
168 |
return leaderboard_data
|
169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
def filter_models_flags(leaderboard_data: list[dict]):
|
172 |
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
|
|
167 |
leaderboard_data.pop(ix)
|
168 |
return leaderboard_data
|
169 |
|
170 |
+
"""
|
171 |
+
def remove_forbidden_models(leaderboard_data):
|
172 |
+
#Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
|
173 |
+
indices_to_remove = []
|
174 |
+
for ix, row in leaderboard_data.iterrows():
|
175 |
+
if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
|
176 |
+
indices_to_remove.append(ix)
|
177 |
+
|
178 |
+
# Remove the models from the list
|
179 |
+
return leaderboard_data.drop(indices_to_remove)
|
180 |
+
"""
|
181 |
+
|
182 |
|
183 |
def filter_models_flags(leaderboard_data: list[dict]):
|
184 |
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
src/leaderboard/read_evals.py
DELETED
@@ -1,261 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
from pathlib import Path
|
3 |
-
from json import JSONDecodeError
|
4 |
-
import logging
|
5 |
-
import math
|
6 |
-
|
7 |
-
from dataclasses import dataclass, field
|
8 |
-
from typing import Optional, Dict, List
|
9 |
-
|
10 |
-
from tqdm import tqdm
|
11 |
-
from tqdm.contrib.logging import logging_redirect_tqdm
|
12 |
-
|
13 |
-
import numpy as np
|
14 |
-
|
15 |
-
from src.display.formatting import make_clickable_model
|
16 |
-
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
|
17 |
-
|
18 |
-
# Configure logging
|
19 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
20 |
-
|
21 |
-
|
22 |
-
@dataclass
class EvalResult:
    """Evaluation result of one model at one precision, plus display metadata.

    Also see src.display.utils.AutoEvalColumn for what will be displayed.
    """

    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: Optional[str]
    model: str
    revision: str  # commit hash, "" if main
    results: Dict[str, float]
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original
    architecture: str = "Unknown"  # From config file
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = True
    is_merge: bool = False
    not_flagged: bool = False
    status: str = "FINISHED"
    # default_factory gives every instance its own list instead of a shared mutable default.
    tags: List[str] = field(default_factory=list)

    @classmethod
    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
        """Build an EvalResult from a results JSON file.

        Reads the "config_general" section for the model name, dtype and sha,
        and delegates the score extraction to `extract_results`.
        """
        with open(json_filepath, "r") as fp:
            data = json.load(fp)

        config = data.get("config_general", {})
        precision = Precision.from_str(config.get("model_dtype", "unknown"))

        # "org/model" -> ["org", "model"]; a bare "model" -> ["model"]
        org_and_model = config.get("model_name", "").split("/", 1)
        if len(org_and_model) == 1:
            org, model = None, org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org, model = org_and_model
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        results = cls.extract_results(data)

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=config.get("model_sha", ""),
        )

    @staticmethod
    def extract_results(data: Dict) -> Dict[str, float]:
        """
        Extract and process benchmark results from a given dict.

        Parameters:
        - data (Dict): A dictionary containing benchmark data. This dictionary must
          include 'versions' and 'results' keys with respective sub-data.

        Returns:
        - Dict[str, float]: A dictionary where keys are benchmark names and values
          are the processed average scores as percentages.

        Notes:
        - Benchmarks whose MMLU entries are at version 0 (outdated) are skipped.
        - Benchmarks with a missing metric value are skipped.
        - A NaN value sets the corresponding benchmark result to 0.0.
        - Scores are averaged across all matching entries, in a percentage format.
        """
        old_mmlu_keys = ("harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra")

        results = {}
        for task in Tasks:
            task = task.value

            # We skip old mmlu entries. The skip must apply to the task loop,
            # not to the loop over candidate version keys.
            if task.benchmark == "hendrycksTest" and any(
                mmlu_k in data["versions"] and data["versions"][mmlu_k] == 0 for mmlu_k in old_mmlu_keys
            ):
                continue

            # Benchmark names are substrings of the result keys
            # (e.g. "truthfulqa:mc" in "harness|truthfulqa:mc|0"), hence the scan.
            scores = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
            if not scores or any(score is None for score in scores):
                continue

            # Some benchmark values are NaNs, mostly truthfulQA: report 0.0 instead of a NaN mean.
            if any(math.isnan(float(score)) for score in scores):
                results[task.benchmark] = 0.0
                continue

            # We average all scores of a given metric (mostly for mmlu)
            results[task.benchmark] = np.mean(scores) * 100.0

        return results

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it.

        Any failure (no request file, unreadable or malformed file) is logged and
        marks the result as "FAILED" instead of raising.
        """
        # Bound before the try so the FileNotFoundError handler can always log it.
        request_file = None
        try:
            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
            if request_file is None:
                logging.warning(f"No request file for {self.org}/{self.model}")
                self.status = "FAILED"
                return

            with open(request_file, "r") as f:
                request = json.load(f)

            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.num_params = int(request.get("params", 0))  # Ensuring type safety
            self.date = request.get("submitted_time", "")
            self.architecture = request.get("architectures", "Unknown")
            self.status = request.get("status", "FAILED")

        except FileNotFoundError:
            self.status = "FAILED"
            logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
        except JSONDecodeError:
            self.status = "FAILED"
            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
        except KeyError as e:
            self.status = "FAILED"
            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
        except Exception as e:  # Catch-all for any other unexpected exceptions
            self.status = "FAILED"
            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")

    def update_with_dynamic_file_dict(self, file_dict):
        """Update license, likes, hub presence, tags and the flagged state from `file_dict`.

        Missing keys fall back to defaults; a missing "still_on_hub" is treated as False.
        """
        self.license = file_dict.get("license", "?")
        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
        self.tags = file_dict.get("tags", [])

        # A model is flagged as soon as one of its tags contains "flagged".
        self.not_flagged = not (any("flagged" in tag for tag in self.tags))

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display.

        Raises:
            KeyError: if a task's benchmark is absent from `self.results`.
        """
        # The average is over all tasks, so a missing score counts as 0.
        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.fullname.name: self.full_model,
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            # These two columns hold True when the model is NOT a merge / NOT a MoE.
            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
            AutoEvalColumn.moe.name: not (
                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
            ),
            AutoEvalColumn.not_flagged.name: self.not_flagged,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict
|
196 |
-
|
197 |
-
|
198 |
-
def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED.

    Parameters:
    - requests_path: root folder of the request files.
    - model_name: model path used to build the "<model_name>_eval_request_*.json" glob pattern.
    - precision: precision name; only the part after the last "." is compared.

    Returns:
    - The path (str) of a request file whose status is "FINISHED" and whose
      precision matches, or None when no file qualifies. When several files
      qualify, the last one in descending name order is returned.

    Raises:
    - KeyError: if a request file lacks the "status" or "precision" key.
    """
    wanted_precision = precision.split(".")[-1]

    # Candidates are visited by name in descending order.
    candidates = sorted(Path(requests_path).glob(f"{model_name}_eval_request_*.json"), reverse=True)

    # The result is kept separate from the loop variable: reusing one name for
    # both made the function return the last file visited even when it did not match.
    selected = None
    for candidate in candidates:
        with candidate.open("r") as f:
            req_content = json.load(f)
        if req_content["status"] == "FINISHED" and req_content["precision"] == wanted_precision:
            selected = str(candidate)

    return selected
|
219 |
-
|
220 |
-
|
221 |
-
def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results.

    Result files are processed in the order of the datetime encoded in their
    name, merged per `eval_name`, and only the FINISHED evaluations whose dict
    form can be built are returned.
    """
    with open(dynamic_path) as dynamic_file:
        dynamic_data = json.load(dynamic_file)

    result_files = list(Path(results_path).rglob("results_*.json"))
    result_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))

    merged_by_name = {}
    # tqdm wraps the iteration for progress display
    for result_file in tqdm(result_files, desc="Processing model files"):
        current = EvalResult.init_from_json_file(result_file)
        with logging_redirect_tqdm():
            current.update_with_request_file(requests_path)

        if current.full_model in dynamic_data:
            current.update_with_dynamic_file_dict(dynamic_data[current.full_model])
            # Hardcoding because of gating problem
            # NOTE(review): nesting under the membership test is reconstructed from
            # flattened source; the outcome is the same either way because
            # still_on_hub defaults to True.
            if any([org in current.full_model for org in ["meta-llama/", "google/", "tiiuae/"]]):
                current.still_on_hub = True

        # Store results of same eval together
        name = current.eval_name
        if name not in merged_by_name.keys():
            merged_by_name[name] = current
        else:
            fresh_scores = {bench: score for bench, score in current.results.items() if score is not None}
            merged_by_name[name].results.update(fresh_scores)

    complete_results = []
    for name, merged in merged_by_name.items():
        try:
            # NOTE(review): both statements are assumed to sit under the status
            # check (indentation is lost in the source) - confirm.
            if merged.status == "FINISHED":
                merged.to_dict()  # we test if the dict version is complete
                complete_results.append(merged)
        except KeyError as e:
            # not all eval values present
            logging.error(f"Error while checking model {name} {merged.date} json, no key: {e}")
            continue

    return complete_results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/populate.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import pathlib
|
2 |
import pandas as pd
|
|
|
3 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
4 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
|
5 |
from src.leaderboard.filter_models import filter_models_flags
|
6 |
-
from src.leaderboard.read_evals import get_raw_eval_results
|
7 |
from src.display.utils import load_json_data
|
8 |
|
9 |
|
@@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols):
|
|
39 |
return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
|
40 |
|
41 |
|
42 |
-
def get_leaderboard_df(
|
43 |
"""Retrieve and process leaderboard data."""
|
44 |
-
|
45 |
-
|
|
|
46 |
filter_models_flags(all_data_json)
|
47 |
|
48 |
df = pd.DataFrame.from_records(all_data_json)
|
49 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
50 |
df = df[cols].round(decimals=2)
|
51 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
52 |
-
return
|
|
|
1 |
import pathlib
|
2 |
import pandas as pd
|
3 |
+
from datasets import Dataset
|
4 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
5 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
|
6 |
from src.leaderboard.filter_models import filter_models_flags
|
|
|
7 |
from src.display.utils import load_json_data
|
8 |
|
9 |
|
|
|
39 |
return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
|
40 |
|
41 |
|
42 |
+
def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols: list):
    """Retrieve and process leaderboard data.

    Parameters:
    - leaderboard_dataset: dataset holding one row per evaluated model.
    - cols: columns to keep in the returned frame, in display order.
    - benchmark_cols: columns that must not contain NaN for a row to be kept.

    Returns:
    - A DataFrame sorted by descending average, restricted to `cols`, rounded
      to 2 decimals and without rows missing a benchmark value. An empty
      dataset yields an empty frame with the `cols` columns.
    """
    # Dataset.to_dict() is column-oriented ({column: [values...]}); pivot it into row dicts.
    columns = leaderboard_dataset.to_dict()
    all_data_json = [dict(zip(columns, row)) for row in zip(*columns.values())]
    # The return value is not used here, as in the original call.
    filter_models_flags(all_data_json)

    # With no rows the frame would have no columns and the sort below would raise KeyError.
    if not all_data_json:
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
|
src/scripts/update_all_request_files.py
DELETED
@@ -1,129 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import time
|
4 |
-
|
5 |
-
from huggingface_hub import snapshot_download
|
6 |
-
|
7 |
-
from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
|
8 |
-
from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
|
9 |
-
|
10 |
-
|
11 |
-
def update_one_model(model_id, data, models_on_the_hub):
    """Refresh the hub metadata stored in `data` for `model_id` and return `data`.

    `models_on_the_hub` maps model ids to their hub info objects. `data` is
    updated in place.
    """
    # Model no longer on the hub at all
    if model_id not in models_on_the_hub:
        data.update(
            {
                "still_on_hub": False,
                "likes": 0,
                "downloads": 0,
                "created_at": "",
                "tags": [],
            }
        )
        return data

    # Grabbing model parameters
    model_cfg = models_on_the_hub[model_id]
    card = model_cfg.card_data
    data["likes"] = model_cfg.likes
    data["downloads"] = model_cfg.downloads
    data["created_at"] = str(model_cfg.created_at)
    data["license"] = "" if card is None else card.license

    # Grabbing model details
    if card is not None and isinstance(card.base_model, str):
        lookup_name = card.base_model  # for adapters, we look at the parent model
    else:
        lookup_name = model_id

    still_on_hub, _, _ = is_model_on_hub(
        model_name=lookup_name,
        revision=data.get("revision"),
        trust_remote_code=True,
        test_tokenizer=False,
        token=H4_TOKEN,
    )

    # If the model doesn't have a model card or a license, we consider it's deleted
    model_card = None
    if still_on_hub:
        try:
            status, _, model_card = check_model_card(model_id)
        except Exception:
            model_card = None
            still_on_hub = False
        else:
            if status is False:
                still_on_hub = False
    data["still_on_hub"] = still_on_hub

    data["tags"] = get_model_tags(model_card, model_id) if still_on_hub else []
    return data
|
55 |
-
|
56 |
-
|
57 |
-
def update_models(file_path, models_on_the_hub):
    """
    Refresh every model entry of the JSON file at `file_path`, add entries for
    models that only appear as request files under EVAL_REQUESTS_PATH, and
    write the result back to `file_path`.
    """
    with open(file_path, "r") as infos_file:
        model_infos = json.load(infos_file)

    known_ids = list(model_infos.keys())
    for known_id in known_ids:
        model_infos[known_id] = update_one_model(
            model_id=known_id, data=model_infos[known_id], models_on_the_hub=models_on_the_hub
        )

    # If new requests files have been created since we started all this
    # we grab them
    requested_ids = []
    try:
        walker = os.walk(EVAL_REQUESTS_PATH)
        next(walker, None)  # the first entry is the root folder itself: skipped
        for folder, _, filenames in walker:
            org_name = folder.split("/")[-1]
            for filename in filenames:
                if "eval_request" not in filename:
                    continue
                requested_ids.append(org_name + "/" + filename.split("_eval_request")[0])
    except Exception as e:
        # Best effort: whatever was collected before the failure is still used.
        print(e)

    for requested_id in requested_ids:
        if requested_id in known_ids:
            continue
        model_infos[requested_id] = update_one_model(
            model_id=requested_id, data={}, models_on_the_hub=models_on_the_hub
        )

    with open(file_path, "w") as infos_file:
        json.dump(model_infos, infos_file, indent=2)
|
92 |
-
|
93 |
-
|
94 |
-
def update_dynamic_files():
|
95 |
-
"""This will only update metadata for models already linked in the repo, not add missing ones."""
|
96 |
-
snapshot_download(
|
97 |
-
repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
98 |
-
)
|
99 |
-
|
100 |
-
print("UPDATE_DYNAMIC: Loaded snapshot")
|
101 |
-
# Get models
|
102 |
-
start = time.time()
|
103 |
-
|
104 |
-
models = list(
|
105 |
-
API.list_models(
|
106 |
-
# filter=ModelFilter(task="text-generation"),
|
107 |
-
full=False,
|
108 |
-
cardData=True,
|
109 |
-
fetch_config=True,
|
110 |
-
)
|
111 |
-
)
|
112 |
-
id_to_model = {model.id: model for model in models}
|
113 |
-
|
114 |
-
print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
|
115 |
-
|
116 |
-
start = time.time()
|
117 |
-
|
118 |
-
update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
|
119 |
-
|
120 |
-
print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
|
121 |
-
|
122 |
-
API.upload_file(
|
123 |
-
path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
|
124 |
-
path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
|
125 |
-
repo_id=DYNAMIC_INFO_REPO,
|
126 |
-
repo_type="dataset",
|
127 |
-
commit_message="Daily request file update.",
|
128 |
-
)
|
129 |
-
print("UPDATE_DYNAMIC: pushed to hub")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/submission/submit.py
CHANGED
@@ -7,9 +7,6 @@ from huggingface_hub import snapshot_download
|
|
7 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
8 |
from src.envs import (
|
9 |
API,
|
10 |
-
DYNAMIC_INFO_FILE_PATH,
|
11 |
-
DYNAMIC_INFO_PATH,
|
12 |
-
DYNAMIC_INFO_REPO,
|
13 |
EVAL_REQUESTS_PATH,
|
14 |
H4_TOKEN,
|
15 |
QUEUE_REPO,
|
@@ -35,7 +32,6 @@ def add_new_eval(
|
|
35 |
base_model: str,
|
36 |
revision: str,
|
37 |
precision: str,
|
38 |
-
private: bool,
|
39 |
weight_type: str,
|
40 |
model_type: str,
|
41 |
):
|
@@ -126,7 +122,6 @@ def add_new_eval(
|
|
126 |
"model": model,
|
127 |
"base_model": base_model,
|
128 |
"revision": model_info.sha, # force to use the exact model commit
|
129 |
-
"private": private,
|
130 |
"precision": precision,
|
131 |
"params": model_size,
|
132 |
"architectures": architecture,
|
@@ -154,7 +149,7 @@ def add_new_eval(
|
|
154 |
print("Creating eval file")
|
155 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
156 |
os.makedirs(OUT_DIR, exist_ok=True)
|
157 |
-
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
|
158 |
|
159 |
with open(out_path, "w") as f:
|
160 |
f.write(json.dumps(eval_entry))
|
|
|
7 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
8 |
from src.envs import (
|
9 |
API,
|
|
|
|
|
|
|
10 |
EVAL_REQUESTS_PATH,
|
11 |
H4_TOKEN,
|
12 |
QUEUE_REPO,
|
|
|
32 |
base_model: str,
|
33 |
revision: str,
|
34 |
precision: str,
|
|
|
35 |
weight_type: str,
|
36 |
model_type: str,
|
37 |
):
|
|
|
122 |
"model": model,
|
123 |
"base_model": base_model,
|
124 |
"revision": model_info.sha, # force to use the exact model commit
|
|
|
125 |
"precision": precision,
|
126 |
"params": model_size,
|
127 |
"architectures": architecture,
|
|
|
149 |
print("Creating eval file")
|
150 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
151 |
os.makedirs(OUT_DIR, exist_ok=True)
|
152 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
153 |
|
154 |
with open(out_path, "w") as f:
|
155 |
f.write(json.dumps(eval_entry))
|
src/tools/collections.py
DELETED
@@ -1,76 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
|
3 |
-
from huggingface_hub.utils._errors import HfHubHTTPError
|
4 |
-
from pandas import DataFrame
|
5 |
-
|
6 |
-
from src.display.utils import AutoEvalColumn, ModelType
|
7 |
-
from src.envs import H4_TOKEN, PATH_TO_COLLECTION
|
8 |
-
|
9 |
-
# Specific intervals for the collections
|
10 |
-
intervals = {
|
11 |
-
"1B": pd.Interval(0, 1.5, closed="right"),
|
12 |
-
"3B": pd.Interval(2.5, 3.5, closed="neither"),
|
13 |
-
"7B": pd.Interval(6, 8, closed="neither"),
|
14 |
-
"13B": pd.Interval(10, 14, closed="neither"),
|
15 |
-
"30B": pd.Interval(25, 35, closed="neither"),
|
16 |
-
"65B": pd.Interval(60, 70, closed="neither"),
|
17 |
-
}
|
18 |
-
|
19 |
-
|
20 |
-
def _filter_by_type_and_size(df, model_type, size_interval):
|
21 |
-
"""Filter DataFrame by model type and parameter size interval."""
|
22 |
-
type_emoji = model_type.value.symbol[0]
|
23 |
-
filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
|
24 |
-
params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
25 |
-
mask = params_column.apply(lambda x: x in size_interval)
|
26 |
-
return filtered_df.loc[mask]
|
27 |
-
|
28 |
-
|
29 |
-
def _add_models_to_collection(collection, models, model_type, size):
|
30 |
-
"""Add best models to the collection and update positions."""
|
31 |
-
cur_len_collection = len(collection.items)
|
32 |
-
for ix, model in enumerate(models, start=1):
|
33 |
-
try:
|
34 |
-
collection = add_collection_item(
|
35 |
-
PATH_TO_COLLECTION,
|
36 |
-
item_id=model,
|
37 |
-
item_type="model",
|
38 |
-
exists_ok=True,
|
39 |
-
note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
|
40 |
-
token=H4_TOKEN,
|
41 |
-
)
|
42 |
-
# Ensure position is correct if item was added
|
43 |
-
if len(collection.items) > cur_len_collection:
|
44 |
-
item_object_id = collection.items[-1].item_object_id
|
45 |
-
update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
|
46 |
-
cur_len_collection = len(collection.items)
|
47 |
-
break # assuming we only add the top model
|
48 |
-
except HfHubHTTPError:
|
49 |
-
continue
|
50 |
-
|
51 |
-
|
52 |
-
def update_collections(df: DataFrame):
|
53 |
-
"""Update collections by filtering and adding the best models."""
|
54 |
-
collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
|
55 |
-
cur_best_models = []
|
56 |
-
|
57 |
-
for model_type in ModelType:
|
58 |
-
if not model_type.value.name:
|
59 |
-
continue
|
60 |
-
for size, interval in intervals.items():
|
61 |
-
filtered_df = _filter_by_type_and_size(df, model_type, interval)
|
62 |
-
best_models = list(
|
63 |
-
filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
|
64 |
-
)
|
65 |
-
print(model_type.value.symbol, size, best_models)
|
66 |
-
_add_models_to_collection(collection, best_models, model_type, size)
|
67 |
-
cur_best_models.extend(best_models)
|
68 |
-
|
69 |
-
# Cleanup
|
70 |
-
existing_models = {item.item_id for item in collection.items}
|
71 |
-
to_remove = existing_models - set(cur_best_models)
|
72 |
-
for item_id in to_remove:
|
73 |
-
try:
|
74 |
-
delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
|
75 |
-
except HfHubHTTPError:
|
76 |
-
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{scripts → tools}/create_request_file.py
RENAMED
File without changes
|
src/tools/plots.py
CHANGED
@@ -6,10 +6,9 @@ from plotly.graph_objs import Figure
|
|
6 |
from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
|
7 |
from src.display.utils import human_baseline_row as HUMAN_BASELINE
|
8 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
9 |
-
from src.leaderboard.read_evals import EvalResult
|
10 |
|
11 |
|
12 |
-
def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
13 |
"""
|
14 |
Generates a DataFrame containing the maximum scores until each date.
|
15 |
|
@@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
|
17 |
:return: A new DataFrame containing the maximum scores until each date for every metric.
|
18 |
"""
|
19 |
# Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
|
20 |
-
results_df = pd.DataFrame(raw_data)
|
21 |
-
# results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
|
22 |
results_df.sort_values(by="date", inplace=True)
|
23 |
|
24 |
# Step 2: Initialize the scores dictionary
|
|
|
6 |
from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
|
7 |
from src.display.utils import human_baseline_row as HUMAN_BASELINE
|
8 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
|
|
9 |
|
10 |
|
11 |
+
def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
|
12 |
"""
|
13 |
Generates a DataFrame containing the maximum scores until each date.
|
14 |
|
|
|
16 |
:return: A new DataFrame containing the maximum scores until each date for every metric.
|
17 |
"""
|
18 |
# Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
|
19 |
+
results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
|
|
|
20 |
results_df.sort_values(by="date", inplace=True)
|
21 |
|
22 |
# Step 2: Initialize the scores dictionary
|