Clémentine committed
Commit 3dfaf22 • 1 Parent(s): eaace79

add model architecture as column

Browse files:
- app.py +1 -1
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +36 -22
- src/populate.py +2 -2
- src/submission/check_validity.py +4 -3
- src/submission/submit.py +2 -2
app.py
CHANGED
@@ -54,7 +54,7 @@ except Exception:
     restart_space()
 
 
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
src/display/utils.py
CHANGED
@@ -34,6 +34,7 @@ class AutoEvalColumn:  # Auto evals column
     gsm8k = ColumnContent("GSM8K", "number", True)
     drop = ColumnContent("DROP", "number", True)
     model_type = ColumnContent("Type", "str", False)
+    architecture = ColumnContent("Architecture", "str", False)
     weight_type = ColumnContent("Weight type", "str", False, True)
     precision = ColumnContent("Precision", "str", False)  # , True)
     license = ColumnContent("Hub License", "str", False)
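For context, `ColumnContent` is defined elsewhere in src/display/utils.py and is not part of this diff. A minimal sketch of its assumed shape (field names here are illustrative, inferred from the positional calls above), showing how the new entry adds an "Architecture" string column that is not displayed by default:

```python
from dataclasses import dataclass

@dataclass
class ColumnContent:
    # Assumed fields, matching the positional arguments used above:
    # (display name, dtype, displayed_by_default, hidden)
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

# The new AutoEvalColumn entry: a string column, hidden until selected.
architecture = ColumnContent("Architecture", "str", False)
print(architecture.name)  # "Architecture" -> used as the dataframe column key
```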
src/leaderboard/read_evals.py
CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 
 import dateutil
 from datetime import datetime
+from transformers import AutoConfig
 import numpy as np
 
 from src.display.formatting import make_clickable_model
@@ -15,24 +16,26 @@ from src.submission.check_validity import is_model_on_hub
 
 @dataclass
 class EvalResult:
-    eval_name: str
-    full_model: str
-    org: str
+    # Also see src.display.utils.AutoEvalColumn for what will be displayed.
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     precision: str = ""
-    model_type: ModelType = ModelType.Unknown
-    weight_type: str = "Original"
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: str = "Original"  # Original or Adapter
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
    still_on_hub: bool = False
 
     @classmethod
     def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
 
@@ -58,9 +61,14 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision}"
         full_model = "/".join(org_and_model)
 
-        still_on_hub, error = is_model_on_hub(
+        still_on_hub, error, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True
         )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -96,18 +104,21 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=config.get("model_sha", ""),
+            precision=precision,
+            revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
+            architecture=architecture
         )
 
-    def update_with_request_file(self):
-        request_file = get_request_file_for_model(self.full_model, self.precision)
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
 
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = request.get("weight_type", "?")
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
@@ -116,6 +127,7 @@ class EvalResult:
             print(f"Could not find request file for {self.org}/{self.model}")
 
     def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
@@ -123,6 +135,7 @@ class EvalResult:
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
@@ -139,9 +152,10 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(model_name, precision):
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     request_files = os.path.join(
-
+        requests_path,
         f"{model_name}_eval_request_*.json",
     )
     request_files = glob.glob(request_files)
@@ -160,8 +174,9 @@ def get_request_file_for_model(model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str) -> list[EvalResult]:
-
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
@@ -174,15 +189,14 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
-        # up_to_date = files[-1]
         for file in files:
-
+            model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
-    for
+    for model_result_filepath in model_result_filepaths:
         # Creation of result
-        eval_result = EvalResult.init_from_json_file(
-        eval_result.update_with_request_file()
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
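The architecture string now comes straight from the Hub config (via the extra return value of `is_model_on_hub`) instead of the request file. A self-contained sketch of the same extraction, assuming only `transformers` is installed; the model name below is just an example:

```python
from transformers import AutoConfig

def get_architecture(model_name: str, revision: str = "main") -> str:
    """Mirrors the logic added to init_from_json_file: read the config's
    `architectures` list and join it with ";", falling back to "?"."""
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision)
    except Exception:
        return "?"  # model or revision not reachable on the Hub
    architectures = getattr(config, "architectures", None)
    return ";".join(architectures) if architectures else "?"

print(get_architecture("gpt2"))  # e.g. "GPT2LMHeadModel"
```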
src/populate.py
CHANGED
@@ -9,8 +9,8 @@ from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path)
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
     filter_models(all_data_json)
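Any caller of `get_leaderboard_df` now has to supply the requests folder as well. A usage sketch mirroring the updated call in app.py; the import locations of the constants and column lists are assumptions, they are not shown in this commit:

```python
from src.display.utils import COLS, BENCHMARK_COLS          # assumed location
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH  # assumed location
from src.populate import get_leaderboard_df

raw_data, original_df = get_leaderboard_df(
    EVAL_RESULTS_PATH,   # folder with per-model result json files
    EVAL_REQUESTS_PATH,  # folder with eval request json files (new argument)
    COLS,
    BENCHMARK_COLS,
)
```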
src/submission/check_validity.py
CHANGED
@@ -38,17 +38,18 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
     try:
-        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        return True, None
+        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+            None
         )
 
     except Exception:
-        return False, "was not found on hub!"
+        return False, "was not found on hub!", None
 
 
 def get_model_size(model_info: ModelInfo, precision: str):
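`is_model_on_hub` now returns a third element (the loaded `AutoConfig`, or `None` on failure), so every call site has to unpack three values, as submit.py does below with `_`. Note that the return annotation in the diff still reads `tuple[bool, str]`. A minimal sketch of the new contract; the model id is a placeholder:

```python
from src.submission.check_validity import is_model_on_hub

still_on_hub, error, config = is_model_on_hub("some-org/some-model", "main")
if not still_on_hub:
    print(f"Rejected: {error}")
else:
    # config is the AutoConfig that was loaded; its `architectures` list
    # feeds the new "Architecture" column.
    print(getattr(config, "architectures", None))
```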
src/submission/submit.py
CHANGED
@@ -48,12 +48,12 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error = is_model_on_hub(base_model, revision, H4_TOKEN)
+        base_model_on_hub, error, _ = is_model_on_hub(base_model, revision, H4_TOKEN)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error = is_model_on_hub(model, revision)
+        model_on_hub, error, _ = is_model_on_hub(model, revision)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 