Simplify Space
- .gitignore +5 -1
- README.md +1 -1
- app.py +43 -144
- requirements.txt +79 -6
- scripts/create_request_file.py +0 -107
- src/display/about.py +11 -75
- src/display/css_html_js.py +0 -13
- src/display/utils.py +4 -89
- src/envs.py +4 -5
- src/leaderboard/read_evals.py +12 -95
- src/populate.py +6 -6
- src/submission/check_validity.py +0 -103
- src/submission/submit.py +7 -65
.gitignore
CHANGED
@@ -1,7 +1,7 @@
 auto_evals/
 venv/
 __pycache__/
-env
+.env
 .ipynb_checkpoints
 *ipynb
 .vscode/
@@ -13,3 +13,7 @@ eval-results/
 auto_evals/
 
 src/assets/model_counts.html
+
+test
+env
+a.py
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: Azerbaijani LLM Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
app.py
CHANGED
@@ -6,11 +6,8 @@ import os
 os.environ['CURL_CA_BUNDLE'] = ''
 
 from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
@@ -19,13 +16,9 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    NUMERIC_INTERVALS,
     TYPES,
     AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
+    fields
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -38,20 +31,32 @@ def restart_space():
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO,
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        force_download=True,
+        token=TOKEN
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO,
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        force_download=True,
+        token=TOKEN
     )
 except Exception:
     restart_space()
 
 
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH,
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 
 (
@@ -65,14 +70,9 @@ leaderboard_df = original_df.copy()
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
     query: str,
 ):
-    filtered_df =
-    filtered_df = filter_queries(query, filtered_df)
+    filtered_df = filter_queries(query, hidden_df)
     df = select_columns(filtered_df, columns)
     return df
 
@@ -83,7 +83,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
 
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
-        AutoEvalColumn.
+        AutoEvalColumn.model_submission_date.name,
         AutoEvalColumn.model.name,
     ]
     # We use COLS to maintain sorting
@@ -98,7 +98,6 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
     if query != "":
         queries = [q.strip() for q in query.split(";")]
         for _q in queries:
-            _q = _q.strip()
             if _q != "":
                 temp_filtered_df = search_table(filtered_df, _q)
                 if len(temp_filtered_df) > 0:
@@ -106,33 +105,12 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
         if len(final_df) > 0:
             filtered_df = pd.concat(final_df)
             filtered_df = filtered_df.drop_duplicates(
-                subset=[AutoEvalColumn.model.name, AutoEvalColumn.
+                subset=[AutoEvalColumn.model.name, AutoEvalColumn.model_submission_date.name]
             )
 
     return filtered_df
 
 
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    if show_deleted:
-        filtered_df = df
-    else:  # Show only still on the hub models
-        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
-
-    type_emoji = [t[0] for t in type_query]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-    filtered_df = filtered_df.loc[mask]
-
-    return filtered_df
-
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -141,55 +119,27 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
-                with gr.
-
-
-
-
-                        elem_id="search-bar",
-                    )
-                with gr.Row():
-                    shown_columns = gr.CheckboxGroup(
-                        choices=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if not c.hidden and not c.never_hidden and not c.dummy
-                        ],
-                        value=[
-                            c.name
-                            for c in fields(AutoEvalColumn)
-                            if c.displayed_by_default and not c.hidden and not c.never_hidden
-                        ],
-                        label="Select columns to show",
-                        elem_id="column-select",
-                        interactive=True,
-                    )
-                with gr.Row():
-                    deleted_models_visibility = gr.Checkbox(
-                        value=False, label="Show gated/private/deleted models", interactive=True
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
                     )
-            with gr.
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        elem_id="filter-columns-precision",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
+                with gr.Row():
+                    shown_columns = gr.CheckboxGroup(
+                        choices=[
+                            c.name
+                            for c in fields(AutoEvalColumn)
+                            if not c.hidden and not c.never_hidden and not c.dummy
+                        ],
+                        value=[
+                            c.name
+                            for c in fields(AutoEvalColumn)
+                            if c.displayed_by_default and not c.hidden and not c.never_hidden
+                        ],
+                        label="Select columns to show",
+                        elem_id="column-select",
                         interactive=True,
-                        elem_id="filter-columns-size",
                     )
 
             leaderboard_table = gr.components.Dataframe(
@@ -198,12 +148,12 @@ with demo:
                     + shown_columns.value
                     + [AutoEvalColumn.dummy.name]
                 ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name],
                 datatype=TYPES,
                 elem_id="leaderboard-table",
                 interactive=False,
                 visible=True,
-                column_widths=["
+                column_widths=["15%", "30%"]
             )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
@@ -218,33 +168,22 @@ with demo:
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
-                    filter_columns_type,
-                    filter_columns_precision,
-                    filter_columns_size,
-                    deleted_models_visibility,
                     search_bar,
                 ],
                 leaderboard_table,
             )
-            for selector in [shown_columns
+            for selector in [shown_columns]:
                 selector.change(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
                         shown_columns,
-                        filter_columns_type,
-                        filter_columns_precision,
-                        filter_columns_size,
-                        deleted_models_visibility,
                         search_bar,
                     ],
                     leaderboard_table,
                     queue=True,
                 )
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
                 with gr.Row():
@@ -289,59 +228,19 @@ with demo:
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
                 with gr.Row():
-                    with gr.
+                    with gr.Row():
                         model_name_textbox = gr.Textbox(label="Model name")
-                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                        model_type = gr.Dropdown(
-                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                            label="Model type",
-                            multiselect=False,
-                            value=None,
-                            interactive=True,
-                        )
-
-                    with gr.Column():
-                        precision = gr.Dropdown(
-                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                            label="Precision",
-                            multiselect=False,
-                            value="float16",
-                            interactive=True,
-                        )
-                        weight_type = gr.Dropdown(
-                            choices=[i.value.name for i in WeightType],
-                            label="Weights type",
-                            multiselect=False,
-                            value="Original",
-                            interactive=True,
-                        )
-                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
                 submit_button = gr.Button("Submit Eval")
                 submission_result = gr.Markdown()
                 submit_button.click(
                     add_new_eval,
                     [
-                        model_name_textbox
-                        base_model_name_textbox,
-                        revision_name_textbox,
-                        precision,
-                        weight_type,
-                        model_type,
+                        model_name_textbox
                     ],
                     submission_result,
                 )
 
-            with gr.Row():
-                with gr.Accordion("📙 Citation", open=False):
-                    citation_button = gr.Textbox(
-                        value=CITATION_BUTTON_TEXT,
-                        label=CITATION_BUTTON_LABEL,
-                        lines=20,
-                        elem_id="citation-button",
-                        show_copy_button=True,
-                    )
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=300)
 scheduler.start()
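For context, the table-refresh path that survives this simplification is just a search filter plus a column projection. The sketch below mirrors that wiring in plain pandas; the `model_name_for_query` column stands in for `AutoEvalColumn.dummy.name`, and while the helper names match the diff, the bodies are an illustration rather than the Space's exact code.

```python
import pandas as pd

def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Match the query against the hidden full-name column the search bar uses.
    return df[df["model_name_for_query"].str.contains(query, case=False, na=False)]

def update_table(hidden_df: pd.DataFrame, columns: list, query: str) -> pd.DataFrame:
    # Split "a; b"-style queries, keep rows matching any of them, then project the chosen columns.
    filtered = hidden_df
    if query.strip():
        parts = [q.strip() for q in query.split(";") if q.strip()]
        matches = [m for m in (search_table(hidden_df, q) for q in parts) if len(m) > 0]
        if matches:
            filtered = pd.concat(matches).drop_duplicates()
    return filtered[[c for c in columns if c in filtered.columns]]

# Example
df = pd.DataFrame({"Model": ["org/a", "org/b"], "Average ⬆️": [51.2, 47.9],
                   "model_name_for_query": ["org/a", "org/b"]})
print(update_table(df, ["Model", "Average ⬆️"], "org/a"))
```

Wiring `search_bar.submit(...)` and `shown_columns.change(...)` to a function of this shape is what the remaining Gradio event hookups above do.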
requirements.txt
CHANGED
@@ -1,15 +1,88 @@
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.2
+aiohttp==3.10.8
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.6.0
 APScheduler==3.10.1
+async-timeout==4.0.3
+attrs==24.2.0
 black==23.11.0
+certifi==2024.8.30
+charset-normalizer==3.3.2
 click==8.1.3
+contourpy==1.3.0
+cycler==0.12.1
 datasets==2.14.5
-
-
-
+dill==0.3.7
+exceptiongroup==1.2.2
+fastapi==0.115.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+frozenlist==1.4.1
+fsspec==2023.6.0
+gradio==4.44.1
+gradio_client==1.3.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.25.1
+idna==3.10
+importlib_resources==6.4.5
+Jinja2==3.1.4
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.7
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
 matplotlib==3.7.1
-
+mdurl==0.1.2
+multidict==6.1.0
+multiprocess==0.70.15
+mypy-extensions==1.0.0
+narwhals==1.8.4
+numpy==1.26.4
+orjson==3.10.7
+packaging==24.1
 pandas==2.0.0
+pathspec==0.12.1
+pillow==10.4.0
+platformdirs==4.3.6
+pyarrow==17.0.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.4
 python-dateutil==2.8.2
-
+python-multipart==0.0.12
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+regex==2024.9.11
+requests==2.32.3
+rich==13.8.1
+rpds-py==0.20.0
+ruff==0.6.8
+safetensors==0.4.5
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.38.6
+tokenizers==0.15.2
+tomli==2.0.1
+tomlkit==0.12.0
 tqdm==4.65.0
 transformers==4.35.2
-
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.2
+tzlocal==5.2
+urllib3==2.2.3
+uvicorn==0.31.0
+websockets==11.0.3
+xxhash==3.5.0
+yarl==1.13.1
scripts/create_request_file.py
DELETED
@@ -1,107 +0,0 @@
-import json
-import os
-import pprint
-import re
-from datetime import datetime, timezone
-
-import click
-from colorama import Fore
-from huggingface_hub import HfApi, snapshot_download
-
-EVAL_REQUESTS_PATH = "eval-queue"
-QUEUE_REPO = "LLM-Beetle/requests"
-
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
-model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
-weight_types = ("Original", "Delta", "Adapter")
-
-
-def get_model_size(model_info, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-
-def main():
-    api = HfApi()
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
-
-    model_name = click.prompt("Enter model name")
-    revision = click.prompt("Enter revision", default="main")
-    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
-    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
-    base_model = click.prompt("Enter base model", default="")
-    status = click.prompt("Enter status", default="FINISHED")
-
-    try:
-        model_info = api.model_info(repo_id=model_name, revision=revision)
-    except Exception as e:
-        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
-        return 1
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        license = "?"
-
-    eval_entry = {
-        "model": model_name,
-        "base_model": base_model,
-        "revision": revision,
-        "private": False,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": status,
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-    }
-
-    user_name = ""
-    model_path = model_name
-    if "/" in model_name:
-        user_name = model_name.split("/")[0]
-        model_path = model_name.split("/")[1]
-
-    pprint.pprint(eval_entry)
-
-    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
-        click.echo("continuing...")
-
-        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
-        os.makedirs(out_dir, exist_ok=True)
-        out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
-
-        with open(out_path, "w") as f:
-            f.write(json.dumps(eval_entry))
-
-        api.upload_file(
-            path_or_fileobj=out_path,
-            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            commit_message=f"Add {model_name} to eval queue",
-        )
-    else:
-        click.echo("aborting...")
-
-
-if __name__ == "__main__":
-    main()
src/display/about.py
CHANGED
@@ -12,99 +12,35 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("MMLU", "metric_name", "MMLU")
-    task1 = Task("
-    task2 = Task("
-    task3 = Task("
-    task4 = Task("
-    task5 = Task("Winogrande", "metric_name", "Winogrande")
+    task1 = Task("task_1", "metric_name", "task_1")
+    task2 = Task("task_2", "metric_name", "task_2")
+    task3 = Task("task_3", "metric_name", "task_3")
+    task4 = Task("task_4", "metric_name", "task_4")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title"> Azerbaijani Bank LLM Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Welcome to
-The Leadboard uses [this](https://huggingface.co/collections/malhajar/openllmturkishleadboard-v02-datasets-662a8593043e73938e2f6b1e) currfelly curated benchmarks for evaluation.
-The benchmarks are generated and checked using both GPT-4 and Human annotation rendering the leadboard the most valuable and accurate test in the LLM arena for Azerbaijani evaluation.
+Welcome to Kapital Bank's Azerbaijani LLM Leaderboard. We use benchmarks in finance, banking, and general knowledge for accurate evaluations.
 
 🚀 Submit Your Model 🚀
 
-
-
-Join the forefront of Azerbaijani language technology. Submit your model, and let's advance Azerbaijani LLM's together!
+If you have a fine-tuned Azerbaijani LLM, submit it for evaluation!
 
 """
 
-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-
-I use LM-Evaluation-Harness-Turkish, a version of the LM Evaluation Harness adapted for Turkish datasets, to ensure our leaderboard results are both reliable and replicable. Please see https://github.com/malhajar17/lm-evaluation-harness_turkish for more information
-
-## How to Reproduce Results:
-
-1) Set Up the repo: Clone the "lm-evaluation-harness_turkish" from https://github.com/malhajar17/lm-evaluation-harness_turkish and follow the installation instructions.
-2) Run Evaluations: To get the results as on the leaderboard (Some tests might show small variations), use the following command, adjusting for your model. For example, with the Trendyol model:
-```python
-lm_eval --model vllm --model_args pretrained=Orbina/Orbita-v0.1 --tasks mmlu_tr_v0.2,arc_tr-v0.2,gsm8k_tr-v0.2,hellaswag_tr-v0.2,truthfulqa_v0.2,winogrande_tr-v0.2 --output /workspace/Orbina/Orbita-v0.1
-```
-3) Report Results: The results file generated is then uploaded to the OpenLLM Turkish Leaderboard.
-
-## Notes:
-
-- I currently use "vllm" which might differ slightly as per the LM Evaluation Harness.
-- All the tests are using the same configuration used in the original OpenLLMLeadboard preciesly
-
-The tasks and few shots parameters are:
-- ARC: 25-shot, *arc-challenge* (`acc_norm`)
-- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
-- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
-- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
-- Winogrande: 5-shot, *winogrande* (`acc`)
-- GSM8k: 5-shot, *gsm8k* (`acc`)
-
-"""
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
 
-### 1) Make sure
-
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+### 1) Make sure your model exists on hub.
+### 2) Make sure your model is public.
 
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
-
-"""
-
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-@misc{openllm-Turkish-leaderboard,
-  author = {Mohamad Alhajar},
-  title = {Open LLM Turkish Leaderboard v0.2},
-  year = {2024},
-  publisher = {Mohamad Alhajar},
-  howpublished = "\url{https://huggingface.co/spaces/malhajar/OpenLLMTurkishLeaderboard}"
-}
-"""
+Please contact us if you are facing any trouble!
+"""
src/display/css_html_js.py
CHANGED
@@ -8,19 +8,6 @@ custom_css = """
     font-size: 18px !important;
 }
 
-#citation-button span {
-    font-size: 16px !important;
-}
-
-#citation-button textarea {
-    font-size: 16px !important;
-}
-
-#citation-button > label > button {
-    margin: 6px;
-    transform: scale(1.3);
-}
-
 #leaderboard-table {
     margin-top: 15px
 }
src/display/utils.py
CHANGED
@@ -1,7 +1,4 @@
 from dataclasses import dataclass, make_dataclass
-from enum import Enum
-
-import pandas as pd
 
 from src.display.about import Tasks
 
@@ -24,22 +21,12 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["
+auto_eval_column_dict.append(["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 
@@ -50,86 +37,14 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
-
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
+    submitted_time = ColumnContent("submitted_time", "str", True)
     status = ColumnContent("status", "str", True)
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    qt_8bit = ModelDetails("8bit")
-    qt_4bit = ModelDetails("4bit")
-    qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["8bit"]:
-            return Precision.qt_8bit
-        if precision in ["4bit"]:
-            return Precision.qt_4bit
-        if precision in ["GPTQ", "None"]:
-            return Precision.qt_GPTQ
-        return Precision.Unknown
-
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
-NUMERIC_INTERVALS = {
-    "?": pd.Interval(-1, 0, closed="right"),
-    "~1.5": pd.Interval(0, 2, closed="right"),
-    "~3": pd.Interval(2, 4, closed="right"),
-    "~7": pd.Interval(4, 9, closed="right"),
-    "~13": pd.Interval(9, 20, closed="right"),
-    "~35": pd.Interval(20, 45, closed="right"),
-    "~60": pd.Interval(45, 70, closed="right"),
-    "70+": pd.Interval(70, 10000, closed="right"),
-}
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
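Since most of the column machinery is deleted here, it helps to see what the surviving `make_dataclass` pattern actually produces. The sketch below is self-contained and assumes a `fields()` helper like the one this module defines (it reads the `ColumnContent` defaults off the generated class rather than using `dataclasses.fields`); the column set is illustrative.

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                      # header shown in the UI
    type: str                      # gradio datatype ("str", "number", "markdown", ...)
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

def fields(raw_class):
    # Assumed to mirror the module's own helper: return the ColumnContent defaults.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

# Same pattern as src/display/utils.py, with illustrative columns.
auto_eval_column_dict = [
    ["model_submission_date", ColumnContent, ColumnContent("Submission Date", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
    ["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)   # ['Submission Date', 'Model', 'Average ⬆️', 'model_name_for_query']
print(TYPES)  # ['str', 'markdown', 'number', 'str']
```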
src/envs.py
CHANGED
@@ -1,14 +1,13 @@
 import os
-
 from huggingface_hub import HfApi
 
 # clone / pull the lmeh eval data
 TOKEN = os.environ.get("HF_TOKEN", None)
 
-OWNER = "
-REPO_ID = f"{OWNER}/
-QUEUE_REPO = "
-RESULTS_REPO = "
+OWNER = "kavsar"
+REPO_ID = f"{OWNER}/frontend"
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/results"
 
 CACHE_PATH=os.getenv("HF_HOME", ".")
 
src/leaderboard/read_evals.py
CHANGED
@@ -8,27 +8,16 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn,
-from src.submission.check_validity import is_model_on_hub
-
+from src.display.utils import AutoEvalColumn, Tasks
 
 @dataclass
 class EvalResult:
-    eval_name: str  #
+    eval_name: str  # org_model_date (uid)
     full_model: str  # org/model (path on hub)
     org: str
     model: str
-    revision: str  # commit hash, "" if main
    results: dict
-    precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
    date: str = ""  # submission date of request file
-    still_on_hub: bool = False
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -38,32 +27,16 @@ class EvalResult:
 
         config = data.get("config")
 
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
         # Get model and org
-        org_and_model = config.get("model_name",
+        org_and_model = config.get("model_name", None)
         org_and_model = org_and_model.split("/", 1)
 
-
-
-
-
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
+        org = org_and_model[0]
+        model = org_and_model[1]
+        date = config.get("submitted_time", None)
+        result_key = f"{org}_{model}_{date}"
         full_model = "/".join(org_and_model)
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
@@ -83,46 +56,19 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+            date=date
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model_submission_date.name: self.date,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
@@ -131,29 +77,7 @@ class EvalResult:
         return data_dict
 
 
-def
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
-
-
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -163,10 +87,7 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
             continue
 
         # Sort the files by date
-
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
 
         for file in files:
            model_result_filepaths.append(os.path.join(root, file))
@@ -175,14 +96,10 @@ def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
-
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        eval_results[eval_name] = eval_result
 
     results = []
     for v in eval_results.values():
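After this change an `EvalResult` is identified purely by org, model, and submission date. The sketch below shows the minimal shape of a results JSON the reader can work with; only the `config.model_name` and `config.submitted_time` keys and the `org_model_date` uid are confirmed by the diff, so the layout of the `results` block (one score per task key from `Tasks`) is an assumption carried over from the original leaderboard template.

```python
import json

# Hypothetical results file; the "config" keys are the ones the reader actually uses.
raw = json.loads("""
{
  "config": {"model_name": "some-org/some-model", "submitted_time": "2024-10-01T12:00:00Z"},
  "results": {"MMLU": {"metric_name": 0.47}}
}
""")

config = raw["config"]
org, model = config["model_name"].split("/", 1)
date = config.get("submitted_time", None)
result_key = f"{org}_{model}_{date}"           # the new uid: org_model_date
score = raw["results"]["MMLU"]["metric_name"]  # assumed per-task layout
print(result_key, score)
```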
src/populate.py
CHANGED
@@ -8,8 +8,8 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str,
-    raw_data = get_raw_eval_results(results_path
+def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    raw_data = get_raw_eval_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
@@ -32,7 +32,6 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             data = json.load(fp)
 
         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-        data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
         all_evals.append(data)
     elif ".md" not in entry:
@@ -44,13 +43,14 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data = json.load(fp)
 
             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
             all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"]
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+
     return df_finished[cols], df_running[cols], df_pending[cols]
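The queue view is now a straight split of the loaded request files by their `status` field. A standalone version of that split, with made-up queue entries, behaves like this:

```python
import pandas as pd

# Illustrative queue entries; real ones come from the request JSON files.
all_evals = [
    {"model": "org/a", "status": "FINISHED", "submitted_time": "2024-09-30T10:00:00Z"},
    {"model": "org/b", "status": "PENDING", "submitted_time": "2024-10-01T09:00:00Z"},
    {"model": "org/c", "status": "RUNNING", "submitted_time": "2024-10-01T11:00:00Z"},
]
cols = ["model", "submitted_time", "status"]

pending_list = [e for e in all_evals if e["status"] == "PENDING"]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"] == "FINISHED"]

df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
df_running = pd.DataFrame.from_records(running_list, columns=cols)
df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
print(df_finished, df_running, df_pending, sep="\n")
```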
src/submission/check_validity.py
DELETED
@@ -1,103 +0,0 @@
-import json
-import os
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta, timezone
-
-import huggingface_hub
-from huggingface_hub import ModelCard
-from huggingface_hub.hf_api import ModelInfo
-from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
-
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Makes sure the model is on the hub, and uses a valid configuration (in the latest transformers version)"""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            tokenizer_config = get_tokenizer_config(model_name)
-            if tokenizer_config is not None:
-                tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
-            else:
-                tokenizer_class_candidate = config.tokenizer_class
-
-
-            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
-            if tokenizer_class is None:
-                return (
-                    False,
-                    f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.",
-                    None
-                )
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
-
-def already_submitted_models(requested_models_dir: str) -> set[str]:
-    depth = 1
-    file_names = []
-    users_to_submission_dates = defaultdict(list)
-
-    for root, _, files in os.walk(requested_models_dir):
-        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
-        if current_depth == depth:
-            for file in files:
-                if not file.endswith(".json"):
-                    continue
-                with open(os.path.join(root, file), "r") as f:
-                    info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
-
-                    # Select organisation
-                    if info["model"].count("/") == 0 or "submitted_time" not in info:
-                        continue
-                    organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
-
-    return set(file_names), users_to_submission_dates
src/submission/submit.py
CHANGED
@@ -2,96 +2,38 @@ import json
 import os
 from datetime import datetime, timezone
 
-from src.display.formatting import styled_error, styled_message
-from src.envs import API, EVAL_REQUESTS_PATH,
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.display.formatting import styled_error, styled_message
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO
 
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+def add_new_eval(model: str):
     user_name = ""
     model_path = model
     if "/" in model:
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
 
-    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
     # Is the model info correctly filled?
     try:
-        model_info = API.model_info(repo_id=model, revision=
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
+        model_info = API.model_info(repo_id=model, revision='main')
     except Exception:
-        return styled_error("
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
+        return styled_error("Could not get your model information.")
 
     # Seems good, creating the eval
     print("Adding new eval")
 
     eval_entry = {
         "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
     }
 
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{current_time}.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -109,5 +51,5 @@ def add_new_eval(
     os.remove(out_path)
 
     return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to five minutes for the model to show in the PENDING list."
     )
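Taken together, a submission now reduces to writing a three-field request JSON under the submitter's namespace. The sketch below reproduces the file name and payload format shown in the diff; the actual push to `QUEUE_REPO` (assumed to go through `HfApi.upload_file` in the unchanged tail of the function) is omitted, and the `eval-queue/` prefix stands in for `EVAL_REQUESTS_PATH`.

```python
import json
from datetime import datetime, timezone

model = "some-org/some-model"  # hypothetical submission
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# Three-field request entry, exactly as the simplified add_new_eval builds it.
eval_entry = {
    "model": model,
    "status": "PENDING",
    "submitted_time": current_time,
}

user_name, model_path = model.split("/", 1)
out_path = f"eval-queue/{user_name}/{model_path}_eval_request_{current_time}.json"
print(out_path)
print(json.dumps(eval_entry, indent=2))
```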