Spaces:
Running
Running
kexinhuang12345
committed on
Commit
•
6d97820
1
Parent(s):
1c85aff
update
Browse files- app.py +67 -129
- src/about.py +37 -35
- src/display/utils.py +69 -1
- src/envs.py +1 -1
- src/populate.py +40 -4
- src/submission/check_validity.py +1 -1
- src/submission/submit.py +44 -60
app.py
CHANGED
@@ -11,17 +11,23 @@ from src.about import (
|
|
11 |
INTRODUCTION_TEXT,
|
12 |
LLM_BENCHMARKS_TEXT,
|
13 |
TITLE,
|
|
|
14 |
)
|
15 |
from src.display.css_html_js import custom_css
|
16 |
from src.display.utils import (
|
17 |
BENCHMARK_COLS,
|
18 |
COLS,
|
|
|
19 |
EVAL_COLS,
|
20 |
EVAL_TYPES,
|
21 |
NUMERIC_INTERVALS,
|
22 |
TYPES,
|
23 |
-
|
|
|
24 |
ModelType,
|
|
|
|
|
|
|
25 |
fields,
|
26 |
WeightType,
|
27 |
Precision
|
@@ -50,44 +56,32 @@ except Exception:
|
|
50 |
restart_space()
|
51 |
|
52 |
|
53 |
-
|
54 |
leaderboard_df = original_df.copy()
|
55 |
|
56 |
-
(
|
57 |
-
finished_eval_queue_df,
|
58 |
-
running_eval_queue_df,
|
59 |
-
pending_eval_queue_df,
|
60 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
61 |
-
|
62 |
-
|
63 |
# Searching and filtering
|
64 |
def update_table(
|
65 |
hidden_df: pd.DataFrame,
|
66 |
columns: list,
|
67 |
-
type_query: list,
|
68 |
-
precision_query: str,
|
69 |
-
size_query: list,
|
70 |
-
show_deleted: bool,
|
71 |
query: str,
|
72 |
):
|
73 |
-
filtered_df = filter_models(hidden_df,
|
74 |
-
filtered_df = filter_queries(query,
|
75 |
df = select_columns(filtered_df, columns)
|
76 |
return df
|
77 |
|
78 |
|
79 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
80 |
-
return df[(df[
|
81 |
|
82 |
|
83 |
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
84 |
always_here_cols = [
|
85 |
-
|
86 |
-
AutoEvalColumn.model.name,
|
87 |
]
|
88 |
# We use COLS to maintain sorting
|
89 |
filtered_df = df[
|
90 |
-
always_here_cols + [c for c in
|
91 |
]
|
92 |
return filtered_df
|
93 |
|
@@ -105,40 +99,39 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
|
|
105 |
if len(final_df) > 0:
|
106 |
filtered_df = pd.concat(final_df)
|
107 |
filtered_df = filtered_df.drop_duplicates(
|
108 |
-
subset=[
|
109 |
)
|
110 |
|
111 |
return filtered_df
|
112 |
|
113 |
|
114 |
def filter_models(
|
115 |
-
df: pd.DataFrame,
|
116 |
) -> pd.DataFrame:
|
117 |
# Show all models
|
118 |
if show_deleted:
|
119 |
filtered_df = df
|
120 |
else: # Show only still on the hub models
|
121 |
-
filtered_df = df[df[
|
122 |
|
123 |
-
type_emoji = [t[0] for t in type_query]
|
124 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
125 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
126 |
|
127 |
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
128 |
-
params_column = pd.to_numeric(df[
|
129 |
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
130 |
filtered_df = filtered_df.loc[mask]
|
131 |
|
132 |
return filtered_df
|
133 |
|
134 |
-
|
135 |
demo = gr.Blocks(css=custom_css)
|
136 |
with demo:
|
137 |
gr.HTML(TITLE)
|
138 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
139 |
|
140 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
141 |
-
with gr.TabItem("🏅
|
142 |
with gr.Row():
|
143 |
with gr.Column():
|
144 |
with gr.Row():
|
@@ -151,52 +144,27 @@ with demo:
|
|
151 |
shown_columns = gr.CheckboxGroup(
|
152 |
choices=[
|
153 |
c.name
|
154 |
-
for c in fields(
|
155 |
if not c.hidden and not c.never_hidden
|
156 |
],
|
157 |
value=[
|
158 |
c.name
|
159 |
-
for c in fields(
|
160 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
161 |
],
|
162 |
label="Select columns to show",
|
163 |
elem_id="column-select",
|
164 |
interactive=True,
|
165 |
)
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
)
|
170 |
-
with gr.Column(min_width=320):
|
171 |
-
#with gr.Box(elem_id="box-filter"):
|
172 |
-
filter_columns_type = gr.CheckboxGroup(
|
173 |
-
label="Model types",
|
174 |
-
choices=[t.to_str() for t in ModelType],
|
175 |
-
value=[t.to_str() for t in ModelType],
|
176 |
-
interactive=True,
|
177 |
-
elem_id="filter-columns-type",
|
178 |
-
)
|
179 |
-
filter_columns_precision = gr.CheckboxGroup(
|
180 |
-
label="Precision",
|
181 |
-
choices=[i.value.name for i in Precision],
|
182 |
-
value=[i.value.name for i in Precision],
|
183 |
-
interactive=True,
|
184 |
-
elem_id="filter-columns-precision",
|
185 |
-
)
|
186 |
-
filter_columns_size = gr.CheckboxGroup(
|
187 |
-
label="Model sizes (in billions of parameters)",
|
188 |
-
choices=list(NUMERIC_INTERVALS.keys()),
|
189 |
-
value=list(NUMERIC_INTERVALS.keys()),
|
190 |
-
interactive=True,
|
191 |
-
elem_id="filter-columns-size",
|
192 |
-
)
|
193 |
-
|
194 |
leaderboard_table = gr.components.Dataframe(
|
195 |
value=leaderboard_df[
|
196 |
-
[c.name for c in fields(
|
197 |
+ shown_columns.value
|
198 |
],
|
199 |
-
headers=[c.name for c in fields(
|
200 |
datatype=TYPES,
|
201 |
elem_id="leaderboard-table",
|
202 |
interactive=False,
|
@@ -205,8 +173,8 @@ with demo:
|
|
205 |
|
206 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
207 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
208 |
-
value=original_df[
|
209 |
-
headers=
|
210 |
datatype=TYPES,
|
211 |
visible=False,
|
212 |
)
|
@@ -215,116 +183,86 @@ with demo:
|
|
215 |
[
|
216 |
hidden_leaderboard_table_for_search,
|
217 |
shown_columns,
|
218 |
-
filter_columns_type,
|
219 |
-
filter_columns_precision,
|
220 |
-
filter_columns_size,
|
221 |
-
deleted_models_visibility,
|
222 |
search_bar,
|
223 |
],
|
224 |
leaderboard_table,
|
225 |
)
|
226 |
-
for selector in [shown_columns
|
227 |
selector.change(
|
228 |
update_table,
|
229 |
[
|
230 |
hidden_leaderboard_table_for_search,
|
231 |
shown_columns,
|
232 |
-
filter_columns_type,
|
233 |
-
filter_columns_precision,
|
234 |
-
filter_columns_size,
|
235 |
-
deleted_models_visibility,
|
236 |
search_bar,
|
237 |
],
|
238 |
leaderboard_table,
|
239 |
queue=True,
|
240 |
)
|
241 |
|
242 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
243 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
244 |
-
|
245 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
246 |
with gr.Column():
|
247 |
with gr.Row():
|
248 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
249 |
|
250 |
-
with gr.Column():
|
251 |
-
with gr.Accordion(
|
252 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
253 |
-
open=False,
|
254 |
-
):
|
255 |
-
with gr.Row():
|
256 |
-
finished_eval_table = gr.components.Dataframe(
|
257 |
-
value=finished_eval_queue_df,
|
258 |
-
headers=EVAL_COLS,
|
259 |
-
datatype=EVAL_TYPES,
|
260 |
-
row_count=5,
|
261 |
-
)
|
262 |
-
with gr.Accordion(
|
263 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
264 |
-
open=False,
|
265 |
-
):
|
266 |
-
with gr.Row():
|
267 |
-
running_eval_table = gr.components.Dataframe(
|
268 |
-
value=running_eval_queue_df,
|
269 |
-
headers=EVAL_COLS,
|
270 |
-
datatype=EVAL_TYPES,
|
271 |
-
row_count=5,
|
272 |
-
)
|
273 |
-
|
274 |
-
with gr.Accordion(
|
275 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
276 |
-
open=False,
|
277 |
-
):
|
278 |
-
with gr.Row():
|
279 |
-
pending_eval_table = gr.components.Dataframe(
|
280 |
-
value=pending_eval_queue_df,
|
281 |
-
headers=EVAL_COLS,
|
282 |
-
datatype=EVAL_TYPES,
|
283 |
-
row_count=5,
|
284 |
-
)
|
285 |
with gr.Row():
|
286 |
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
287 |
|
288 |
with gr.Row():
|
289 |
with gr.Column():
|
|
|
|
|
|
|
|
|
290 |
model_name_textbox = gr.Textbox(label="Model name")
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
295 |
multiselect=False,
|
296 |
value=None,
|
297 |
interactive=True,
|
298 |
)
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
choices=[i.value.name for i in
|
303 |
-
label="
|
304 |
multiselect=False,
|
305 |
-
value=
|
306 |
interactive=True,
|
307 |
)
|
308 |
-
|
309 |
-
|
310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
multiselect=False,
|
312 |
-
value=
|
313 |
interactive=True,
|
314 |
)
|
315 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
316 |
|
317 |
submit_button = gr.Button("Submit Eval")
|
318 |
submission_result = gr.Markdown()
|
319 |
submit_button.click(
|
320 |
add_new_eval,
|
321 |
[
|
|
|
|
|
|
|
322 |
model_name_textbox,
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
|
|
|
|
328 |
],
|
329 |
submission_result,
|
330 |
)
|
|
|
11 |
INTRODUCTION_TEXT,
|
12 |
LLM_BENCHMARKS_TEXT,
|
13 |
TITLE,
|
14 |
+
nc_tasks
|
15 |
)
|
16 |
from src.display.css_html_js import custom_css
|
17 |
from src.display.utils import (
|
18 |
BENCHMARK_COLS,
|
19 |
COLS,
|
20 |
+
COLS_NC,
|
21 |
EVAL_COLS,
|
22 |
EVAL_TYPES,
|
23 |
NUMERIC_INTERVALS,
|
24 |
TYPES,
|
25 |
+
AutoEvalColumn_NodeClassification,
|
26 |
+
#AutoEvalColumn,
|
27 |
ModelType,
|
28 |
+
TASK_LIST,
|
29 |
+
OFFICIAL,
|
30 |
+
HONOR,
|
31 |
fields,
|
32 |
WeightType,
|
33 |
Precision
|
|
|
56 |
restart_space()
|
57 |
|
58 |
|
59 |
+
original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, nc_tasks)
|
60 |
leaderboard_df = original_df.copy()
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
# Searching and filtering
|
63 |
def update_table(
|
64 |
hidden_df: pd.DataFrame,
|
65 |
columns: list,
|
|
|
|
|
|
|
|
|
66 |
query: str,
|
67 |
):
|
68 |
+
#filtered_df = filter_models(hidden_df, size_query, show_deleted)
|
69 |
+
filtered_df = filter_queries(query, hidden_df)
|
70 |
df = select_columns(filtered_df, columns)
|
71 |
return df
|
72 |
|
73 |
|
74 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose model name contains ``query``.

    The match is a case-insensitive, literal substring test against the
    column named by ``AutoEvalColumn_NodeClassification.model.name``.

    Args:
        df: Leaderboard frame containing the model-name column.
        query: Text to look for; it is treated as plain text, not a regex.

    Returns:
        The subset of ``df`` whose model name contains ``query``.
    """
    model_names = df[AutoEvalColumn_NodeClassification.model.name]
    # regex=False: characters such as "(" or "+" in the query must not be
    # parsed as a regular expression (an unbalanced "(" would raise re.error).
    # na=False: rows with a missing model name count as "no match" instead of
    # yielding NaN entries, which cannot be used as a boolean mask.
    matches = model_names.str.contains(query, case=False, regex=False, na=False)
    return df[matches]
|
76 |
|
77 |
|
78 |
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return ``df`` restricted to the model column plus the selected columns.

    Args:
        df: Leaderboard frame to take columns from.
        columns: Names of the optional columns that should be shown.

    Returns:
        A frame holding the model column first, followed by every entry of
        ``COLS_NC`` that exists in ``df`` and was requested in ``columns``.
    """
    # Use the column definition instead of a hard-coded "Model" literal so this
    # stays in sync with the other helpers that key on the model column.
    always_here_cols = [AutoEvalColumn_NodeClassification.model.name]
    requested = set(columns)
    # We iterate over COLS_NC (not ``columns``) to maintain the canonical
    # column order; the always-present column is skipped so it is never
    # selected twice when it also appears in ``columns``.
    optional_cols = [
        c
        for c in COLS_NC
        if c in df.columns and c in requested and c not in always_here_cols
    ]
    return df[always_here_cols + optional_cols]
|
87 |
|
|
|
99 |
if len(final_df) > 0:
|
100 |
filtered_df = pd.concat(final_df)
|
101 |
filtered_df = filtered_df.drop_duplicates(
|
102 |
+
subset=[AutoEvalColumn_NodeClassification.model.name]
|
103 |
)
|
104 |
|
105 |
return filtered_df
|
106 |
|
107 |
|
108 |
def filter_models(
|
109 |
+
df: pd.DataFrame, size_query: list, show_deleted: bool
|
110 |
) -> pd.DataFrame:
|
111 |
# Show all models
|
112 |
if show_deleted:
|
113 |
filtered_df = df
|
114 |
else: # Show only still on the hub models
|
115 |
+
filtered_df = df[df[AutoEvalColumn_NodeClassification.still_on_hub.name] == True]
|
116 |
|
117 |
+
#type_emoji = [t[0] for t in type_query]
|
118 |
+
#filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
119 |
+
#filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
120 |
|
121 |
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
122 |
+
params_column = pd.to_numeric(df[AutoEvalColumn_NodeClassification.params.name], errors="coerce")
|
123 |
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
124 |
filtered_df = filtered_df.loc[mask]
|
125 |
|
126 |
return filtered_df
|
127 |
|
|
|
128 |
demo = gr.Blocks(css=custom_css)
|
129 |
with demo:
|
130 |
gr.HTML(TITLE)
|
131 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
132 |
|
133 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
134 |
+
with gr.TabItem("🏅 Node Classification Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
|
135 |
with gr.Row():
|
136 |
with gr.Column():
|
137 |
with gr.Row():
|
|
|
144 |
shown_columns = gr.CheckboxGroup(
|
145 |
choices=[
|
146 |
c.name
|
147 |
+
for c in fields(AutoEvalColumn_NodeClassification)
|
148 |
if not c.hidden and not c.never_hidden
|
149 |
],
|
150 |
value=[
|
151 |
c.name
|
152 |
+
for c in fields(AutoEvalColumn_NodeClassification)
|
153 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
154 |
],
|
155 |
label="Select columns to show",
|
156 |
elem_id="column-select",
|
157 |
interactive=True,
|
158 |
)
|
159 |
+
|
160 |
+
print(leaderboard_df)
|
161 |
+
print(fields(AutoEvalColumn_NodeClassification))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
leaderboard_table = gr.components.Dataframe(
|
163 |
value=leaderboard_df[
|
164 |
+
[c.name for c in fields(AutoEvalColumn_NodeClassification) if c.never_hidden]
|
165 |
+ shown_columns.value
|
166 |
],
|
167 |
+
headers=[c.name for c in fields(AutoEvalColumn_NodeClassification) if c.never_hidden] + shown_columns.value,
|
168 |
datatype=TYPES,
|
169 |
elem_id="leaderboard-table",
|
170 |
interactive=False,
|
|
|
173 |
|
174 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
175 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
176 |
+
value=original_df[COLS_NC],
|
177 |
+
headers=COLS_NC,
|
178 |
datatype=TYPES,
|
179 |
visible=False,
|
180 |
)
|
|
|
183 |
[
|
184 |
hidden_leaderboard_table_for_search,
|
185 |
shown_columns,
|
|
|
|
|
|
|
|
|
186 |
search_bar,
|
187 |
],
|
188 |
leaderboard_table,
|
189 |
)
|
190 |
+
for selector in [shown_columns]:
|
191 |
selector.change(
|
192 |
update_table,
|
193 |
[
|
194 |
hidden_leaderboard_table_for_search,
|
195 |
shown_columns,
|
|
|
|
|
|
|
|
|
196 |
search_bar,
|
197 |
],
|
198 |
leaderboard_table,
|
199 |
queue=True,
|
200 |
)
|
201 |
|
|
|
|
|
|
|
202 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
203 |
with gr.Column():
|
204 |
with gr.Row():
|
205 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
with gr.Row():
|
208 |
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
209 |
|
210 |
with gr.Row():
|
211 |
with gr.Column():
|
212 |
+
author_name_textbox = gr.Textbox(label="Your name")
|
213 |
+
email_textbox = gr.Textbox(label="Your email")
|
214 |
+
relbench_version_textbox = gr.Textbox(label="RelBench version")
|
215 |
+
|
216 |
model_name_textbox = gr.Textbox(label="Model name")
|
217 |
+
|
218 |
+
'''
|
219 |
+
dataset_name_textbox = gr.Dropdown(
|
220 |
+
choices=[t.value.name for t in TASK_LIST],
|
221 |
+
label="Task name (e.g. rel-amazon-user-churn)",
|
222 |
multiselect=False,
|
223 |
value=None,
|
224 |
interactive=True,
|
225 |
)
|
226 |
+
'''
|
227 |
+
|
228 |
+
official_or_not = gr.Dropdown(
|
229 |
+
choices=[i.value.name for i in OFFICIAL],
|
230 |
+
label="Is it an official submission?",
|
231 |
multiselect=False,
|
232 |
+
value=None,
|
233 |
interactive=True,
|
234 |
)
|
235 |
+
paper_url_textbox = gr.Textbox(label="Paper URL Link")
|
236 |
+
github_url_textbox = gr.Textbox(label="GitHub URL Link")
|
237 |
+
|
238 |
+
with gr.Column():
|
239 |
+
test_performance = gr.TextArea(label="Test set performance, use {task: [mean,std]} format e.g. {'rel-amazon/user-churn': [0.352,0.023], 'rel-amazon/user-ltv': [0.304,0.022], ...}")
|
240 |
+
valid_performance = gr.TextArea(label="Validation set performance, use {task: [mean,std]} format e.g. {'rel-amazon/user-churn': [0.352,0.023], 'rel-amazon/user-ltv': [0.304,0.022], ...}")
|
241 |
+
parameters_textbox = gr.Textbox(label="Number of parameters")
|
242 |
+
honor_code = gr.Dropdown(
|
243 |
+
choices=[i.value.name for i in HONOR],
|
244 |
+
label="Click here to agree to the honor code",
|
245 |
multiselect=False,
|
246 |
+
value=None,
|
247 |
interactive=True,
|
248 |
)
|
|
|
249 |
|
250 |
submit_button = gr.Button("Submit Eval")
|
251 |
submission_result = gr.Markdown()
|
252 |
submit_button.click(
|
253 |
add_new_eval,
|
254 |
[
|
255 |
+
author_name_textbox,
|
256 |
+
email_textbox,
|
257 |
+
relbench_version_textbox,
|
258 |
model_name_textbox,
|
259 |
+
official_or_not,
|
260 |
+
test_performance,
|
261 |
+
valid_performance,
|
262 |
+
paper_url_textbox,
|
263 |
+
github_url_textbox,
|
264 |
+
parameters_textbox,
|
265 |
+
honor_code,
|
266 |
],
|
267 |
submission_result,
|
268 |
)
|
src/about.py
CHANGED
@@ -18,55 +18,57 @@ class Tasks(Enum):
|
|
18 |
NUM_FEWSHOT = 0 # Change with your few shot
|
19 |
# ---------------------------------------------------
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Your leaderboard name
|
24 |
-
TITLE = """<
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
28 |
-
|
29 |
"""
|
30 |
|
31 |
# Which evaluations are you running? how can people reproduce what you have?
|
32 |
LLM_BENCHMARKS_TEXT = f"""
|
33 |
-
##
|
34 |
-
|
35 |
-
## Reproducibility
|
36 |
-
To reproduce our results, here is the commands you can run:
|
37 |
-
|
38 |
"""
|
39 |
|
40 |
EVALUATION_QUEUE_TEXT = """
|
41 |
-
##
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
### 3) Make sure your model has an open license!
|
59 |
-
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
|
60 |
-
|
61 |
-
### 4) Fill up your model card
|
62 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
63 |
-
|
64 |
-
## In case of model failure
|
65 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
66 |
-
Make sure you have followed the above steps first.
|
67 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
68 |
"""
|
69 |
|
70 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
71 |
CITATION_BUTTON_TEXT = r"""
|
|
|
|
|
|
|
|
|
|
|
72 |
"""
|
|
|
18 |
NUM_FEWSHOT = 0 # Change with your few shot
|
19 |
# ---------------------------------------------------
|
20 |
|
21 |
+
class nc_tasks(Enum):
|
22 |
+
task0 = Task("rel-amazon/user-churn", "auroc", "user-churn")
|
23 |
+
task1 = Task("rel-amazon/item-churn", "auroc", "item-churn")
|
24 |
+
task2 = Task("rel-avito/user-clicks", "auroc", "user-clicks")
|
25 |
+
task3 = Task("rel-avito/user-visits", "auroc", "user-visits")
|
26 |
+
task4 = Task("rel-hm/user-churn", "auroc", "hm-user-churn")
|
27 |
+
task5 = Task("rel-stack/user-badge", "auroc", "user-badge")
|
28 |
+
task6 = Task("rel-stack/user-engagement", "auroc", "user-engagement")
|
29 |
+
task7 = Task("rel-f1/driver-dnf", "auroc", "driver-dnf")
|
30 |
+
task8 = Task("rel-f1/driver-top3", "auroc", "driver-top3")
|
31 |
+
task9 = Task("rel-trial/study-outcome", "auroc", "study-outcome")
|
32 |
+
task10 = Task("rel-event/user-repeat", "auroc", "user-repeat")
|
33 |
+
task11 = Task("rel-event/user-ignore", "auroc", "user-ignore")
|
34 |
|
35 |
# Your leaderboard name
|
36 |
+
TITLE = """<p align="center"><img src="https://relbench.stanford.edu/img/logo.png" alt="logo" width="400px" /></p>"""
|
37 |
|
38 |
# What does your leaderboard evaluate?
|
39 |
INTRODUCTION_TEXT = """
|
40 |
+
Relational Deep Learning is a new approach for end-to-end representation learning on data spread across multiple tables, such as in a relational database (see our vision paper). RelBench is the accompanying benchmark, which seeks to facilitate efficient, robust and reproducible research in this direction. It comprises a collection of realistic, large-scale, and diverse datasets structured as relational tables, along with machine learning tasks defined on them. It provides full support for data downloading, task specification and standardized evaluation in an ML-framework-agnostic manner. Additionally, there is seamless integration with PyTorch Geometric to load the data as a graph and train GNN models, and with PyTorch Frame to encode the various types of table columns. Finally, there is a leaderboard for tracking progress.
|
41 |
"""
|
42 |
|
43 |
# Which evaluations are you running? how can people reproduce what you have?
|
44 |
LLM_BENCHMARKS_TEXT = f"""
|
45 |
+
## Overview of RelBench
|
|
|
|
|
|
|
|
|
46 |
"""
|
47 |
|
48 |
EVALUATION_QUEUE_TEXT = """
|
49 |
+
## Instructions for submitting your model
|
50 |
+
|
51 |
+
Once you have developed your model and got results, you can submit your test results to our leaderboards. For each dataset, we require you to submit the following information.
|
52 |
+
|
53 |
+
- **Your name**: Primary contact's name
|
54 |
+
- **Your email**: Primary contact's email
|
55 |
+
- **RelBench version**: The RelBench version used to conduct the experiments.
|
56 |
+
- **Model name**: The name of the method. This is a unique identifier for the model, so please choose a name that is distinct from all existing model names. An existing entry will be overridden if the same model name is submitted again.
|
57 |
+
- **Task name**: The name of a RelBench dataset that you use to evaluate the method. Choose from the dropdown menus.
|
58 |
+
- **Is it an official submission**: Whether the implementation is official (implementation by authors who proposed the method) or unofficial (re-implementation of the method by non-authors).
|
59 |
+
- **Test performance**: Raw test performance output by RelBench model evaluators, where the average and unbiased standard deviation must be taken over 5 different random seeds. You can either not fix random seeds at all, or use the random seeds from 0 to 4. We strongly discourage tuning the random seeds.
|
60 |
+
- **Validation performance**: Validation performance of the model that is used to report the test performance above.
|
61 |
+
- **Paper URL Link**: The original paper describing the method (an arXiv link is recommended; the paper need not be peer-reviewed). If your method has any original component (e.g., even just combining existing methods XXX and YYY), you have to write a technical report describing it (e.g., how exactly you combined XXX and YYY).
|
62 |
+
- **GitHub URL Link**: The GitHub repository or directory containing all code to reproduce the result. A placeholder repository is not allowed.
|
63 |
+
- **Number of Parameters**: The number of parameters of your model, which can be calculated by sum(p.numel() for p in model.parameters()). If you use multi-stage training (e.g., apply node2vec and then MLP), please sum up all the parameters (both node2vec and MLP parameters).
|
64 |
+
- **Honor code**: Please acknowledge that your submission adheres to all the ethical policies and your result is reproducible.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
"""
|
66 |
|
67 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
68 |
CITATION_BUTTON_TEXT = r"""
|
69 |
+
@article{relbench,
|
70 |
+
title={Relational Deep Learning: Graph Representation Learning on Relational Tables},
|
71 |
+
author={Matthias Fey and Weihua Hu and Kexin Huang and Jan Eric Lenssen and Rishabh Ranjan and Joshua Robinson and Rex Ying and Jiaxuan You and Jure Leskovec},
|
72 |
+
year={2023}
|
73 |
+
}
|
74 |
"""
|
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
-
from src.about import Tasks
|
7 |
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
@@ -43,6 +43,21 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
|
|
43 |
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
## For the queue columns in the submission tab
|
47 |
@dataclass(frozen=True)
|
48 |
class EvalQueueColumn: # Queue column
|
@@ -83,6 +98,58 @@ class ModelType(Enum):
|
|
83 |
return ModelType.IFT
|
84 |
return ModelType.Unknown
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
class WeightType(Enum):
|
87 |
Adapter = ModelDetails("Adapter")
|
88 |
Original = ModelDetails("Original")
|
@@ -114,6 +181,7 @@ class Precision(Enum):
|
|
114 |
|
115 |
# Column selection
|
116 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
|
|
117 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
118 |
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
119 |
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
+
from src.about import Tasks, nc_tasks
|
7 |
|
8 |
def fields(raw_class):
    """Return the values of the non-dunder attributes defined on ``raw_class``.

    Every entry of ``raw_class.__dict__`` whose key starts or ends with a
    double underscore is skipped; the remaining values are returned in the
    order in which they appear in the class dictionary.

    Args:
        raw_class: The class whose own attributes should be listed.

    Returns:
        A list with the attribute values (not the attribute names).
    """
    return [
        value
        for key, value in vars(raw_class).items()
        if not key.startswith("__") and not key.endswith("__")
    ]
|
|
|
43 |
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
45 |
|
46 |
+
|
47 |
+
auto_eval_column_dict_nc = []
|
48 |
+
auto_eval_column_dict_nc.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
49 |
+
auto_eval_column_dict_nc.append(["average_rank", ColumnContent, ColumnContent("Average Rank⬆️", "number", True)])
|
50 |
+
for task in nc_tasks:
|
51 |
+
auto_eval_column_dict_nc.append(['_'.join(task.value.col_name.split('-')), ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
52 |
+
auto_eval_column_dict_nc.append(["author", ColumnContent, ColumnContent("Author", "markdown", True, never_hidden=False)])
|
53 |
+
auto_eval_column_dict_nc.append(["email", ColumnContent, ColumnContent("Email", "markdown", True, never_hidden=False)])
|
54 |
+
auto_eval_column_dict_nc.append(["Paper_URL", ColumnContent, ColumnContent("Paper URL", "markdown", True, never_hidden=False)])
|
55 |
+
auto_eval_column_dict_nc.append(["Github_URL", ColumnContent, ColumnContent("Github URL", "markdown", True, never_hidden=False)])
|
56 |
+
auto_eval_column_dict_nc.append(["Time", ColumnContent, ColumnContent("Time", "markdown", True, never_hidden=False)])
|
57 |
+
auto_eval_column_dict_nc.append(["num_of_Params", ColumnContent, ColumnContent("# of Params", "markdown", True, never_hidden=False)])
|
58 |
+
|
59 |
+
AutoEvalColumn_NodeClassification = make_dataclass("AutoEvalColumn_NodeClassification", auto_eval_column_dict_nc, frozen=True)
|
60 |
+
|
61 |
## For the queue columns in the submission tab
|
62 |
@dataclass(frozen=True)
|
63 |
class EvalQueueColumn: # Queue column
|
|
|
98 |
return ModelType.IFT
|
99 |
return ModelType.Unknown
|
100 |
|
101 |
+
class OFFICIAL(Enum):
|
102 |
+
official = ModelDetails("Official")
|
103 |
+
unofficial = ModelDetails("Unofficial")
|
104 |
+
|
105 |
+
class HONOR(Enum):
|
106 |
+
yes = ModelDetails("Yes")
|
107 |
+
no = ModelDetails("No")
|
108 |
+
|
109 |
+
class TASK_LIST(Enum):
|
110 |
+
amazon_user_churn = ModelDetails("rel-amazon-user-churn")
|
111 |
+
amazon_item_churn = ModelDetails("rel-amazon-item-churn")
|
112 |
+
amazon_user_ltv = ModelDetails("rel-amazon-user-ltv")
|
113 |
+
amazon_item_ltv = ModelDetails("rel-amazon-item-ltv")
|
114 |
+
amazon_user_item_purchase = ModelDetails("rel-amazon-user-item-purchase")
|
115 |
+
amazon_user_item_rate = ModelDetails("rel-amazon-user-item-rate")
|
116 |
+
amazon_user_item_review = ModelDetails("rel-amazon-user-item-review")
|
117 |
+
|
118 |
+
# rel-stack
|
119 |
+
stack_user_engagement = ModelDetails("rel-stack-user-engagement")
|
120 |
+
stack_user_badge = ModelDetails("rel-stack-user-badge")
|
121 |
+
stack_post_votes = ModelDetails("rel-stack-post-votes")
|
122 |
+
stack_user_post_comment = ModelDetails("rel-stack-user-post-comment")
|
123 |
+
stack_user_post_related = ModelDetails("rel-stack-user-post-related")
|
124 |
+
|
125 |
+
# rel-trial
|
126 |
+
trial_study_outcome = ModelDetails("rel-trial-study-outcome")
|
127 |
+
trial_study_adverse = ModelDetails("rel-trial-study-adverse")
|
128 |
+
trial_site_success = ModelDetails("rel-trial-site-success")
|
129 |
+
trial_condition_sponsor_run = ModelDetails("rel-trial-condition-sponsor-run")
|
130 |
+
trial_site_sponsor_run = ModelDetails("rel-trial-site-sponsor-run")
|
131 |
+
|
132 |
+
# rel-f1
|
133 |
+
f1_driver_position = ModelDetails("rel-f1-driver-position")
|
134 |
+
f1_driver_dnf = ModelDetails("rel-f1-driver-dnf")
|
135 |
+
f1_driver_top3 = ModelDetails("rel-f1-driver-top3")
|
136 |
+
|
137 |
+
# rel-hm
|
138 |
+
hm_user_churn = ModelDetails("rel-hm-user-churn")
|
139 |
+
hm_item_sales = ModelDetails("rel-hm-item-sales")
|
140 |
+
hm_user_item_purchase = ModelDetails("rel-hm-user-item-purchase")
|
141 |
+
|
142 |
+
# rel-event
|
143 |
+
event_user_repeat = ModelDetails("rel-event-user-repeat")
|
144 |
+
event_user_ignore = ModelDetails("rel-event-user-ignore")
|
145 |
+
event_user_attendance = ModelDetails("rel-event-user-attendance")
|
146 |
+
|
147 |
+
# rel-avito
|
148 |
+
avito_user_visits = ModelDetails("rel-avito-user-visits")
|
149 |
+
avito_user_clicks = ModelDetails("rel-avito-user-clicks")
|
150 |
+
avito_ads_clicks = ModelDetails("rel-avito-ads-clicks")
|
151 |
+
avito_user_ad_visit = ModelDetails("rel-avito-user-ad-visit")
|
152 |
+
|
153 |
class WeightType(Enum):
|
154 |
Adapter = ModelDetails("Adapter")
|
155 |
Original = ModelDetails("Original")
|
|
|
181 |
|
182 |
# Column selection
|
183 |
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
184 |
+
COLS_NC = [c.name for c in fields(AutoEvalColumn_NodeClassification) if not c.hidden]
|
185 |
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
|
186 |
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
187 |
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
|
src/envs.py
CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("TOKEN") # A read/write token for your org
|
8 |
|
9 |
-
OWNER = "
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
|
|
6 |
# ----------------------------------
|
7 |
TOKEN = os.environ.get("TOKEN") # A read/write token for your org
|
8 |
|
9 |
+
OWNER = "relbench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
10 |
# ----------------------------------
|
11 |
|
12 |
REPO_ID = f"{OWNER}/leaderboard"
|
src/populate.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
-
|
4 |
import pandas as pd
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
@@ -8,19 +8,55 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
|
10 |
|
|
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
all_data_json = [v.to_dict() for v in raw_data]
|
15 |
|
16 |
df = pd.DataFrame.from_records(all_data_json)
|
17 |
-
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
18 |
-
df = df[cols].round(decimals=2)
|
19 |
|
20 |
# filter out if any of the benchmarks have not been produced
|
21 |
-
df = df[has_no_nan_values(df, benchmark_cols)]
|
22 |
return raw_data, df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
26 |
"""Creates the different dataframes for the evaluation queues requestes"""
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
from ast import literal_eval
|
4 |
import pandas as pd
|
5 |
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
|
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
|
10 |
|
11 |
+
'''
|
12 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
13 |
"""Creates a dataframe from all the individual experiment results"""
|
14 |
raw_data = get_raw_eval_results(results_path, requests_path)
|
15 |
all_data_json = [v.to_dict() for v in raw_data]
|
16 |
|
17 |
df = pd.DataFrame.from_records(all_data_json)
|
18 |
+
#df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
19 |
+
#df = df[cols].round(decimals=2)
|
20 |
|
21 |
# filter out if any of the benchmarks have not been produced
|
22 |
+
#df = df[has_no_nan_values(df, benchmark_cols)]
|
23 |
return raw_data, df
|
24 |
+
'''
|
25 |
+
|
26 |
+
def _collect_result_files(requests_path):
    """Return the paths of every result file found under ``requests_path``."""
    paths = []
    for root, _, files in os.walk(requests_path):
        # A directory is only used when it holds nothing but ``.json`` files;
        # empty directories and directories with any other file are skipped.
        if not files or not all(name.endswith(".json") for name in files):
            continue
        paths.extend(os.path.join(root, name) for name in files)
    return paths


def _to_float_or_nan(value):
    """Convert a reported score to ``float``; return NaN when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return float("nan")


def get_leaderboard_df(EVAL_REQUESTS_PATH, tasks) -> pd.DataFrame:
    """Build the leaderboard table from the submission JSON files.

    Args:
        EVAL_REQUESTS_PATH: Directory that is walked recursively for
            submission files (one JSON document per file).
        tasks: Iterable whose items expose ``.value.benchmark`` (the key used
            inside a submission's ``test`` mapping) and ``.value.col_name``
            (the column title shown for that benchmark).

    Returns:
        One row per submission with the columns ``Model``, ``Author``,
        ``Email``, ``Paper URL``, ``Github URL``, ``Time``, ``# of Params``,
        one ``"mean±std"`` column per task (``'-'`` when the submission has no
        result for it) and ``Average Rank⬆️``, sorted by that average rank in
        ascending order. An empty frame with the same columns is returned
        when no submission file is found.

    Raises:
        KeyError: If a submission lacks one of the required fields.
        ValueError: If ``test``/``valid`` is not a valid Python literal or
            ``params`` is not an integer.
    """
    model_res = []
    for path in _collect_result_files(EVAL_REQUESTS_PATH):
        with open(path, encoding="utf-8") as f:
            model_res.append(json.load(f))

    for model in model_res:
        # ``test``/``valid`` are stored as the text of a Python literal.
        model["test"] = literal_eval(model["test"])
        model["valid"] = literal_eval(model["valid"])
        model["params"] = int(model["params"])
        # Keep only the date part of the ISO timestamp.
        model["submitted_time"] = model["submitted_time"].split("T")[0]

    name2short_name = {task.value.benchmark: task.value.col_name for task in tasks}
    short_names = list(name2short_name.values())

    # Numeric test means are collected alongside the display strings so the
    # ranking works on numbers; ranking the "mean±std" strings would order
    # them lexicographically and would rank the '-' placeholder as a score.
    score_rows = []
    for model in model_res:
        scores = {}
        for benchmark, short_name in name2short_name.items():
            if benchmark in model["test"]:
                mean = model["test"][benchmark][0]
                std = model["test"][benchmark][1]
                # NOTE: the display value is truncated to 4 characters, not rounded.
                model[short_name] = str(mean)[:4] + "±" + str(std)[:4]
                scores[short_name] = _to_float_or_nan(mean)
            else:
                model[short_name] = "-"
                scores[short_name] = float("nan")
        score_rows.append(scores)

    columns_to_show = [
        "model",
        "author",
        "email",
        "paper_url",
        "github_url",
        "submitted_time",
        "params",
    ] + short_names
    df_res = pd.DataFrame(
        [{col: model[col] for col in columns_to_show} for model in model_res],
        columns=columns_to_show,
    )

    # Default ``rank()`` is ascending (smallest value gets rank 1) and leaves
    # NaN unranked, so a missing task does not count towards the average.
    # NOTE(review): the same direction is used for every task; confirm this
    # matches each task's metric (higher- vs lower-is-better).
    ranks = pd.DataFrame(score_rows, columns=short_names, dtype=float).rank()

    df_res = df_res.rename(
        columns={
            "model": "Model",
            "author": "Author",
            "email": "Email",
            "paper_url": "Paper URL",
            "github_url": "Github URL",
            "submitted_time": "Time",
            "params": "# of Params",
        }
    )
    df_res["Average Rank⬆️"] = ranks.mean(axis=1)
    return df_res.sort_values(by="Average Rank⬆️", ascending=True)
|
60 |
|
61 |
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
62 |
"""Creates the different dataframes for the evaluation queues requestes"""
|
src/submission/check_validity.py
CHANGED
@@ -88,7 +88,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
88 |
continue
|
89 |
with open(os.path.join(root, file), "r") as f:
|
90 |
info = json.load(f)
|
91 |
-
file_names.append(f"{info['model']}
|
92 |
|
93 |
# Select organisation
|
94 |
if info["model"].count("/") == 0 or "submitted_time" not in info:
|
|
|
88 |
continue
|
89 |
with open(os.path.join(root, file), "r") as f:
|
90 |
info = json.load(f)
|
91 |
+
file_names.append(f"{info['model']}")
|
92 |
|
93 |
# Select organisation
|
94 |
if info["model"].count("/") == 0 or "submitted_time" not in info:
|
src/submission/submit.py
CHANGED
@@ -14,93 +14,77 @@ from src.submission.check_validity import (
|
|
14 |
REQUESTED_MODELS = None
|
15 |
USERS_TO_SUBMISSION_DATES = None
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
def add_new_eval(
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
24 |
):
|
25 |
global REQUESTED_MODELS
|
26 |
global USERS_TO_SUBMISSION_DATES
|
27 |
if not REQUESTED_MODELS:
|
28 |
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
29 |
|
30 |
-
user_name = ""
|
31 |
model_path = model
|
32 |
-
if "/" in model:
|
33 |
-
user_name = model.split("/")[0]
|
34 |
-
model_path = model.split("/")[1]
|
35 |
|
36 |
-
precision = precision.split(" ")[0]
|
37 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
38 |
|
39 |
-
|
40 |
-
return styled_error("Please select a model type.")
|
41 |
-
|
42 |
-
# Does the model actually exist?
|
43 |
-
if revision == "":
|
44 |
-
revision = "main"
|
45 |
-
|
46 |
-
# Is the model on the hub?
|
47 |
-
if weight_type in ["Delta", "Adapter"]:
|
48 |
-
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
49 |
-
if not base_model_on_hub:
|
50 |
-
return styled_error(f'Base model "{base_model}" {error}')
|
51 |
-
|
52 |
-
if not weight_type == "Adapter":
|
53 |
-
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
54 |
-
if not model_on_hub:
|
55 |
-
return styled_error(f'Model "{model}" {error}')
|
56 |
-
|
57 |
-
# Is the model info correctly filled?
|
58 |
-
try:
|
59 |
-
model_info = API.model_info(repo_id=model, revision=revision)
|
60 |
-
except Exception:
|
61 |
-
return styled_error("Could not get your model information. Please fill it up properly.")
|
62 |
-
|
63 |
-
model_size = get_model_size(model_info=model_info, precision=precision)
|
64 |
-
|
65 |
-
# Were the model card and license filled?
|
66 |
-
try:
|
67 |
-
license = model_info.cardData["license"]
|
68 |
-
except Exception:
|
69 |
-
return styled_error("Please select a license for your model")
|
70 |
-
|
71 |
-
modelcard_OK, error_msg = check_model_card(model)
|
72 |
-
if not modelcard_OK:
|
73 |
-
return styled_error(error_msg)
|
74 |
|
75 |
# Seems good, creating the eval
|
76 |
print("Adding new eval")
|
77 |
|
78 |
eval_entry = {
|
79 |
"model": model,
|
80 |
-
"
|
81 |
-
"
|
82 |
-
"
|
83 |
-
"
|
|
|
|
|
|
|
|
|
|
|
84 |
"status": "PENDING",
|
85 |
"submitted_time": current_time,
|
86 |
-
"model_type": model_type,
|
87 |
-
"likes": model_info.likes,
|
88 |
"params": model_size,
|
89 |
-
"license": license,
|
90 |
"private": False,
|
91 |
}
|
92 |
|
93 |
-
# Check for duplicate submission
|
94 |
-
if f"{model}_{
|
95 |
-
|
96 |
|
97 |
print("Creating eval file")
|
98 |
-
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{
|
99 |
os.makedirs(OUT_DIR, exist_ok=True)
|
100 |
-
out_path = f"{OUT_DIR}/{model_path}
|
101 |
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
104 |
|
105 |
print("Uploading eval file")
|
106 |
API.upload_file(
|
|
|
14 |
REQUESTED_MODELS = None
|
15 |
USERS_TO_SUBMISSION_DATES = None
|
16 |
|
17 |
+
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that stringifies values the stock encoder rejects."""

    def default(self, obj):
        """Return a serializable stand-in for ``obj``.

        The parent implementation is consulted first; when it raises
        ``TypeError`` the object's ``str()`` form is used instead.
        """
        try:
            encoded = super().default(obj)
        except TypeError:
            # Fall back to the textual representation of the object.
            encoded = str(obj)
        return encoded
|
23 |
+
|
24 |
+
def add_new_eval_json(eval_entry, out_path):
    """Serialize ``eval_entry`` as JSON into the file at ``out_path``.

    The file is created or overwritten. Values that ``json`` cannot encode
    are handled by ``CustomJSONEncoder``.
    """
    with open(out_path, "w") as out_file:
        json.dump(eval_entry, out_file, cls=CustomJSONEncoder)
|
27 |
+
|
28 |
def add_new_eval(
|
29 |
+
author,
|
30 |
+
email,
|
31 |
+
relbench_version,
|
32 |
+
model,
|
33 |
+
official_or_not,
|
34 |
+
test_performance,
|
35 |
+
valid_performance,
|
36 |
+
paper_url,
|
37 |
+
github_url,
|
38 |
+
parameters,
|
39 |
+
honor_code,
|
40 |
):
|
41 |
global REQUESTED_MODELS
|
42 |
global USERS_TO_SUBMISSION_DATES
|
43 |
if not REQUESTED_MODELS:
|
44 |
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
45 |
|
|
|
46 |
model_path = model
|
|
|
|
|
|
|
47 |
|
48 |
+
#precision = precision.split(" ")[0]
|
49 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
50 |
|
51 |
+
model_size = parameters
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Seems good, creating the eval
|
54 |
print("Adding new eval")
|
55 |
|
56 |
eval_entry = {
|
57 |
"model": model,
|
58 |
+
"author": author,
|
59 |
+
"email": email,
|
60 |
+
"relbench_version": relbench_version,
|
61 |
+
"official_or_not": official_or_not,
|
62 |
+
"test": test_performance,
|
63 |
+
"valid": valid_performance,
|
64 |
+
"paper_url": paper_url,
|
65 |
+
"github_url": github_url,
|
66 |
+
"honor_code": honor_code,
|
67 |
"status": "PENDING",
|
68 |
"submitted_time": current_time,
|
|
|
|
|
69 |
"params": model_size,
|
|
|
70 |
"private": False,
|
71 |
}
|
72 |
|
73 |
+
# TODO: Check for duplicate submission
|
74 |
+
#if f"{model}_{author}_{precision}" in REQUESTED_MODELS:
|
75 |
+
# return styled_warning("This model has been already submitted.")
|
76 |
|
77 |
print("Creating eval file")
|
78 |
+
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{model}"
|
79 |
os.makedirs(OUT_DIR, exist_ok=True)
|
80 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False.json"
|
81 |
|
82 |
+
print(eval_entry)
|
83 |
+
|
84 |
+
#with open(out_path, "w") as f:
|
85 |
+
# f.write(json.dumps(eval_entry))
|
86 |
+
|
87 |
+
add_new_eval_json(eval_entry, out_path)
|
88 |
|
89 |
print("Uploading eval file")
|
90 |
API.upload_file(
|