Terry Zhuo committed
Commit faf6544 • 1 Parent(s): 67bdb6e
add full results back
app.py CHANGED
@@ -150,26 +150,26 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def get_latest_data_leaderboard(
+    leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
+    elo_task_df = None,
+    elo_bench_df = None,
     hard_elo_task_df = None,
     hard_elo_bench_df = None,
+    complete_solve_df = None,
+    instruct_solve_df = None,
     hard_complete_solve_df = None,
     hard_instruct_solve_df = None
 ):
     global NEW_DATA_ON_LEADERBOARD
+    global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
@@ -183,10 +183,10 @@ def get_latest_data_leaderboard(
             download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
             verification_mode="no_checks"
         )
+        LEADERBOARD_DF = get_leaderboard_df(
+            leaderboard_dataset=leaderboard_dataset,
+            cols=COLS,
+        )
         hard_leaderboard_dataset = datasets.load_dataset(
             HARD_RESULT_REPO,
             "default",
@@ -201,24 +201,24 @@ def get_latest_data_leaderboard(
         )
         HARD_LEADERBOARD_DF = hard_leaderboard_df
 
+        elo_task_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="task_no_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        elo_bench_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="benchmark_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        ELO_TASK_DF = elo_task_df
+        ELO_BENCH_DF = elo_bench_df
 
         hard_elo_task_df = datasets.load_dataset(
             HARD_ELO_REPO,
@@ -239,24 +239,24 @@ def get_latest_data_leaderboard(
         HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
 
+        complete_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="complete",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        instruct_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="instruct",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        COMPLETE_SOLVE_DF = complete_solve_df
+        INSTRUCT_SOLVE_DF = instruct_solve_df
 
         hard_complete_solve_df = datasets.load_dataset(
             HARD_SOLVE_REPO,
@@ -280,41 +280,41 @@ def get_latest_data_leaderboard(
         NEW_DATA_ON_LEADERBOARD = False
 
     else:
-        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        LEADERBOARD_DF = leaderboard_initial_df
+        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        ELO_TASK_DF = elo_task_df
         # ELO_BENCH_DF = elo_bench_df
-        HARD_ELO_TASK_DF = hard_elo_task_df
+        # HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
+        COMPLETE_SOLVE_DF = complete_solve_df
         # INSTRUCT_SOLVE_DF = instruct_solve_df
-        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
 
     # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF
    global HARD_INSTRUCT_SOLVE_DF
 
-    HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 # Initialize VoteManager
 # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
@@ -382,104 +382,104 @@ with main_block as demo:
 
     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.Tab("💎 Hard Set") as hard_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - For the efficiency reasons, we only display the Hard Set leaderboard.
+                    - _Hard Set_ vs _Full Set_:
+                        - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                        - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
+                    - _Complete_ vs _Instruct_:
+                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                    - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                    - `Average` is the average of `Complete` and `Instruct` when both are available.
+                    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
+                    - `#Act Params (B)` is the number of activated model parameters during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
+
+            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        hard_task_elo_map = gr.Plot()
+                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [hard_elo_task_gr],
+                                  hard_task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        hard_bench_elo_map = gr.Plot()
+                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                                  hard_bench_elo_map)
+
+            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+                with gr.Column():
+                    hard_complete_map = gr.Plot()
+                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_complete_map)
+                    hard_instruct_map = gr.Plot()
+                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_instruct_map)
+        with gr.Tab("🎯 Full Set") as full_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - _Complete_ vs _Instruct_:
+                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+                    - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+                    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+                    - `size` is the amount of activated model weight during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
 
+            with gr.TabItem("📊 Elo Rating", id="full_elo"):
+                with gr.Column():
+                    with gr.Group():
 
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        task_elo_map = gr.Plot()
+                        elo_task_gr = init_others(ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        bench_elo_map = gr.Plot()
+                        elo_bench_gr = init_others(ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
+            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+                with gr.Column():
+                    complete_map = gr.Plot()
+                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                ], complete_map)
+                    instruct_map = gr.Plot()
+                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                ], instruct_map)
        with gr.TabItem("📝 About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🔎 Data Viewer", id="viewer"):
@@ -522,8 +522,8 @@ with main_block as demo:
             show_copy_button=True,
         )
 
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])