Terry Zhuo committed on
Commit faf6544
1 Parent(s): 67bdb6e

add full results back

Files changed (1)
  1. app.py +164 -164
app.py CHANGED
@@ -150,26 +150,26 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def get_latest_data_leaderboard(
-    # leaderboard_initial_df = None,
+    leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
-    # elo_task_df = None,
-    # elo_bench_df = None,
+    elo_task_df = None,
+    elo_bench_df = None,
     hard_elo_task_df = None,
     hard_elo_bench_df = None,
-    # complete_solve_df = None,
-    # instruct_solve_df = None,
+    complete_solve_df = None,
+    instruct_solve_df = None,
     hard_complete_solve_df = None,
     hard_instruct_solve_df = None
 ):
     global NEW_DATA_ON_LEADERBOARD
-    # global LEADERBOARD_DF
+    global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    # global ELO_TASK_DF
-    # global ELO_BENCH_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
-    # global COMPLETE_SOLVE_DF
-    # global INSTRUCT_SOLVE_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
@@ -183,10 +183,10 @@ def get_latest_data_leaderboard(
         download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
         verification_mode="no_checks"
     )
-    # LEADERBOARD_DF = get_leaderboard_df(
-    #     leaderboard_dataset=leaderboard_dataset,
-    #     cols=COLS,
-    # )
+    LEADERBOARD_DF = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
+        cols=COLS,
+    )
     hard_leaderboard_dataset = datasets.load_dataset(
         HARD_RESULT_REPO,
         "default",
@@ -201,24 +201,24 @@ def get_latest_data_leaderboard(
     )
     HARD_LEADERBOARD_DF = hard_leaderboard_df
 
-    # elo_task_df = datasets.load_dataset(
-    #     ELO_REPO,
-    #     "default",
-    #     split="task_no_tie",
-    #     cache_dir=HF_HOME,
-    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-    #     verification_mode="no_checks"
-    # ).to_pandas()
-    # elo_bench_df = datasets.load_dataset(
-    #     ELO_REPO,
-    #     "default",
-    #     split="benchmark_tie",
-    #     cache_dir=HF_HOME,
-    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-    #     verification_mode="no_checks"
-    # ).to_pandas()
-    # ELO_TASK_DF = elo_task_df
-    # ELO_BENCH_DF = elo_bench_df
+    elo_task_df = datasets.load_dataset(
+        ELO_REPO,
+        "default",
+        split="task_no_tie",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+        verification_mode="no_checks"
+    ).to_pandas()
+    elo_bench_df = datasets.load_dataset(
+        ELO_REPO,
+        "default",
+        split="benchmark_tie",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+        verification_mode="no_checks"
+    ).to_pandas()
+    ELO_TASK_DF = elo_task_df
+    ELO_BENCH_DF = elo_bench_df
 
     hard_elo_task_df = datasets.load_dataset(
         HARD_ELO_REPO,
@@ -239,24 +239,24 @@ def get_latest_data_leaderboard(
     HARD_ELO_TASK_DF = hard_elo_task_df
     HARD_ELO_BENCH_DF = hard_elo_bench_df
 
-    # complete_solve_df = datasets.load_dataset(
-    #     SOLVE_REPO,
-    #     "default",
-    #     split="complete",
-    #     cache_dir=HF_HOME,
-    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-    #     verification_mode="no_checks"
-    # ).to_pandas()
-    # instruct_solve_df = datasets.load_dataset(
-    #     SOLVE_REPO,
-    #     "default",
-    #     split="instruct",
-    #     cache_dir=HF_HOME,
-    #     download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-    #     verification_mode="no_checks"
-    # ).to_pandas()
-    # COMPLETE_SOLVE_DF = complete_solve_df
-    # INSTRUCT_SOLVE_DF = instruct_solve_df
+    complete_solve_df = datasets.load_dataset(
+        SOLVE_REPO,
+        "default",
+        split="complete",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+        verification_mode="no_checks"
+    ).to_pandas()
+    instruct_solve_df = datasets.load_dataset(
+        SOLVE_REPO,
+        "default",
+        split="instruct",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+        verification_mode="no_checks"
+    ).to_pandas()
+    COMPLETE_SOLVE_DF = complete_solve_df
+    INSTRUCT_SOLVE_DF = instruct_solve_df
 
     hard_complete_solve_df = datasets.load_dataset(
         HARD_SOLVE_REPO,
@@ -280,41 +280,41 @@ def get_latest_data_leaderboard(
         NEW_DATA_ON_LEADERBOARD = False
 
     else:
-        # LEADERBOARD_DF = leaderboard_initial_df
-        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
-        # ELO_TASK_DF = elo_task_df
+        LEADERBOARD_DF = leaderboard_initial_df
+        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        ELO_TASK_DF = elo_task_df
         # ELO_BENCH_DF = elo_bench_df
-        HARD_ELO_TASK_DF = hard_elo_task_df
+        # HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
-        # COMPLETE_SOLVE_DF = complete_solve_df
+        COMPLETE_SOLVE_DF = complete_solve_df
         # INSTRUCT_SOLVE_DF = instruct_solve_df
-        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
 
     # Always redownload the leaderboard DataFrame
-    # global LEADERBOARD_DF
+    global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
-    # global ELO_TASK_DF
-    # global ELO_BENCH_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
-    # global COMPLETE_SOLVE_DF
-    # global INSTRUCT_SOLVE_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF
    global HARD_INSTRUCT_SOLVE_DF
 
-    # LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
-    HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    # return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
-    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
     # Initialize VoteManager
     # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
@@ -382,104 +382,104 @@ with main_block as demo:
 
     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        # with gr.Tab("💎 Hard Set") as hard_tabs:
-        with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
-            hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
-            gr.Markdown(
-                """
-                **Notes:**
-                - For the efficiency reasons, we only display the Hard Set leaderboard.
-                - _Hard Set_ vs _Full Set_:
-                    - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
-                    - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
-                - _Complete_ vs _Instruct_:
-                    - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
-                    - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
-                - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
-                - `Average` is the average of `Complete` and `Instruct` when both are available.
-                - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
-                - `#Act Params (B)` is the number of activated model parameters during inference.
-                - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
-                - For more details check the 📝 About section.
-                """,
-                elem_classes="markdown-text",
-            )
-
-        with gr.TabItem("📊 Elo Rating", id="hard_elo"):
-            with gr.Column():
-                with gr.Group():
-                    gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-                    hard_task_elo_map = gr.Plot()
-                    hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
-                    demo.load(plot_elo_mle, [hard_elo_task_gr],
-                              hard_task_elo_map)
-                with gr.Group():
-                    gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-                    hard_bench_elo_map = gr.Plot()
-                    hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
-                    demo.load(plot_elo_mle, [hard_elo_bench_gr],
-                              hard_bench_elo_map)
-
-        with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
-            with gr.Column():
-                hard_complete_map = gr.Plot()
-                hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
-                demo.load(plot_solve_rate, [hard_complete_solve_gr,
-                                            gr.Textbox("Complete", visible=False),
-                                            gr.Number(10, visible=False),
-                                            gr.Number(16, visible=False),
-                                            ], hard_complete_map)
-                hard_instruct_map = gr.Plot()
-                hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
-                demo.load(plot_solve_rate, [hard_instruct_solve_gr,
-                                            gr.Textbox("Instruct", visible=False),
-                                            gr.Number(10, visible=False),
-                                            gr.Number(16, visible=False),
-                                            ], hard_instruct_map)
-        # with gr.Tab("🎯 Full Set") as full_tabs:
-        #     with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
-        #         leaderboard = init_leaderboard(LEADERBOARD_DF)
-        #         gr.Markdown(
-        #             """
-        #             **Notes:**
-        #             - _Complete_ vs _Instruct_:
-        #                 - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
-        #                 - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
-        #             - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
-        #             - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
-        #             - `size` is the amount of activated model weight during inference.
-        #             - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
-        #             - For more details check the 📝 About section.
-        #             """,
-        #             elem_classes="markdown-text",
-        #         )
+        with gr.Tab("💎 Hard Set") as hard_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - For the efficiency reasons, we only display the Hard Set leaderboard.
+                    - _Hard Set_ vs _Full Set_:
+                        - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                        - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
+                    - _Complete_ vs _Instruct_:
+                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                    - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                    - `Average` is the average of `Complete` and `Instruct` when both are available.
+                    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
+                    - `#Act Params (B)` is the number of activated model parameters during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
+
+            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        hard_task_elo_map = gr.Plot()
+                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [hard_elo_task_gr],
+                                  hard_task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        hard_bench_elo_map = gr.Plot()
+                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                                  hard_bench_elo_map)
+
+            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+                with gr.Column():
+                    hard_complete_map = gr.Plot()
+                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_complete_map)
+                    hard_instruct_map = gr.Plot()
+                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_instruct_map)
+        with gr.Tab("🎯 Full Set") as full_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - _Complete_ vs _Instruct_:
+                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+                    - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+                    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+                    - `size` is the amount of activated model weight during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
 
-        #     with gr.TabItem("📊 Elo Rating", id="full_elo"):
-        #         with gr.Column():
-        #             with gr.Group():
+            with gr.TabItem("📊 Elo Rating", id="full_elo"):
+                with gr.Column():
+                    with gr.Group():
 
-        #                 gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
-        #                 task_elo_map = gr.Plot()
-        #                 elo_task_gr = init_others(ELO_TASK_DF)
-        #                 demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
-        #             with gr.Group():
-        #                 gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
-        #                 bench_elo_map = gr.Plot()
-        #                 elo_bench_gr = init_others(ELO_BENCH_DF)
-        #                 demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        task_elo_map = gr.Plot()
+                        elo_task_gr = init_others(ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        bench_elo_map = gr.Plot()
+                        elo_bench_gr = init_others(ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
-        #     with gr.TabItem("🧩 Solve Rate", id="full_solve"):
-        #         with gr.Column():
-        #             complete_map = gr.Plot()
-        #             complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
-        #             demo.load(plot_solve_rate, [complete_solve_gr,
-        #                                         gr.Textbox("Complete", visible=False),
-        #                                         ], complete_map)
-        #             instruct_map = gr.Plot()
-        #             instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
-        #             demo.load(plot_solve_rate, [instruct_solve_gr,
-        #                                         gr.Textbox("Instruct", visible=False),
-        #                                         ], instruct_map)
+            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+                with gr.Column():
+                    complete_map = gr.Plot()
+                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                ], complete_map)
+                    instruct_map = gr.Plot()
+                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                ], instruct_map)
        with gr.TabItem("📝 About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🔎 Data Viewer", id="viewer"):
@@ -522,8 +522,8 @@ with main_block as demo:
         show_copy_button=True,
     )
 
-    # main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
 