Terry Zhuo committed
Commit faf6544 • 1 Parent(s): 67bdb6e
add full results back
app.py CHANGED
@@ -150,26 +150,26 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
 def get_latest_data_leaderboard(
+    leaderboard_initial_df = None,
     hard_leaderboard_initial_df = None,
+    elo_task_df = None,
+    elo_bench_df = None,
     hard_elo_task_df = None,
     hard_elo_bench_df = None,
+    complete_solve_df = None,
+    instruct_solve_df = None,
     hard_complete_solve_df = None,
     hard_instruct_solve_df = None
 ):
     global NEW_DATA_ON_LEADERBOARD
+    global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
     global HARD_COMPLETE_SOLVE_DF
     global HARD_INSTRUCT_SOLVE_DF
 
@@ -183,10 +183,10 @@ def get_latest_data_leaderboard(
             download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
             verification_mode="no_checks"
         )
+        LEADERBOARD_DF = get_leaderboard_df(
+            leaderboard_dataset=leaderboard_dataset,
+            cols=COLS,
+        )
         hard_leaderboard_dataset = datasets.load_dataset(
             HARD_RESULT_REPO,
             "default",
@@ -201,24 +201,24 @@ def get_latest_data_leaderboard(
         )
         HARD_LEADERBOARD_DF = hard_leaderboard_df
 
+        elo_task_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="task_no_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        elo_bench_df = datasets.load_dataset(
+            ELO_REPO,
+            "default",
+            split="benchmark_tie",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        ELO_TASK_DF = elo_task_df
+        ELO_BENCH_DF = elo_bench_df
 
         hard_elo_task_df = datasets.load_dataset(
             HARD_ELO_REPO,
@@ -239,24 +239,24 @@ def get_latest_data_leaderboard(
         HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
 
+        complete_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="complete",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        instruct_solve_df = datasets.load_dataset(
+            SOLVE_REPO,
+            "default",
+            split="instruct",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+            verification_mode="no_checks"
+        ).to_pandas()
+        COMPLETE_SOLVE_DF = complete_solve_df
+        INSTRUCT_SOLVE_DF = instruct_solve_df
 
         hard_complete_solve_df = datasets.load_dataset(
             HARD_SOLVE_REPO,
@@ -280,41 +280,41 @@ def get_latest_data_leaderboard(
         NEW_DATA_ON_LEADERBOARD = False
 
     else:
-        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        LEADERBOARD_DF = leaderboard_initial_df
+        # HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
+        ELO_TASK_DF = elo_task_df
         # ELO_BENCH_DF = elo_bench_df
-        HARD_ELO_TASK_DF = hard_elo_task_df
+        # HARD_ELO_TASK_DF = hard_elo_task_df
         HARD_ELO_BENCH_DF = hard_elo_bench_df
+        COMPLETE_SOLVE_DF = complete_solve_df
         # INSTRUCT_SOLVE_DF = instruct_solve_df
-        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
+        # HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
         HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df
 
-    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
 
     # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
     global HARD_LEADERBOARD_DF
+    global ELO_TASK_DF
+    global ELO_BENCH_DF
     global HARD_ELO_TASK_DF
     global HARD_ELO_BENCH_DF
+    global COMPLETE_SOLVE_DF
+    global INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF
    global HARD_INSTRUCT_SOLVE_DF
 
-    HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
+    # HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = get_latest_data_leaderboard()
 
-    return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    return (LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
+    # return (HARD_LEADERBOARD_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)
 
 # Initialize VoteManager
 # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
@@ -382,104 +382,104 @@ with main_block as demo:
 
     # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.Tab("💎 Hard Set") as hard_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
+                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - For the efficiency reasons, we only display the Hard Set leaderboard.
+                    - _Hard Set_ vs _Full Set_:
+                        - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
+                        - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
+                    - _Complete_ vs _Instruct_:
+                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
+                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
+                    - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
+                    - `Average` is the average of `Complete` and `Instruct` when both are available.
+                    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
+                    - `#Act Params (B)` is the number of activated model parameters during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
+
+            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
+                with gr.Column():
+                    with gr.Group():
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        hard_task_elo_map = gr.Plot()
+                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [hard_elo_task_gr],
+                                  hard_task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        hard_bench_elo_map = gr.Plot()
+                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
+                                  hard_bench_elo_map)
+
+            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
+                with gr.Column():
+                    hard_complete_map = gr.Plot()
+                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_complete_map)
+                    hard_instruct_map = gr.Plot()
+                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                gr.Number(10, visible=False),
+                                                gr.Number(16, visible=False),
+                                                ], hard_instruct_map)
+        with gr.Tab("🎯 Full Set") as full_tabs:
+            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
+                leaderboard = init_leaderboard(LEADERBOARD_DF)
+                gr.Markdown(
+                    """
+                    **Notes:**
+                    - _Complete_ vs _Instruct_:
+                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
+                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
+                    - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
+                    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
+                    - `size` is the amount of activated model weight during inference.
+                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
+                    - For more details check the 📝 About section.
+                    """,
+                    elem_classes="markdown-text",
+                )
 
+            with gr.TabItem("📊 Elo Rating", id="full_elo"):
+                with gr.Column():
+                    with gr.Group():
 
+                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
+                        task_elo_map = gr.Plot()
+                        elo_task_gr = init_others(ELO_TASK_DF)
+                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
+                    with gr.Group():
+                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
+                        bench_elo_map = gr.Plot()
+                        elo_bench_gr = init_others(ELO_BENCH_DF)
+                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
 
+            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
+                with gr.Column():
+                    complete_map = gr.Plot()
+                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
+                    demo.load(plot_solve_rate, [complete_solve_gr,
+                                                gr.Textbox("Complete", visible=False),
+                                                ], complete_map)
+                    instruct_map = gr.Plot()
+                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
+                    demo.load(plot_solve_rate, [instruct_solve_gr,
+                                                gr.Textbox("Instruct", visible=False),
+                                                ], instruct_map)
        with gr.TabItem("📝 About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
        with gr.TabItem("🔎 Data Viewer", id="viewer"):
@@ -522,8 +522,8 @@ with main_block as demo:
             show_copy_button=True,
         )
 
-    main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
+    # main_block.load(fn=get_latest_data_leaderboard, inputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[hard_leaderboard, hard_elo_task_gr, hard_elo_bench_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
     # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
     # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])