import os
import logging
import time
import schedule
import datetime
import gradio as gr
from threading import Thread
import datasets
from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
# Start ephemeral Spaces on PRs (see config in README.md)
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
from src.display.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
# INTRODUCTION_TEXT,
TITLE,
ABOUT_TEXT,
SUBMISSION_TEXT_3,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
fields,
EvalQueueColumn
)
from src.envs import (
API,
EVAL_REQUESTS_PATH,
RESULT_REPO,
DATA_VERSION,
DATA_REPO,
HARD_RESULT_REPO,
ELO_REPO,
HARD_ELO_REPO,
SOLVE_REPO,
HARD_SOLVE_REPO,
HF_TOKEN,
QUEUE_REPO,
REPO_ID,
VOTES_REPO,
VOTES_PATH,
HF_HOME,
)
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.tools.plots import plot_elo_mle, plot_solve_rate
# from src.voting.vote_system import VoteManager, run_scheduler
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Start ephemeral Spaces on PRs (see config in README.md)
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
# Flag meant to select a full initialization. It is currently hard-coded to True: the
# "LEADERBOARD_FULL_INIT" environment lookup is commented out on the same line.
# NOTE(review): nothing in the visible part of this file reads DO_FULL_INIT -- confirm whether it is still needed.
DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
# When True, the next call to get_latest_data_leaderboard() reloads every dataset and then clears the flag.
NEW_DATA_ON_LEADERBOARD = True
# Module-level caches; they stay None until get_latest_data_leaderboard() fills them.
LEADERBOARD_DF = None
HARD_LEADERBOARD_DF = None
ELO_TASK_DF = None
ELO_BENCH_DF = None
HARD_ELO_TASK_DF = None
HARD_ELO_BENCH_DF = None
COMPLETE_SOLVE_DF = None
INSTRUCT_SOLVE_DF = None
HARD_COMPLETE_SOLVE_DF = None
HARD_INSTRUCT_SOLVE_DF = None
# Dataset searched by filter_data() and displayed by update_display(); loaded once at import time.
DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
verification_mode="no_checks")
def filter_data(data, keyword):
    """Return the items of ``data`` whose ``complete_prompt`` contains ``keyword``.

    The match is a case-insensitive substring test.

    Args:
        data: Iterable of mappings that each provide a ``'complete_prompt'``
            string.
        keyword: Text to search for. A falsy value (``""`` or ``None``)
            disables filtering.

    Returns:
        ``data`` itself, unchanged, when ``keyword`` is falsy; otherwise a new
        list holding the matching items in their original order.
    """
    if not keyword:
        return data
    # Lower-case the keyword once instead of once per item.
    needle = keyword.lower()
    return [item for item in data if needle in item['complete_prompt'].lower()]
def update_display(search_keyword, index, show_test):
    """Build the data-viewer outputs for a search keyword and slider position.

    Args:
        search_keyword: Keyword handed to ``filter_data``; a falsy value keeps
            every item of ``DATA``.
        index: Requested position within the filtered items. It is coerced to
            ``int`` and clamped to the valid range.
        show_test: When truthy, the item's ``'test'`` field is returned;
            otherwise an empty string is returned in its place.

    Returns:
        A six-element list: task id, complete prompt, instruct prompt, test
        code (or ``""``), number of filtered items, and a ``gr.update`` that
        sets the slider's maximum and value. When nothing matches, the first
        element is a "no data" message, the three text slots are empty, the
        count is 0 and the slider is reset to 0.
    """
    filtered_data = filter_data(DATA, search_keyword)
    if not filtered_data:
        # Must have the same length as the success path below (six values):
        # one message, three blank panes, the count and the slider update.
        return ["No data available. Check the search criteria."] + [""] * 3 + [0, gr.update(maximum=0, value=0)]
    max_index = len(filtered_data) - 1
    # Coerce to int so a float slider value can still be used as a sequence index.
    index = min(max(0, int(index)), max_index)
    selected = filtered_data[index]
    task_id = selected['task_id']
    snippet1 = selected['complete_prompt']
    snippet2 = selected['instruct_prompt']
    # snippet3 = selected['canonical_solution'] if show_solution else ""
    snippet4 = selected['test'] if show_test else ""
    return [
        task_id,
        snippet1,
        snippet2,
        # snippet3,
        snippet4,
        len(filtered_data),
        gr.update(maximum=max_index, value=index),
    ]
def restart_space():
    """Ask the Hub API to restart the Space ``REPO_ID``, authenticating with ``HF_TOKEN``."""
    API.restart_space(token=HF_TOKEN, repo_id=REPO_ID)
def time_diff_wrapper(func):
    """Decorate ``func`` so that the duration of each call is logged.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, logs the elapsed
        seconds at INFO level and returns ``func``'s result. If ``func``
        raises, the exception propagates and nothing is logged.
    """
    import functools

    # functools.wraps keeps the wrapped function's __name__ and __doc__.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the difference cannot go negative
        # when the system clock is adjusted during the call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        diff = time.perf_counter() - start_time
        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
        return result

    return wrapper
@time_diff_wrapper
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Download a Hub repository snapshot, retrying with exponential backoff.

    Args:
        repo_id: Identifier of the repository to download.
        local_dir: Directory the snapshot is written to.
        repo_type: Repository type passed to ``snapshot_download``.
        max_attempts: Maximum number of download attempts.
        backoff_factor: Base of the wait time; the pause after failed attempt
            ``k`` (0-based) is ``backoff_factor**k`` seconds.

    Returns:
        None once a download attempt succeeds.

    Raises:
        Exception: When every attempt fails. The last underlying error is
            attached as the cause.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            logging.info(f"Downloading {repo_id} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type=repo_type,
                tqdm_class=None,
                etag_timeout=30,
                max_workers=8,
            )
            logging.info("Download successful")
            return
        except Exception as e:
            last_error = e
            if attempt + 1 >= max_attempts:
                # No attempt follows, so do not sleep or announce a retry.
                logging.error(f"Error downloading {repo_id}: {e}, giving up")
                break
            wait_time = backoff_factor**attempt
            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts") from last_error
def get_latest_data_leaderboard(
    leaderboard_initial_df=None,
    hard_leaderboard_initial_df=None,
    elo_task_df=None,
    elo_bench_df=None,
    hard_elo_task_df=None,
    hard_elo_bench_df=None,
    complete_solve_df=None,
    instruct_solve_df=None,
    hard_complete_solve_df=None,
    hard_instruct_solve_df=None,
):
    """Refresh the module-level DataFrames and return them.

    When ``NEW_DATA_ON_LEADERBOARD`` is set, every DataFrame is rebuilt from
    the datasets (reusing cached copies) and the flag is cleared; the
    arguments are ignored in that case. Otherwise the arguments are stored
    as the new module-level values unchanged.

    Args:
        leaderboard_initial_df: Value for ``LEADERBOARD_DF``.
        hard_leaderboard_initial_df: Value for ``HARD_LEADERBOARD_DF``.
        elo_task_df: Value for ``ELO_TASK_DF``.
        elo_bench_df: Value for ``ELO_BENCH_DF``.
        hard_elo_task_df: Value for ``HARD_ELO_TASK_DF``.
        hard_elo_bench_df: Value for ``HARD_ELO_BENCH_DF``.
        complete_solve_df: Value for ``COMPLETE_SOLVE_DF``.
        instruct_solve_df: Value for ``INSTRUCT_SOLVE_DF``.
        hard_complete_solve_df: Value for ``HARD_COMPLETE_SOLVE_DF``.
        hard_instruct_solve_df: Value for ``HARD_INSTRUCT_SOLVE_DF``.

    Returns:
        A 10-tuple of the module-level values, in the same order as the
        parameters.
    """
    global NEW_DATA_ON_LEADERBOARD
    global LEADERBOARD_DF
    global HARD_LEADERBOARD_DF
    global ELO_TASK_DF
    global ELO_BENCH_DF
    global HARD_ELO_TASK_DF
    global HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF
    global INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF
    global HARD_INSTRUCT_SOLVE_DF

    if NEW_DATA_ON_LEADERBOARD:
        print("Leaderboard updated at reload!")
        LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_cached_split(RESULT_REPO, "train"),
            cols=COLS,
        )
        HARD_LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_cached_split(HARD_RESULT_REPO, "train"),
            cols=COLS,
        )
        # Each pair is loaded completely before either global is rebound.
        ELO_TASK_DF, ELO_BENCH_DF = (
            _load_cached_split(ELO_REPO, "task_no_tie").to_pandas(),
            _load_cached_split(ELO_REPO, "benchmark_tie").to_pandas(),
        )
        HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF = (
            _load_cached_split(HARD_ELO_REPO, "task_no_tie").to_pandas(),
            _load_cached_split(HARD_ELO_REPO, "benchmark_tie").to_pandas(),
        )
        COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF = (
            _load_cached_split(SOLVE_REPO, "complete").to_pandas(),
            _load_cached_split(SOLVE_REPO, "instruct").to_pandas(),
        )
        HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = (
            _load_cached_split(HARD_SOLVE_REPO, "complete").to_pandas(),
            _load_cached_split(HARD_SOLVE_REPO, "instruct").to_pandas(),
        )
        # Cleared only after every load succeeded, so a failure is retried on the next call.
        NEW_DATA_ON_LEADERBOARD = False
    else:
        LEADERBOARD_DF = leaderboard_initial_df
        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
        ELO_TASK_DF = elo_task_df
        ELO_BENCH_DF = elo_bench_df
        HARD_ELO_TASK_DF = hard_elo_task_df
        HARD_ELO_BENCH_DF = hard_elo_bench_df
        COMPLETE_SOLVE_DF = complete_solve_df
        INSTRUCT_SOLVE_DF = instruct_solve_df
        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )


def _load_cached_split(repo_id, split):
    """Load ``split`` of the ``"default"`` config of ``repo_id``, reusing a cached copy if present."""
    return datasets.load_dataset(
        repo_id,
        "default",
        split=split,
        cache_dir=HF_HOME,
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
        verification_mode="no_checks",
    )
def init_space():
    """Populate the module-level DataFrames and return them.

    Calls ``get_latest_data_leaderboard()`` with no arguments and stores its
    ten results in the corresponding module-level names.

    Returns:
        The 10-tuple produced by ``get_latest_data_leaderboard()``.
    """
    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
    global ELO_TASK_DF, ELO_BENCH_DF
    global HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

    frames = get_latest_data_leaderboard()
    (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    ) = frames
    # Evaluation queue DataFrame retrieval is independent of initialization detail level
    # eval_queue_dfs = get_latest_data_queue()
    return frames
# Initialize VoteManager
# vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
# Schedule the upload_votes method to run every 15 minutes
# schedule.every(15).minutes.do(vote_manager.upload_votes)
# Start the scheduler in a separate thread
# scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
# scheduler_thread.start()
# Populate the module-level DataFrames at import time so the UI built below can use them.
# init_space() takes no arguments, so DO_FULL_INIT is not consulted here.
(
    LEADERBOARD_DF,
    HARD_LEADERBOARD_DF,
    ELO_TASK_DF,
    ELO_BENCH_DF,
    HARD_ELO_TASK_DF,
    HARD_ELO_BENCH_DF,
    COMPLETE_SOLVE_DF,
    INSTRUCT_SOLVE_DF,
    HARD_COMPLETE_SOLVE_DF,
    HARD_INSTRUCT_SOLVE_DF,
) = init_space()
# Data processing for plots now only on demand in the respective Gradio tab
# def load_and_create_plots():
# plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
# return plot_df
# Function to check if a user is logged in
def check_login(profile: gr.OAuthProfile | None) -> bool:
    """Return True when a profile object was supplied and False when it is None."""
    return profile is not None
def init_leaderboard(dataframe):
    """Create the ``Leaderboard`` component that displays ``dataframe``.

    Column types, default selection, hidden columns and filters are derived
    from the fields of ``AutoEvalColumn``.

    Args:
        dataframe: The DataFrame to display.

    Returns:
        A non-interactive ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is None or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    column_types = [col.type for col in fields(AutoEvalColumn)]
    shown_by_default = [col.name for col in fields(AutoEvalColumn) if col.displayed_by_default]
    always_shown = [col.name for col in fields(AutoEvalColumn) if col.never_hidden or col.dummy]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=always_shown,
        label="Select Columns to Display:",
    )
    searchable = [AutoEvalColumn.model.name]
    hidden = [col.name for col in fields(AutoEvalColumn) if col.hidden]
    filters = [
        ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
        ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
        ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
        ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
    ]
    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=searchable,
        hide_columns=hidden,
        filter_columns=filters,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
def init_others(dataframe):
    """Wrap ``dataframe`` in a hidden ``gr.Dataframe`` component.

    Args:
        dataframe: The DataFrame to hold in the component.

    Returns:
        A ``gr.Dataframe`` created with ``visible=False``.

    Raises:
        ValueError: If ``dataframe`` is None or empty.
    """
    missing = dataframe is None or dataframe.empty
    if missing:
        raise ValueError("Gradio DataFrame is empty or None.")
    hidden_frame = gr.Dataframe(dataframe, visible=False)
    return hidden_frame
# Root container for the whole UI; custom_css supplies the page styling.
main_block = gr.Blocks(css=custom_css)
# The same Blocks object is also bound to the name `demo`, which the tabs below use to register load events.
with main_block as demo:
# Header row that renders TITLE as raw HTML.
with gr.Row(elem_id="header-row"):
gr.HTML(TITLE)
# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
# Tab container for the app.
# NOTE(review): this listing has lost its indentation, so which of the following components are nested
# inside it cannot be read off here -- confirm against the original file.
with gr.Tabs(elem_classes="tab-buttons") as tabs:
# "Hard Set" tab: leaderboard table, Elo plots and solve-rate plots built from the HARD_* DataFrames.
with gr.Tab("π Hard Set") as hard_tabs:
with gr.TabItem("π
Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
# Leaderboard table; init_leaderboard raises ValueError if the DataFrame is None or empty.
hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
gr.Markdown(
"""
**Notes:**
- _Hard Set_ vs _Full Set_:
- Hard Set: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
- Full Set: The full set of 1140 BigCodeBench tasks.
- _Complete_ vs _Instruct_:
- Complete: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
- Instruct (π₯Vibe Checkπ₯): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
- `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
- `Average` is the average of `Complete` and `Instruct` when both are available.
- `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
- `#Act Params (B)` is the number of activated model parameters during inference.
- Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
- For more details check the π About section.
""",
elem_classes="markdown-text",
)
# Elo plots. Each gr.Plot is filled on page load: demo.load passes the value of a hidden
# gr.Dataframe (created by init_others) to plot_elo_mle and sends the result to the plot.
with gr.TabItem("π Elo Rating", id="hard_elo"):
with gr.Column():
with gr.Group():
gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
hard_task_elo_map = gr.Plot()
hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
demo.load(plot_elo_mle, [hard_elo_task_gr],
hard_task_elo_map)
with gr.Group():
gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
hard_bench_elo_map = gr.Plot()
hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
demo.load(plot_elo_mle, [hard_elo_bench_gr],
hard_bench_elo_map)
# Solve-rate plots, also drawn on page load. The hidden Textbox/Number components exist only to
# pass constant extra arguments to plot_solve_rate.
# NOTE(review): the meaning of the constants 10 and 16 is defined by plot_solve_rate -- confirm there.
with gr.TabItem("𧩠Solve Rate", id="hard_solve"):
with gr.Column():
hard_complete_map = gr.Plot()
hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
demo.load(plot_solve_rate, [hard_complete_solve_gr,
gr.Textbox("Complete", visible=False),
gr.Number(10, visible=False),
gr.Number(16, visible=False),
], hard_complete_map)
hard_instruct_map = gr.Plot()
hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
demo.load(plot_solve_rate, [hard_instruct_solve_gr,
gr.Textbox("Instruct", visible=False),
gr.Number(10, visible=False),
gr.Number(16, visible=False),
], hard_instruct_map)
# "Full Set" tab: the same three views as the Hard Set tab, built from the non-HARD DataFrames.
with gr.Tab("π― Full Set") as full_tabs:
with gr.TabItem("π
Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
# Leaderboard table; init_leaderboard raises ValueError if the DataFrame is None or empty.
leaderboard = init_leaderboard(LEADERBOARD_DF)
gr.Markdown(
"""
**Notes:**
- _Complete_ vs _Instruct_:
- Complete: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
- Instruct (π₯Vibe Checkπ₯): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
- `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
- `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
- `size` is the amount of activated model weight during inference.
- Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
- For more details check the π About section.
""",
elem_classes="markdown-text",
)
# Elo plots, drawn on page load from hidden gr.Dataframe components via plot_elo_mle.
with gr.TabItem("π Elo Rating", id="full_elo"):
with gr.Column():
with gr.Group():
gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
task_elo_map = gr.Plot()
elo_task_gr = init_others(ELO_TASK_DF)
demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
with gr.Group():
gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
bench_elo_map = gr.Plot()
elo_bench_gr = init_others(ELO_BENCH_DF)
demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
# Solve-rate plots. Only the DataFrame and the split label are passed to plot_solve_rate here
# (the Hard Set tab passes two further numbers).
# NOTE(review): presumably plot_solve_rate has defaults for the omitted arguments -- verify.
with gr.TabItem("𧩠Solve Rate", id="full_solve"):
with gr.Column():
complete_map = gr.Plot()
complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
demo.load(plot_solve_rate, [complete_solve_gr,
gr.Textbox("Complete", visible=False),
], complete_map)
instruct_map = gr.Plot()
instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
demo.load(plot_solve_rate, [instruct_solve_gr,
gr.Textbox("Instruct", visible=False),
], instruct_map)
# "About" tab: static Markdown rendered from ABOUT_TEXT.
with gr.TabItem("π About", id=3):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
# "Data Viewer" tab: browse the items of DATA by keyword and index.
with gr.TabItem("π Data Viewer", id="viewer"):
# Inputs read by update_display.
search_input = gr.Textbox(label="Search by keyword")
count_output = gr.Number(label="Number of filtered items")
# The slider initially spans all of DATA; update_display later adjusts its maximum to the filtered count.
index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")
# show_solution = gr.Checkbox(label="Show Solution")
show_test = gr.Checkbox(label="Show Test Cases")
update_button = gr.Button("Update")
# Outputs written by update_display.
task_id_output = gr.Textbox(label="Task ID")
code_completion = gr.Code(language="python", label="Code Completion")
nl_instruction = gr.Code(language="python", label="Natural Language Instruction")
# solution = gr.Code(language="python", label="Solution")
test_cases = gr.Code(language="python", label="Test Cases")
# update_display must return exactly six values, one per component listed in `outputs`.
update_button.click(
update_display,
inputs=[search_input, index_slider, show_test],
outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
)
# Initial load
demo.load(
update_display,
inputs=[search_input, index_slider, show_test],
outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
)
# "Request" tab: static Markdown rendered from SUBMISSION_TEXT_3.
with gr.TabItem("π Request", id=4):
gr.Markdown(SUBMISSION_TEXT_3)
# Collapsed accordion holding the citation text, with a copy button.
with gr.Row():
with gr.Accordion("π Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
# On page load, pass the current values of the ten data components to get_latest_data_leaderboard
# and write its ten results back to the same components, in the same order.
main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
# leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
# pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
# Enable the event queue with a default concurrency limit of 40.
main_block.queue(default_concurrency_limit=40)
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
    """Return the webhooks server for ``ui``, enabling Space CI when applicable.

    Space CI is skipped when ``SPACE_ID`` is None or when
    ``IS_EPHEMERAL_SPACE`` is true; a plain ``WebhooksServer`` wrapping ``ui``
    is returned in both cases. Otherwise the ``space_ci`` section of the
    Space's README card is read and handed to ``configure_space_ci``.

    Args:
        ui: The Gradio Blocks app the server should serve.

    Returns:
        The ``WebhooksServer`` to register webhooks on and launch.
    """
    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
    # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
    # ht to Lucain!
    if SPACE_ID is None:
        print("Not in a Space: Space CI disabled.")
        # Wrap the app passed in by the caller rather than the module-level main_block.
        return WebhooksServer(ui=ui)

    if IS_EPHEMERAL_SPACE:
        print("In an ephemeral Space: Space CI disabled.")
        return WebhooksServer(ui=ui)

    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
    config = card.data.get("space_ci", {})
    print(f"Enabling Space CI with config from README: {config}")

    return configure_space_ci(
        blocks=ui,
        trusted_authors=config.get("trusted_authors"),
        private=config.get("private", "auto"),
        variables=config.get("variables", "auto"),
        secrets=config.get("secrets"),
        hardware=config.get("hardware"),
        storage=config.get("storage"),
    )
# Create the webhooks server around main_block. enable_space_ci_and_return_server adds Space CI
# only when SPACE_ID is set and the Space is not ephemeral.
webhooks_server = enable_space_ci_and_return_server(ui=main_block)
# Add webhooks
@webhooks_server.add_webhook
def update_leaderboard(payload: WebhookPayload) -> None:
    """Force a re-download of every result dataset when a dataset update arrives.

    Only payloads whose repo type is ``"dataset"`` and whose event action is
    ``"update"`` are handled. If ``NEW_DATA_ON_LEADERBOARD`` is already set
    the call returns without downloading; otherwise the flag is set first
    and the datasets are then re-downloaded.
    """
    global NEW_DATA_ON_LEADERBOARD

    if payload.repo.type != "dataset" or payload.event.action != "update":
        return
    if NEW_DATA_ON_LEADERBOARD:
        return

    NEW_DATA_ON_LEADERBOARD = True
    tracked_repos = (RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO)
    for repo_id in tracked_repos:
        datasets.load_dataset(
            repo_id,
            "default",
            cache_dir=HF_HOME,
            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
            verification_mode="no_checks",
        )
# Start the backup restart job BEFORE launching the server: WebhooksServer.launch() blocks the
# main thread by default, so statements placed after it do not run while the app is serving.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h as backup in case automatic updates are not working
scheduler.start()
webhooks_server.launch()