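"""Gradio Space for the benchmark leaderboard.

Renders the leaderboard and handles model submissions: uploaded result files
are validated and then used to update the leaderboard dataset.
"""
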
import json
import os
import time
from pathlib import Path
import gradio as gr
from datasets import get_dataset_config_names, load_dataset
from src.display.formatting import benchmark_version_hyperlink, leaderboard_version_hyperlink
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from src.about import (
ABOUT_TEXT,
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
TITLE,
TITLE_MARKDOWN_DESCRIPTION,
)
from src.display.utils import ModelType, Precision, WeightType
from src.envs import API, BENCHMARK_REPO, REPO_ID
from src.populate import create_leaderboard_df, get_sorted_versions
from src.submission.check_validity import (
validate_report_format,
validate_results_coverage,
validate_results_structure,
)
from src.submission.submit import update_dataset_with_scores
STATIC_DIR = str(Path(__file__).parent / "src" / "static")
# Global state for leaderboard data
current_leaderboard_df = None
def initialize_leaderboard():
"""Initialize the global leaderboard DataFrame"""
global current_leaderboard_df
current_leaderboard_df = create_leaderboard_df()
return current_leaderboard_df
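# Submission flow: read the uploaded JSON report, run the validity checks from
# src.submission.check_validity, then push the scores to the results dataset.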
def process_submission(
model_name: str,
base_model: str,
revision: str,
precision: str,
weight_type: str,
model_type: str,
results_file: Path,
params: float,
is_reasoning: bool,
is_moe: bool,
progress=gr.Progress(),
) -> str:
"""Processes model submission and updates the leaderboard with visual progress tracking."""
try:
# Initial setup
progress(0, desc="Starting...")
        # Read the uploaded results file
with open(results_file.name, "r") as f:
results = json.load(f)
# Fetch dataset configurations early since we need them for multiple steps
dataset_configs = get_dataset_config_names(BENCHMARK_REPO)
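        # These config names are used twice below: validate_results_coverage
        # checks the uploaded report against them, and update_dataset_with_scores
        # receives them again when the leaderboard dataset is updated.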
# Process through validation steps with progress bar
steps = progress.tqdm(
[
("Validating structure", validate_results_structure, (results,)),
(
"Checking coverage",
validate_results_coverage,
(results, dataset_configs),
),
("Validating format", validate_report_format, (results,)),
],
desc="Processing submission...",
)
for desc, func, args in steps:
            time.sleep(0.5)  # Brief pause so each validation step stays visible in the progress bar
            if not func(*args):
return f"❌ Error during {desc.lower()}"
# Prepare metadata - 80% progress
progress(0.8, desc="Preparing metadata...")
meta_info = {
"model_id": f"{model_name}-{revision}",
"name": model_name,
"is_open_source": model_type == "open_source : Open Source Model",
# "Number of Params": params,
"is_reasoning": is_reasoning,
# "is_moe": is_moe,
}
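        # The ModelType dropdown passes its choice string (str(ModelType)), so
        # open-source status is inferred via the string comparison above;
        # `params` and `is_moe` are collected in the UI but not stored here yet.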
# Update leaderboard
progress(0.9, desc="Updating leaderboard...")
update_dataset_with_scores(meta_info, results, dataset_configs)
progress(1.0, desc="Done!")
return "βœ… Successfully validated results and updated leaderboard!"
except Exception as e:
return f"❌ Error: {str(e)}"
def get_benchmark_version():
    """Get the current benchmark dataset version."""
    try:
        configs = get_dataset_config_names(BENCHMARK_REPO)
        _benchmark_dataset = load_dataset(BENCHMARK_REPO, configs[0])
        version = get_sorted_versions(_benchmark_dataset)
        if version == "train":
            # A bare "train" split carries no explicit version tag; fall back to 1.0.0.
            version = "1.0.0"
        return version
    except Exception:
        return "Unknown"
# Create the Gradio interface
demo = gr.Blocks().queue()
demo.static_dir = STATIC_DIR
with demo:
gr.HTML(TITLE)
gr.Markdown(TITLE_MARKDOWN_DESCRIPTION)
with gr.Tabs() as tabs:
with gr.TabItem("πŸ… Leaderboard", elem_id="model-leaderboard", id=0):
            # Fetch the initial dataframe and the available leaderboard versions for the version dropdown
initial_df, available_versions = create_leaderboard_df()
benchmark_version = get_benchmark_version()
with gr.Row():
with gr.Column(scale=2):
version_dropdown = gr.Dropdown(
choices=available_versions,
value=available_versions[0],
label="Leaderboard Version",
interactive=True
)
with gr.Column(scale=1):
gr.HTML(benchmark_version_hyperlink(benchmark_version))
gr.HTML(leaderboard_version_hyperlink(available_versions[0]))
# gr.Markdown(f"*Dataset Version: {benchmark_version}*", elem_classes="markdown-text")
# Get column types dynamically
column_types = {
"Model": "str",
"Overall Score": "number",
"Open Source": "bool",
# "MoE": "bool",
"Reasoning": "bool",
**{
col: "number"
for col in initial_df.columns
if col
not in [
"Model",
"Overall Score",
# "Parameters (B)",
"Open Source",
# "MoE",
"Reasoning",
]
},
}
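            # Everything apart from "Model" and the boolean flag columns is typed
            # as a number; these are presumably the per-task score columns
            # returned by create_leaderboard_df().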
# Create leaderboard
leaderboard = Leaderboard(
value=initial_df,
datatype=column_types,
select_columns=SelectColumns(
default_selection=[
col
for col in initial_df.columns
if col not in ["Open Source", "Reasoning"]
],
cant_deselect=["Model", "Overall Score"],
),
search_columns=["Model"],
filter_columns=[
ColumnFilter(
"Open Source",
type="boolean",
label="Show only open source models",
default=False,
),
# ColumnFilter(
# "MoE",
# type="boolean",
# label="Show only MoE models",
# default=False,
# ),
ColumnFilter(
"Reasoning",
type="boolean",
label="Show only reasoning models",
default=False,
),
],
bool_checkboxgroup_label="Apply Filters",
)
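            # The boolean flag columns are hidden by default via SelectColumns but
            # still drive the checkbox filters above; "Model" and "Overall Score"
            # can never be deselected.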
# Update leaderboard when version changes
def update_leaderboard(version):
df, _ = create_leaderboard_df(version)
return df
version_dropdown.change(
fn=update_leaderboard,
inputs=[version_dropdown],
outputs=[leaderboard],
)
with gr.TabItem("πŸš€ Submit", elem_id="submit-tab", id=1):
with gr.Column():
gr.Markdown(
"## Submit your model evaluation results",
elem_classes="markdown-text",
)
# Model Information
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name", placeholder="e.g., GPT-Chemistry")
revision_name_textbox = gr.Textbox(label="Version", placeholder="main")
model_type = gr.Dropdown(
choices=[str(t) for t in ModelType],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[str(p) for p in Precision],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[str(w) for w in WeightType],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
)
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
# Additional Required Information
with gr.Row():
with gr.Column():
params_number = gr.Number(
label="Number of Parameters (in billions)",
value=None,
info="e.g., 7.0",
)
is_reasoning = gr.Checkbox(label="Uses reasoning/Chain-of-Thought", value=False)
is_moe = gr.Checkbox(label="Is Mixture of Experts (MoE)", value=False)
# Results File Upload
with gr.Row():
results_file = gr.File(
label="Upload Results JSON",
file_types=[".json"],
)
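                    # The uploaded JSON report is parsed and validated inside
                    # process_submission() before the leaderboard is touched.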
# Submit Button and Results
with gr.Row(equal_height=True):
submit_button = gr.Button("Submit and Update Leaderboard")
                    output = gr.Markdown(value="Submission status will appear here.", label="Submission Status")
                    def handle_submission(*args):
                        result = process_submission(*args)
                        if "✅" in result:  # Submission succeeded
                            time.sleep(5)  # Give the user a moment to read the status message
                            API.restart_space(repo_id=REPO_ID)
                        return result
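                    # Restarting the Space is presumably how the new scores get picked
                    # up: the leaderboard dataframe is built once at app start-up via
                    # create_leaderboard_df().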
submit_button.click(
fn=handle_submission,
inputs=[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
results_file,
params_number,
is_reasoning,
is_moe,
],
outputs=output,
show_progress=True,
)
with gr.TabItem("πŸ“ About", elem_id="about-tab", id=2):
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
if __name__ == "__main__":
    # The Blocks app is already queued at creation time, so just launch it.
    demo.launch()