Leaderboard

Running

App Files Files Community

Jerrycool committed on Apr 26

Commit

be52959

verified ·

1 Parent(s): 1117820

Update app.py

Browse files

Files changed (1) hide show

app.py +214 -244

app.py CHANGED Viewed

@@ -1,275 +1,245 @@
-"""
-app.py — MLE-Dojo Dark-Theme Leaderboard
----------------------------------------
-Run:  python app.py
-"""
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-# ---------- Placeholder / Fallback Imports ----------
-try:
-    from src.about import (
-        CITATION_BUTTON_LABEL,
-        CITATION_BUTTON_TEXT,
-        EVALUATION_QUEUE_TEXT,
-        INTRODUCTION_TEXT,
-        LLM_BENCHMARKS_TEXT,
-        TITLE,  # reassigned later in this file
-    )
-    from src.display.css_html_js import custom_css
-    from src.envs import REPO_ID
-    from src.submission.submit import add_new_eval
-except ImportError:
-    CITATION_BUTTON_LABEL = "Citation"
-    CITATION_BUTTON_TEXT = "Please cite us if you use this benchmark..."
-    EVALUATION_QUEUE_TEXT = "Current evaluation queue:"
-    INTRODUCTION_TEXT = "Welcome to the MLE-Dojo Benchmark Leaderboard."
-    LLM_BENCHMARKS_TEXT = "Information about the benchmarks..."
-    custom_css = ""
-    REPO_ID = "your/space-id"
-    def add_new_eval(*args):
-        return "Submission placeholder."
-# ---------- Elo Data ----------
 data = [
-    dict(model_name="gpt-4o-mini", url="https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
-         organizer="OpenAI", license="Proprietary", MLE_Lite_Elo=753, Tabular_Elo=839,
-         NLP_Elo=758, CV_Elo=754, Overall=778),
-    dict(model_name="gpt-4o", url="https://openai.com/index/hello-gpt-4o/",
-         organizer="OpenAI", license="Proprietary", MLE_Lite_Elo=830, Tabular_Elo=861,
-         NLP_Elo=903, CV_Elo=761, Overall=841),
-    dict(model_name="o3-mini", url="https://openai.com/index/openai-o3-mini/",
-         organizer="OpenAI", license="Proprietary", MLE_Lite_Elo=1108, Tabular_Elo=1019,
-         NLP_Elo=1056, CV_Elo=1207, Overall=1096),
-    dict(model_name="deepseek-v3", url="https://api-docs.deepseek.com/news/news1226",
-         organizer="DeepSeek", license="DeepSeek", MLE_Lite_Elo=1004, Tabular_Elo=1015,
-         NLP_Elo=1028, CV_Elo=1067, Overall=1023),
-    dict(model_name="deepseek-r1", url="https://api-docs.deepseek.com/news/news250120",
-         organizer="DeepSeek", license="DeepSeek", MLE_Lite_Elo=1137, Tabular_Elo=1053,
-         NLP_Elo=1103, CV_Elo=1083, Overall=1100),
-    dict(model_name="gemini-2.0-flash", url="https://ai.google.dev/gemini-api/docs/models#gemini-2.0-flash",
-         organizer="Google", license="Proprietary", MLE_Lite_Elo=847, Tabular_Elo=923,
-         NLP_Elo=860, CV_Elo=978, Overall=895),
-    dict(model_name="gemini-2.0-pro", url="https://blog.google/technology/google-deepmind/gemini-model-updates-february-2025/",
-         organizer="Google", license="Proprietary", MLE_Lite_Elo=1064, Tabular_Elo=1139,
-         NLP_Elo=1028, CV_Elo=973, Overall=1054),
-    dict(model_name="gemini-2.5-pro", url="https://deepmind.google/technologies/gemini/pro/",
-         organizer="Google", license="Proprietary", MLE_Lite_Elo=1257, Tabular_Elo=1150,
-         NLP_Elo=1266, CV_Elo=1177, Overall=1214),
 ]
 master_df = pd.DataFrame(data)
-# ---------- Category Logic ----------
-CATEGORIES = ["Overall", "MLE-Lite", "Tabular", "NLP", "CV"]
-DEFAULT_CATEGORY = "Overall"
 category_to_column = {
-    "Overall": "Overall",
-    "MLE-Lite": "MLE_Lite_Elo",
     "Tabular": "Tabular_Elo",
     "NLP": "NLP_Elo",
     "CV": "CV_Elo",
 }
-def update_leaderboard(category: str) -> pd.DataFrame:
-    col = category_to_column.get(category, category_to_column[DEFAULT_CATEGORY])
-    df = master_df[["model_name", "url", "organizer", "license", col]].copy()
-    df.sort_values(by=col, ascending=False, inplace=True)
-    df.reset_index(drop=True, inplace=True)
-    df.insert(0, "Rank", df.index + 1)
-    df["Model"] = df.apply(
-        lambda r: f"<a href='{r['url']}' target='_blank'>{r['model_name']}</a>", axis=1
-    )
-    df.rename(
-        columns={col: "Elo Score", "organizer": "Organizer", "license": "License"},
-        inplace=True,
-    )
-    return df[["Rank", "Model", "Organizer", "License", "Elo Score"]]
-# ---------- Dark-Theme CSS ----------
-dark_css = """
-/* ---- Google Font & Font Awesome ---- */
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
-body {
-    font-family: 'Inter', sans-serif;
-    background-color: #121212;
-    color: #e0e0e0;
-    font-size: 15px;
-}
-/* ---- Hero Section ---- */
-.hero-section {
-    background: linear-gradient(135deg, #333, #222);
-    color: #e0e0e0;
-    padding: 1.75rem 1rem;
-    border-radius: .75rem;
-    margin-bottom: 1.5rem;
-    text-align: center;
-    box-shadow: 0 4px 12px rgba(0,0,0,0.55);
-}
-.hero-section h1 {
-    margin: 0;
-    font-size: 2.2rem;
-    font-weight: 700;
-    display: inline-flex;
-    align-items: center;
-    gap: .5rem;
-}
-.hero-section h1 i {          /* 奖杯与文字同行 & 对齐 */
-    margin: 0;
-    font-size: 1em;
-}
-.hero-section h2 {
-    margin: .6rem 0 0;
-    font-size: 1.15rem;
-    font-weight: 400;
-    opacity: .8;
-}
-/* ---- Tabs ---- */
-.tab-buttons button {
-    border-radius: 20px !important;
-    padding: .55rem 1.15rem !important;
-    margin-right: .6rem !important;
-    background: #1e1e1e !important;
-    color: #e0e0e0 !important;
-    border: none !important;
-    font-size: .95rem !important;
-    font-weight: 500 !important;
-    transition: background .28s;
-}
-.tab-buttons button:hover     { background: #2c2c2c !important; }
-.tab-buttons button[aria-selected="true"] {
-    background: #444 !important;
-    color: #fff !important;
-}
-/* ---- Category Selector ---- */
-#category-selector label {
-    display: inline-block;
-    padding: .55rem 1.2rem;
-    margin-right: .5rem;
-    border-radius: 999px;
-    background: #1d1d1d;
-    cursor: pointer;
-    transition: background .28s, color .28s;
-    font-weight: 600;
-    font-size: .95rem;
-    color: #e0e0e0;
-}
-#category-selector input[type="radio"]:checked + label {
-    background: #3d3d3d;
-    color: #fff;
-}
-/* ---- Dataframe / Leaderboard ---- */
-.dataframe-container {
-    max-height: 420px;
-    overflow-y: auto;
-}
-.dataframe-container table {
-    width: 100%;
-    border-collapse: collapse;
-    border: none;
-    box-shadow: 0 2px 6px rgba(0,0,0,.55);
-    border-radius: .55rem;
-}
-.dataframe-container thead th {
-    background: #272727;
-    color: #e0e0e0;
-    font-weight: 600;
-    padding: .85rem 1rem;
-    font-size: .9rem;
-}
-.dataframe-container tbody tr:nth-child(odd)  { background: #1c1c1c; }
-.dataframe-container tbody tr:nth-child(even) { background: #222;   }
-.dataframe-container td, .dataframe-container th {
-    padding: .8rem 1rem;
-    font-size: .88rem;
-}
-.dataframe-container td a {
-    color: #8ab4f8;
-    text-decoration: none;
-}
-.dataframe-container td a:hover {
-    color: #a3c9ff;
-    text-decoration: underline;
-}
-"""
-custom_css += dark_css
-# ---------- Override Title ----------
-TITLE = """
-<div class="hero-section">
-  <h1><i class="fas fa-trophy"></i>MLE-Dojo Benchmark Leaderboard</h1>
-  <h2>Improving LLM Agents for Machine Learning Engineering</h2>
-</div>
-"""
-# ---------- Build Gradio App ----------
-demo = gr.Blocks(css=custom_css, theme=gr.themes.Base())
 with demo:
-    # Inject Font Awesome (so the trophy icon is available)
-    gr.HTML(
-        """
-<link rel="stylesheet"
-      href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css"
-      crossorigin="anonymous" referrerpolicy="no-referrer"/>
-"""
-    )
-    # -------- Header & Intro --------
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    # -------- Tabs --------
-    with gr.Tabs(elem_classes="tab-buttons"):
-        # --- Leaderboard Tab ---
-        with gr.TabItem("📊 Leaderboard"):
-            gr.Markdown("### Model Elo Rankings by Category")
-            category_selector = gr.Radio(
-                choices=CATEGORIES,
-                value=DEFAULT_CATEGORY,
-                interactive=True,
-                elem_id="category-selector",
-                label="Select Category:",
-            )
-            leaderboard_df = gr.Dataframe(
-                value=update_leaderboard(DEFAULT_CATEGORY),
-                headers=["Rank", "Model", "Organizer", "License", "Elo Score"],
-                datatype=["number", "html", "str", "str", "number"],
-                interactive=False,
-                row_count=(len(master_df), "fixed"),
-                col_count=(5, "fixed"),
-                wrap=True,
-                elem_id="leaderboard-table",
-            )
-            category_selector.change(
-                fn=update_leaderboard,
-                inputs=category_selector,
-                outputs=leaderboard_df,
             )
-        # --- About Tab ---
-        with gr.TabItem("ℹ️ About"):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-    # -------- Citation --------
-    with gr.Accordion("📙 Citation", open=False):
-        gr.Textbox(
-            value=CITATION_BUTTON_TEXT,
-            label=CITATION_BUTTON_LABEL,
-            lines=10,
-            elem_id="citation-button",
-            show_copy_button=True,
-        )
-if __name__ == "__main__":
-    print("Launching Gradio App in Dark Mode…")
-    demo.launch()

 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+# Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard
+# from huggingface_hub import snapshot_download, HfApi
+from src.about import ( # Assuming these still exist and are relevant for other tabs
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css # Keep custom CSS
+# Removed utils imports related to the old leaderboard
+# from src.display.utils import (...)
+from src.envs import REPO_ID # Keep if needed for restart_space or other functions
+# Removed constants related to old data paths and repos if not needed elsewhere
+# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+# Removed old data processing functions
+# from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval # Keep submission logic
+# --- Elo Leaderboard Configuration ---
+# Data from the table provided by the user
 data = [
+    {'model': 'gpt-4o-mini', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778},
+    {'model': 'gpt-4o', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841},
+    {'model': 'o3-mini', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096},
+    # Renamed 'DeepSeek-v3' to match previous list - adjust if needed
+    {'model': 'deepseek-v3', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023},
+    # Renamed 'DeepSeek-r1' to match previous list - adjust if needed
+    {'model': 'deepseek-r1', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100},
+    # Renamed 'Gemini-2.0-Flash' to match previous list - adjust if needed
+    {'model': 'gemini-2.0-flash', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895},
+    # Renamed 'Gemini-2.0-Pro' to match previous list - adjust if needed
+    {'model': 'gemini-2.0-pro', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054},
+    # Renamed 'Gemini-2.5-Pro' to match previous list - adjust if needed
+    {'model': 'gemini-2.5-pro', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214},
 ]
+# Create a master DataFrame
 master_df = pd.DataFrame(data)
+# Define categories for selection (user-facing)
+CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV", "Overall"]
+DEFAULT_CATEGORY = "Overall" # Set a default category
+# Map user-facing categories to DataFrame column names
 category_to_column = {
+    "MLE-Lite": "MLE-Lite_Elo",
     "Tabular": "Tabular_Elo",
     "NLP": "NLP_Elo",
     "CV": "CV_Elo",
+    "Overall": "Overall"
 }
+# --- Helper function to update leaderboard ---
+def update_leaderboard(category):
+    """
+    Return the 'model' column and the score column for `category` (renamed to
+    'Elo Score'), sorted by score descending; invalid categories fall back to DEFAULT_CATEGORY.
+    """
+    score_column = category_to_column.get(category)
+    if score_column is None or score_column not in master_df.columns:
+        # Unknown category or missing column: warn on stdout and use DEFAULT_CATEGORY's column
+        print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.")
+        score_column = category_to_column[DEFAULT_CATEGORY]
+        if score_column not in master_df.columns: # The default's column can be missing as well
+             return pd.DataFrame({"Model": [], "Elo Score": []}) # Empty frame; NOTE(review): header is "Model" here but 'model' on the normal path
+    # Copy the two columns so the in-place operations below never touch master_df
+    df = master_df[['model', score_column]].copy()
+    # Every category is presented under the same 'Elo Score' header
+    df.rename(columns={score_column: 'Elo Score'}, inplace=True)
+    # Highest score first
+    df.sort_values(by='Elo Score', ascending=False, inplace=True)
+    # Drop the original row labels so the index follows the sorted order
+    df.reset_index(drop=True, inplace=True)
+    return df
+# --- Mock/Placeholder functions/data for other tabs ---
+# (Empty placeholder DataFrames; no queue data is fetched)
+print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.")
+finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
+running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
+pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"])
+EVAL_COLS = ["Model", "Status", "Requested", "Started"] # Define for the dataframe headers
+EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types
+# --- Keep restart function if relevant ---
+# (Same as previous version)
+def restart_space():
+    print(f"Attempting to restart space: {REPO_ID}")
+    # Placeholder: only the message above is printed; no restart call is made here
+# --- Gradio App Definition ---
+demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Column():
+                gr.Markdown("## Model Elo Rankings") # New title for the section
+                category_selector = gr.Radio(
+                    choices=CATEGORIES,
+                    label="Select Category to Sort By", # Updated label
+                    value=DEFAULT_CATEGORY, # Default selection
+                    interactive=True,
+                    container=False,
+                )
+                leaderboard_df_component = gr.Dataframe(
+                    # Initialize with sorted data for the default category
+                    value=update_leaderboard(DEFAULT_CATEGORY),
+                    headers=["Model", "Elo Score"],
+                    datatype=["str", "number"],
+                    interactive=False,
+                    # Adjust row count based on the number of models
+                    row_count=(len(master_df), "fixed"),
+                    col_count=(2, "fixed"),
+                )
+                # Link the radio button change to the update function
+                category_selector.change(
+                    fn=update_leaderboard,
+                    inputs=category_selector,
+                    outputs=leaderboard_df_component
+                )
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            # (Content unchanged)
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            # (Content unchanged, still uses potentially empty/mock queue data)
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                           finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                           )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+            with gr.Row():
+                 # Submission form - kept as is
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=["Type A", "Type B", "Type C"], # Example choices
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
+                    )
+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=["float16", "bfloat16", "float32", "int8"], # Example choices
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=["Original", "Adapter", "Delta"], # Example choices
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
             )
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+             # (Content unchanged)
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
+# --- Keep scheduler if relevant ---
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800) # Restart every 30 mins
+# scheduler.start()
+# --- Launch the app ---
+demo.launch()