Spaces:

osunlp
/

Online_Mind2Web_Leaderboard

Running

File size: 12,590 Bytes

import os
import gradio as gr
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from apscheduler.schedulers.background import BackgroundScheduler
import numpy as np

from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

# Value of the TOKEN environment variable, or None when it is unset.
TOKEN = os.environ.get("TOKEN", None)

# NOTE(review): none of TOKEN, OWNER, YEAR_VERSION or LOCAL_DEBUG is referenced
# in the visible part of this file — confirm they are still needed.
OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True

def _format_sr_column(series: pd.Series) -> pd.Series:
    """Render a success-rate column as strings with one decimal place.

    Cells that parse as numbers become e.g. ``"61.3"``; missing cells become
    the empty string; cells that hold a non-numeric value (such as a textual
    placeholder) are passed through unchanged.
    """
    as_numbers = pd.to_numeric(series, errors="coerce")
    formatted = as_numbers.map(
        lambda value: "" if pd.isna(value) else f"{value:.1f}"
    )

    # Present in the input but not parseable as a number: keep the original cell.
    unparseable = series.notna() & as_numbers.isna()
    formatted[unparseable] = series[unparseable]
    return formatted

def get_dataframe_from_results(eval_path):
    """Load a leaderboard CSV, rank its rows, and format the success-rate columns.

    Rows are ordered by "Average SR" from highest to lowest. When the CSV has
    a "Verified" column it becomes the primary sort key (descending), with
    "Average SR" breaking ties inside each group.

    Args:
        eval_path: Path of the CSV file to read. It must contain an
            "Average SR" column.

    Returns:
        The sorted DataFrame, in which the 'Easy', 'Medium', 'Hard' and
        'Average SR' columns (those that exist) have been converted to
        display strings by ``_format_sr_column``.
    """
    df = pd.read_csv(eval_path)

    def _numeric_sr(column: pd.Series) -> pd.Series:
        # A single non-numeric cell makes read_csv load the whole column as
        # strings, which would rank rows lexicographically ("9.0" > "61.3").
        # Rank on the parsed numbers instead; unparseable cells become NaN
        # and are placed last by sort_values.
        if column.name == "Average SR":
            return pd.to_numeric(column, errors="coerce")
        return column

    sort_columns = ["Average SR"]
    if "Verified" in df.columns:
        sort_columns.insert(0, "Verified")

    # Stable sort so rows with equal keys keep their order from the CSV.
    df = df.sort_values(
        by=sort_columns,
        ascending=False,
        kind="mergesort",
        key=_numeric_sr,
    )

    for col in ("Easy", "Medium", "Hard", "Average SR"):
        if col in df.columns:
            df[col] = _format_sr_column(df[col])
    return df

# Both leaderboards are read once at import time from CSV files located in the
# current working directory.
auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
# Passed as `datatype` to every gr.Dataframe created in this file.
# NOTE(review): presumably one entry per displayed column — the same 12-entry
# list is used for both tables; verify it matches the CSV column order.
TYPES = ["markdown", "str", "markdown", "markdown", "number", "number", "number", "number", "str", "str", "markdown", "str"]

def df_to_gradio(df: pd.DataFrame) -> dict:
    """Convert a leaderboard DataFrame into a value dict for ``gr.Dataframe``.

    The "Release Time" column, when present, is dropped from the output.
    When a "Verified" column exists, every cell of a row whose "Verified"
    value is anything other than True gets a light-red background; all other
    cells get an empty style string.

    Args:
        df: Leaderboard table.

    Returns:
        A dict with keys "data" (list of row lists), "headers" (column
        names) and "metadata" (``{"styling": ...}``, a grid of CSS strings
        with the same shape as "data").
    """
    display_df = df.drop(columns=["Release Time"], errors="ignore")
    headers = display_df.columns.tolist()
    data = display_df.values.tolist()

    unverified_css = "background-color:#ffcccc"
    if "Verified" in display_df.columns:
        verified_idx = headers.index("Verified")
        # Compare against True rather than relying on truthiness: a missing
        # value (NaN) is truthy but must still count as unverified, matching
        # the `== True` / `!= True` checks used elsewhere in this file.
        styling = [
            [unverified_css if row[verified_idx] != True else ""] * len(headers)  # noqa: E712
            for row in data
        ]
    else:
        styling = [[""] * len(headers) for _ in data]
    return {"data": data, "headers": headers, "metadata": {"styling": styling}}

def gradio_plot_wrapper(json_file):
    """Build the heatmap figure for a file object that exposes its path as ``.name``."""
    uploaded_path = json_file.name
    figure = plot_heatmap_with_performance_bar(uploaded_path)
    return figure

def style_auto_df(df: pd.DataFrame):
    def _row_style(row):
        bg = "background-color: #ffcccc" if row["Verified"] != True else ""
        return [bg] * len(row)

    styler = df.style.apply(_row_style, axis=1)
    try:
        styler = styler.hide(axis="index")
    except Exception:
        pass
    return styler

def nice_bounds(low: float, high: float) -> tuple[float, float]:
    """Expand ``[low, high]`` outward to the nearest multiples of 10.

    When both ends are equal the interval is first widened by 1 on each
    side, so the result is never a zero-height range.
    """
    if low == high:
        low, high = low - 1, high + 1
    lower_edge = np.floor(low / 10) * 10
    upper_edge = np.ceil(high / 10) * 10
    return (lower_edge, upper_edge)

def extract_agent_name(agent_str: str) -> str:
    """Return the link text when *agent_str* starts with ``[Name]``, else *agent_str* itself."""
    import re

    # Anchored at the start of the string; the brackets must enclose at least one character.
    bracketed = re.match(r'\[([^\]]+)\]', agent_str)
    return bracketed.group(1) if bracketed else agent_str

def plot_sr_vs_time(df: pd.DataFrame, title: str = "Success rate over time") -> go.Figure:
    """Scatter each agent's average success rate against its release date.

    Only rows whose "Verified" value equals True are plotted; if the table
    has no "Verified" column, all rows are plotted. Rows whose "Release Time"
    or "Average SR" cannot be parsed are dropped. The background is split
    into three equal-height coloured bands between the rounded y-axis
    bounds, and a dashed least-squares trend line is added when the data
    covers at least two distinct dates.

    Args:
        df: Leaderboard table with "Agent", "Release Time" and "Average SR"
            columns ("Average SR" may hold numbers or numeric strings).
        title: Figure title.

    Returns:
        The assembled Plotly figure.
    """
    if "Verified" in df.columns:
        work = df[df["Verified"] == True].copy()  # noqa: E712 - filter out unverified rows
    else:
        # The loader tolerates tables without this column, so do the same here.
        work = df.copy()

    work["Release Time"] = pd.to_datetime(work["Release Time"], errors="coerce")
    work["Average SR"] = pd.to_numeric(work["Average SR"], errors="coerce")
    work = work.dropna(subset=["Release Time", "Average SR"])

    # One evenly spaced hue per distinct agent string.
    agents = work["Agent"].unique().tolist()
    color_map = {a: f"hsl({int(360*i/len(agents))},70%,45%)" for i, a in enumerate(agents)}

    fig = go.Figure()

    if work.empty:
        # min()/max() of an empty column are NaN, which would give the axis a
        # NaN range; fall back to the full 0-100 scale (the hover text
        # presents SR values as percentages).
        y_min, y_max = 0.0, 100.0
    else:
        y_min, y_max = nice_bounds(work["Average SR"].min(), work["Average SR"].max())
    band_edges = np.linspace(y_min, y_max, 4)
    band_cols = ["rgba(226,247,226,0.35)", "rgba(255,255,204,0.35)", "rgba(255,228,225,0.35)"]
    shapes = [
        dict(type="rect", xref="paper", yref="y", x0=0, x1=1, y0=band_edges[i], y1=band_edges[i+1],
             fillcolor=band_cols[i], line_width=0)
        for i in range(3)
    ]

    # One single-point trace per row so every marker carries its own label and colour.
    for _, row in work.iterrows():
        agent_display_name = extract_agent_name(row["Agent"])
        fig.add_trace(
            go.Scatter(
                x=[row["Release Time"]],
                y=[row["Average SR"]],
                mode="markers+text",
                text=[agent_display_name],
                textposition="top center",
                textfont=dict(size=11),
                marker=dict(size=10, color=color_map[row["Agent"]], opacity=0.9),
                hovertemplate="Agent: %{text}<br>SR: %{y:.1f}%<br>Date: %{x|%Y-%m}<extra></extra>",
                showlegend=False,
            )
        )

    # A line fit needs at least two distinct x values; with identical dates
    # the fit is degenerate, so the trend line is skipped.
    x_numeric = work["Release Time"].map(pd.Timestamp.toordinal)
    if x_numeric.nunique() >= 2:
        slope, intercept = np.polyfit(x_numeric, work["Average SR"], 1)
        # A straight line is fully defined by its end points. Drawing between
        # the first and last release date spans the whole data range, whereas
        # sampling on month starts could yield a truncated or empty line.
        trend_x = [work["Release Time"].min(), work["Release Time"].max()]
        trend_y = [slope * ts.toordinal() + intercept for ts in trend_x]
        fig.add_trace(go.Scatter(x=trend_x, y=trend_y, mode="lines", line=dict(color="rgba(0,0,0,0.6)", dash="dash"), name="Trend", hoverinfo="skip"))

    fig.update_layout(
        title=dict(text=title, x=0.5, xanchor="center", font=dict(size=22)),
        xaxis_title="Release Time",
        yaxis_title="Success Rate",
        template="plotly_white",
        width=1500,
        height=700,
        shapes=shapes,
    )
    fig.update_xaxes(dtick="M3", tickformat="%Y-%m", showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
    fig.update_yaxes(range=[y_min, y_max], showspikes=True, spikemode="across", spikecolor="rgba(0,0,0,0.4)", spikethickness=1, spikedash="dot")
    return fig

def plot_heatmap_with_performance_bar(json_file):
    """Draw an agent-by-task success heatmap with a summary bar chart below it.

    The JSON file must contain a non-empty list of task dicts. Each dict has
    a "task_id" key plus one "<agent>_human_label" key per agent; the set of
    agents is taken from the first task only. A label counts as a success
    only when it converts to the integer 1; anything else (including null or
    unparseable values) counts as a failure.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        A two-row Plotly figure: the heatmap (agents sorted by success rate,
        tasks in file order) and a horizontal bar chart with the number of
        tasks solved by any agent and by the best single agent.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    agents = [k for k in data[0].keys() if k.endswith("_human_label")]
    records = []
    original_ids = [task["task_id"] for task in data]

    for task in data:
        task_id = task["task_id"]
        for agent in agents:
            raw_val = task.get(agent, "0")
            try:
                val = int(raw_val)
            except (ValueError, TypeError):
                # TypeError covers JSON null (None) and other non-scalar labels.
                val = 0
            val = 1 if val == 1 else 0
            records.append({
                "Task ID": task_id,
                "Agent": agent.replace("_human_label", ""),
                "Success": val
            })

    df = pd.DataFrame(records)
    pivot = df.pivot_table(index="Agent", columns="Task ID", values="Success", aggfunc="max")

    # Make sure every task has a column, then restore the file's task order.
    for task_id in original_ids:
        if task_id not in pivot.columns:
            pivot[task_id] = 0
    pivot = pivot[original_ids]

    # Sort agents by success rate via a temporary helper column.
    agent_success_rate = pivot.sum(axis=1) / pivot.shape[1]
    pivot["SuccessRate"] = agent_success_rate
    pivot = pivot.sort_values(by="SuccessRate", ascending=False)
    pivot = pivot.drop(columns=["SuccessRate"])

    # Display names for known agent keys; unknown keys are shown unchanged.
    agent_name_map = {
        "Operator": "Operator",
        "Agent-E": "Agent-E",
        "Browser_Use": "Browser Use",
        "Claude_Computer_Use": "Claude Computer Use",
        "SeeAct": "SeeAct"
    }
    sorted_agents = pivot.index.tolist()
    pivot.index = [
        f"{agent_name_map.get(agent, agent)} ({agent_success_rate[agent]*100:.1f}%)"
        for agent in sorted_agents
    ]

    custom_labels = [["Success" if val == 1 else "Failure" for val in row] for row in pivot.values]
    any_agent_solved = pivot.max(axis=0).sum()   # tasks solved by at least one agent
    best_agent_solved = pivot.sum(axis=1).max()  # tasks solved by the strongest agent
    total_tasks = len(original_ids)

    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.8, 0.2],
        vertical_spacing=0.08,
        subplot_titles=("TASK ID", ""),
        shared_xaxes=False
    )

    fig.add_trace(go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale=[[0, "white"], [1, "skyblue"]],
        zmin=0,
        zmax=1,
        showscale=False,
        customdata=custom_labels,
        hovertemplate="Agent: %{y}<br>Task ID: %{x}<br>Completion: %{customdata}<extra></extra>"
    ), row=1, col=1)

    fig.add_trace(go.Bar(
        y=["Any agent", "Best agent"],
        x=[any_agent_solved, best_agent_solved],
        orientation='h',
        marker_color=["dodgerblue", "mediumseagreen"],
        text=[
            f"{int(any_agent_solved)}/{total_tasks} ({any_agent_solved / total_tasks:.1%})",
            f"{int(best_agent_solved)}/{total_tasks} ({best_agent_solved / total_tasks:.1%})"
        ],
        textposition="auto",
        showlegend=False
    ), row=2, col=1)

    # Point-less scatter traces whose only purpose is to create legend entries.
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='skyblue'),
        name='Success'
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='white', line=dict(width=1, color='black')),
        name='Failure'
    ))

    fig.update_xaxes(range=[0, total_tasks], row=2, col=1)
    fig.update_layout(
        height=600,
        xaxis=dict(showticklabels=False),
        yaxis=dict(title="Agent"),
        yaxis2=dict(title=""),
        margin=dict(t=60)
    )
    return fig

def refresh():
    """Re-read both leaderboard CSVs and rebuild the refreshable outputs.

    Returns:
        A 3-tuple: a new ``gr.Dataframe`` for the auto-evaluation table, the
        human-evaluation DataFrame, and the success-rate-over-time figure.
    """
    auto_df = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
    human_df = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    trend_figure = plot_sr_vs_time(auto_df)
    auto_table = gr.Dataframe(
        value=df_to_gradio(auto_df),
        datatype=TYPES,
        interactive=False,
        wrap=False,
        elem_id="auto-leaderboard-table",
    )
    return auto_table, human_df, trend_figure


# Top-level Gradio app.
# NOTE(review): the CSS below targets #human-leaderboard-table, but no component
# in the visible code is given that elem_id (the human table has none, the auto
# table uses "auto-leaderboard-table") — confirm whether the rule is still wanted.
demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")

with demo:
    # Page header and introduction; the text constants come from content.py.
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            ) 

    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    # Tab 1: human-evaluation table plus the task-level heatmap.
    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.Dataframe(
            value=human_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )
        gr.Markdown("### Visualization")
        gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: A task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
        # The heatmap is built once, while the layout is constructed; it is not
        # among the outputs of the Refresh button below.
        fig = plot_heatmap_with_performance_bar("./human_label_111825.json")
        gr.Plot(fig)
        gr.Markdown(EVALUATION_DETAILS)

    # Tab 2: auto-evaluation trend plot and table (unverified rows styled red by df_to_gradio).
    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        sr_time_plot = gr.Plot(plot_sr_vs_time(auto_eval_dataframe_test))
        gr.Markdown('### Agents highlighted in red represent unverified results that may involve unreliable evaluations and are provided for reference only. You can refer to the "Note" column for more details.')
        auto_leaderboard_table_test = gr.Dataframe(value=df_to_gradio(auto_eval_dataframe_test), datatype=TYPES, interactive=False, wrap=False, elem_id="auto-leaderboard-table")


    # Tab 3: static submission instructions.
    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    # The output order here must match the order of the tuple returned by refresh().
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            auto_leaderboard_table_test,
            human_leaderboard_table_test,
            sr_time_plot
        ],
    )



# NOTE(review): the scheduler is started without any job being added in the
# visible code — confirm whether it is still needed.
scheduler = BackgroundScheduler()
scheduler.start()

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True,share=True)