Spaces:

ServiceNow
/

browsergym-leaderboard

Running

File size: 19,284 Bytes

import json
import re
import os
import streamlit as st
import requests
import pandas as pd
from io import StringIO
import plotly.graph_objs as go
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import streamlit.components.v1 as components
from datetime import datetime

from urllib.parse import quote
from pathlib import Path
import re
import html
from typing import Dict, Any

# Benchmarks shown on the leaderboard. main() creates one tab per entry (in this
# order) and results are read from "<benchmark name lowercased>.json".
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "AssistantBench"]

def sanitize_agent_name(agent_name):
    """Validate an agent name and return it unchanged.

    A valid name is non-empty, starts with an ASCII letter, digit, hyphen or
    underscore, and continues with those characters or dots.

    Args:
        agent_name: Candidate agent (directory) name.

    Returns:
        The same ``agent_name`` when it is valid.

    Raises:
        ValueError: If the name is not a string, starts with a dot, or
            contains any character outside the allowed set.
    """
    # Callers only catch ValueError, so report non-strings the same way.
    if not isinstance(agent_name, str):
        raise ValueError("Invalid agent name format")

    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")

    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which would let "name\n" through.
    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name

def safe_path_join(*parts):
    """Join ``parts`` under the ``results`` directory and return the resolved path.

    Args:
        *parts: Path components to append to the resolved ``results`` directory.

    Returns:
        The resolved ``Path``; it is either ``results`` itself or located below it.

    Raises:
        ValueError: If the components cannot be joined/resolved, or if the
            resolved path lies outside ``results``.
    """
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
    except (OSError, RuntimeError, TypeError, ValueError) as exc:
        raise ValueError("Invalid path") from exc

    # Compare path components, not string prefixes: a prefix test would accept
    # a sibling such as "results_evil" as being inside "results".
    if path != base and base not in path.parents:
        raise ValueError("Path traversal detected")
    return path

def sanitize_column_name(col: str) -> str:
    """Return ``col`` converted to text and HTML-escaped, for use as a table header."""
    header_text = str(col)
    return html.escape(header_text)

def sanitize_cell_value(value: Any) -> str:
    """Convert a table cell value into HTML-safe markup.

    Args:
        value: The cell value. Numbers are rendered with ``str``; a string
            containing ``±`` is treated as "score ± standard error" and the
            error part is wrapped in a smaller, lighter ``<span>``; anything
            else is converted to text.

    Returns:
        Markup for the cell. All text originating from ``value`` is
        HTML-escaped; only the ``<span>`` wrapper added here is raw HTML.
    """
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str) and '±' in value:
        # partition splits on the first "±" only, so extra "±" characters stay
        # in the error part instead of breaking tuple unpacking.
        score, _, std_err = value.partition('±')
        # Escape both halves: this branch is reachable from free-text cells,
        # so it must not let markup through.
        safe_score = html.escape(score.strip())
        safe_std_err = html.escape(std_err.strip())
        return f'{safe_score} <span style="font-size: smaller; color: var(--lighter-color);">±{safe_std_err}</span>'
    return html.escape(str(value))

def create_html_table_main(df):
    """Render the main leaderboard as an HTML table with sort controls.

    Draws a "Sort by" select box (defaulting to the "WebArena" column) and an
    ascending/descending radio button, sorts ``df`` accordingly and returns
    the table markup. Cells in the "Agent" column are inserted as-is; every
    other cell goes through ``sanitize_cell_value``.

    Args:
        df: DataFrame to display; it must contain a "WebArena" column.

    Returns:
        A string holding the ``<style>`` block and the HTML table.
    """
    column_names = df.columns.tolist()
    picker_area, order_area = st.columns([2,6])
    with picker_area:
        sort_column = st.selectbox("Sort by", column_names, index=column_names.index("WebArena"), key="main_sort_column")
    with order_area:
        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")

    def to_sort_key(cell):
        # "-" marks a missing score and is ranked below every number.
        if cell == "-":
            return float('-inf')
        try:
            return float(cell)
        except ValueError:
            return cell

    df = df.sort_values(
        by=sort_column,
        ascending=(sort_order == "Ascending"),
        key=lambda series: series.apply(to_sort_key),
    )

    stylesheet = '''
    <style>
        table {
            width: 100%;
            border-collapse: collapse;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: center;
        }
        th {
            font-weight: bold;
        }
        .table-container {
            padding-bottom: 20px;
        }
    </style>
    '''
    pieces = [stylesheet, '<div class="table-container">', '<table>', '<thead><tr>']
    pieces.extend(f'<th>{sanitize_column_name(name)}</th>' for name in df.columns)
    pieces.append('</tr></thead>')
    pieces.append('<tbody>')
    for _, record in df.iterrows():
        pieces.append('<tr>')
        for name in df.columns:
            # The "Agent" column is written out without further escaping.
            cell = record[name] if name == "Agent" else sanitize_cell_value(record[name])
            pieces.append(f'<td>{cell}</td>')
        pieces.append('</tr>')
    pieces.append('</tbody></table>')
    pieces.append('</div>')
    return "".join(pieces)

def create_html_table_benchmark(df, benchmark):
    """Render one benchmark's results as an HTML table with sort controls.

    Draws a "Sort by" select box (defaulting to "Score") and an
    ascending/descending radio button, sorts ``df`` accordingly and returns
    the table markup. The "std_err" column is merged into the "Score" cell,
    and "Reproduced_all" is shown as the expandable details of the
    "Reproduced" cell; neither gets a column of its own.

    Args:
        df: DataFrame to display; it must contain the "Score", "std_err",
            "Reproduced" and "Reproduced_all" columns.
        benchmark: Benchmark name, used to make the widget keys unique.

    Returns:
        A string holding the ``<style>`` block and the HTML table.
    """
    hidden_columns = ("Reproduced_all", "std_err")
    column_names = df.columns.tolist()
    col1, col2 = st.columns([2,6])
    with col1:
        sort_column = st.selectbox("Sort by", column_names, index=column_names.index("Score"), key=f"benchmark_sort_column_{benchmark}")
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")

    def to_number(cell):
        """Return ``cell`` as a float (-inf for the "-" placeholder) or None if it is not numeric."""
        if cell == "-":
            return float('-inf')
        try:
            return float(cell)
        except (TypeError, ValueError):
            # TypeError covers list cells (e.g. "Reproduced_all").
            return None

    def sort_key(series):
        # Sort numerically only when every cell is numeric. Otherwise compare
        # all cells as text: mixing floats (from "-") with strings makes
        # pandas raise TypeError while sorting.
        numeric = series.map(to_number)
        if numeric.notna().all():
            return numeric.astype(float)
        return series.map(str)

    df = df.sort_values(by=sort_column, ascending=(sort_order == "Ascending"), key=sort_key)

    stylesheet = '''
    <style>
        table {
            width: 100%;
            border-collapse: collapse;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: center;
        }
        th {
            font-weight: bold;
        }
        .table-container {
            padding-bottom: 20px;
        }
    </style>
    '''
    visible_columns = [column for column in df.columns if column not in hidden_columns]

    pieces = [stylesheet, '<div class="table-container">', '<table>', '<thead><tr>']
    pieces.extend(f'<th>{sanitize_column_name(column)}</th>' for column in visible_columns)
    pieces.append('</tr></thead>')
    pieces.append('<tbody>')
    for _, row in df.iterrows():
        pieces.append('<tr>')
        for column in visible_columns:
            if column == "Reproduced" and row[column] != "-":
                # Show the score range, expandable to the individual runs.
                summary = sanitize_cell_value(row[column])
                details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                pieces.append(f'<td><details><summary>{summary}</summary>{details}</details></td>')
            elif column == "Score" and row[column] != "-":
                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
                pieces.append(f'<td>{sanitize_cell_value(score_with_std_err)}</td>')
            else:
                # Includes a missing score, shown as a plain "-" rather than "- ±-".
                pieces.append(f'<td>{sanitize_cell_value(row[column])}</td>')
        pieces.append('</tr>')
    pieces.append('</tbody></table>')
    pieces.append('</div>')
    return "".join(pieces)

def check_sanity(agent):
    """Check that an agent's result files are well-formed.

    For every benchmark in ``BENCHMARKS`` whose ``<benchmark lowercased>.json``
    file exists under ``results/<agent>/``, the file must hold a JSON list of
    objects where each object has all required keys, names this agent and this
    benchmark, and carries numeric ``score``/``std_err`` values. Exactly one
    entry per file must be marked "Original". Missing files are skipped.

    Args:
        agent: Name of the agent directory inside ``results``.

    Returns:
        True if every existing result file passes the checks, False otherwise
        (including an invalid agent name, an unsafe path, or unparsable JSON).
    """
    required_keys = (
        "agent_name", "benchmark", "original_or_reproduced", "score", "std_err",
        "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
        "reproducible", "comments", "study_id", "date_time",
    )

    def is_number(value):
        # bool is a subclass of int but is not a meaningful score.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    try:
        safe_agent = sanitize_agent_name(agent)
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
            # Anything other than a list of objects is malformed; reject it
            # here rather than failing with TypeError further down.
            if not isinstance(results, list):
                return False
            original_count = 0
            for result in results:
                if not isinstance(result, dict):
                    return False
                if not all(key in result for key in required_keys):
                    return False
                if result["agent_name"] != agent:
                    return False
                if result["benchmark"] != benchmark:
                    return False
                # main() formats these with float format specs, which fail on
                # non-numeric values.
                if not (is_number(result["score"]) and is_number(result["std_err"])):
                    return False
                if result["original_or_reproduced"] == "Original":
                    original_count += 1
            if original_count != 1:
                return False
        return True
    except ValueError:
        # Raised by the name/path validators and by json/utf-8 decoding errors.
        return False

# Columns of the per-benchmark table, in display order. "std_err" and
# "Reproduced_all" are consumed by create_html_table_benchmark instead of being
# shown as columns of their own.
_BENCHMARK_COLUMNS = [
    "Agent",
    "Score",
    "std_err",
    "Benchmark Specific",
    "Benchmark Tuned",
    "Followed Evaluation Protocol",
    "Reproducible",
    "Comments",
    "Study ID",
    "Date",
    "Reproduced",
    "Reproduced_all",
]

# Markdown shown on the "About" tab.
_ABOUT_MARKDOWN = '''
# BrowserGym Leaderboard

This leaderboard tracks performance of various agents on web navigation tasks.

## How to Submit Results for New Agents

### 1. Create Results Directory
Create a new folder in the `results` directory with your agent's name:
```bash
results/
└── your-agent-name/
    ├── README.md
    ├── webarena.json
    ├── workarena-l1.json
    ├── workarena-l2.json
    ├── workarena-l3.json
    └── miniwob.json
```


### 2. Add Agent Details

Create a `README.md` in your agent's folder with the following details:

#### Required Information
- **Model Name**: Base model used (e.g., GPT-4, Claude-2)
- **Model Architecture**: Architecture details and any modifications
- **Input/Output Format**: How inputs are processed and outputs generated
- **Training Details**: Training configuration if applicable
  - Dataset used
  - Number of training steps
  - Hardware used
  - Training time

#### Optional Information
- **Paper Link**: Link to published paper/preprint if available
- **Code Repository**: Link to public code implementation
- **Additional Notes**: Any special configurations or requirements
- **License**: License information for your agent

Make sure to organize the information in clear sections using Markdown.

### 3. Add Benchmark Results

Create separate JSON files for each benchmark following this format:

```json
[
    {
        "agent_name": "your-agent-name",
        "study_id": "unique-study-identifier-from-agentlab", 
        "date_time": "YYYY-MM-DD HH:MM:SS",
        "benchmark": "WebArena",
        "score": 0.0,
        "std_err": 0.0,
        "benchmark_specific": "Yes/No",
        "benchmark_tuned": "Yes/No",
        "followed_evaluation_protocol": "Yes/No", 
        "reproducible": "Yes/No",
        "comments": "Additional details",
        "original_or_reproduced": "Original"
    }
]
```

Please add all the benchmark files in separate json files named as follows:

- `webarena.json`
- `workarena-l1.json`
- `workarena-l2.json`
- `workarena-l3.json`
- `miniwob.json`

Each file must contain a JSON array with a single object following the format above. The benchmark field in each file must match the benchmark name exactly ([`WebArena`, `WorkArena-L1`, `WorkArena-L2`, `WorkArena-L3`, `MiniWoB`]) and benchmark_lowercase.json as the filename.

### 4. Submit PR

1. Open the community tab and press "New Pull Request"
2. Give it a new title to the PR and follow the steps mentioned
3. Publish the branch 

## How to Submit Reproducibility Results for Existing Agents

Open the results file for the agent and benchmark you reproduced the results for.

### 1. Add reproduced results


Append the following entry in the json file. Ensure you set `original_or_reproduced` as `Reproduced`.

```json
[
    {
        "agent_name": "your-agent-name",
        "study_id": "unique-study-identifier-from-agentlab", 
        "date_time": "YYYY-MM-DD HH:MM:SS",
        "benchmark": "WebArena",
        "score": 0.0,
        "std_err": 0.0,
        "benchmark_specific": "Yes/No",
        "benchmark_tuned": "Yes/No",
        "followed_evaluation_protocol": "Yes/No", 
        "reproducible": "Yes/No",
        "comments": "Additional details",
        "original_or_reproduced": "Reproduced"
    }
]
```

### 2. Submit PR

1. Open the community tab and press "New Pull Request"
2. Give it a new title to the PR and follow the steps mentioned
3. Publish the branch

## License

MIT
'''


def _format_timestamp(raw):
    """Convert "YYYY-MM-DD HH:MM:SS" into e.g. "January 05, 2025 03:04 PM"."""
    return datetime.strptime(raw, "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")


def _load_all_results():
    """Load every agent's result entries from the ``results`` directory.

    Returns a dict mapping agent name to the concatenated list of entries from
    that agent's benchmark files. Agents failing ``check_sanity`` are reported
    with ``st.error`` and left out.
    """
    all_results = {}
    # sorted() keeps the initial row order independent of the file system.
    for agent in sorted(os.listdir("results")):
        # Agents are directories; stray files next to them are not agents.
        if not os.path.isdir(os.path.join("results", agent)):
            continue
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            with open(file_path, encoding="utf-8") as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results
    return all_results


def _leaderboard_rows(results):
    """Build one row per agent holding its "Original" score per benchmark ("-" if none)."""
    rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        for benchmark in BENCHMARKS:
            row[benchmark] = next(
                (
                    entry["score"]
                    for entry in entries
                    if entry["benchmark"] == benchmark and entry["original_or_reproduced"] == "Original"
                ),
                "-",
            )
        rows.append(row)
    return rows


def _benchmark_rows(results, benchmark):
    """Build one row per agent with the details of its results on ``benchmark``.

    Fields default to "-" when the agent has no "Original" entry for the
    benchmark. "Reproduced" becomes "<min> - <max>" over the reproduced scores
    (or "-"), and "Reproduced_all" lists "<score>, <date>" for each of them.
    """
    rows = []
    for agent, entries in results.items():
        # Start from the placeholders so the row is complete even when the
        # agent has no entries at all, and so reproduced runs are kept no
        # matter where they appear relative to the "Original" entry.
        row = {column: "-" for column in _BENCHMARK_COLUMNS}
        row["Agent"] = agent
        row["Reproduced_all"] = []
        reproduced_scores = []
        for entry in entries:
            if entry["benchmark"] != benchmark:
                continue
            if entry["original_or_reproduced"] == "Original":
                row["Score"] = entry["score"]
                row["std_err"] = entry["std_err"]
                row["Benchmark Specific"] = entry["benchmark_specific"]
                row["Benchmark Tuned"] = entry["benchmark_tuned"]
                row["Followed Evaluation Protocol"] = entry["followed_evaluation_protocol"]
                row["Reproducible"] = entry["reproducible"]
                row["Comments"] = entry["comments"]
                row["Study ID"] = entry["study_id"]
                row["Date"] = _format_timestamp(entry["date_time"])
            elif entry["original_or_reproduced"] == "Reproduced":
                reproduced_scores.append(entry["score"])
                row["Reproduced_all"].append(
                    ", ".join([str(entry["score"]), _format_timestamp(entry["date_time"])])
                )
        if reproduced_scores:
            row["Reproduced"] = str(min(reproduced_scores)) + " - " + str(max(reproduced_scores))
        rows.append(row)
    return rows


def _agent_hyperlink(agent_name):
    """Return an HTML link to the agent's README in the Space, or "" for an invalid name."""
    try:
        safe_name = sanitize_agent_name(agent_name)
    except ValueError:
        return ""
    safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
    return f'<a href="{html.escape(safe_url)}" target="_blank">{html.escape(safe_name)}</a>'


def main():
    """Build the Streamlit page: main leaderboard, one tab per benchmark, and an About tab."""
    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
    # Colour used for the "± std_err" part of score cells, per colour scheme.
    st.markdown("""
        <style>
        :root {
            --lighter-color: #888; /* Default for light theme */
        }
        @media (prefers-color-scheme: dark) {
            :root {
                --lighter-color: #ccc; /* Default for dark theme */
            }
        }
        </style>
    """, unsafe_allow_html=True)

    # NOTE(review): browsers ignore `frame-ancestors` and X-Frame-Options when
    # delivered through <meta>; confirm whether these need to be sent as real
    # HTTP response headers instead.
    st.markdown("""
        <head>
            <meta http-equiv="Content-Security-Policy" 
                content="default-src 'self' https://huggingface.co;
                        script-src 'self' 'unsafe-inline';
                        style-src 'self' 'unsafe-inline';
                        img-src 'self' data: https:;
                        frame-ancestors 'none';">
            <meta http-equiv="X-Frame-Options" content="DENY">
            <meta http-equiv="X-Content-Type-Options" content="nosniff">
            <meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
        </head>
    """, unsafe_allow_html=True)

    all_results = _load_all_results()

    st.title("🏆 BrowserGym Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
    tabs = st.tabs(["🏆 Main Leaderboard",] +  BENCHMARKS + ["📝 About"])

    with tabs[0]:
        # Explicit columns keep the frame usable when there are no agents yet.
        df = pd.DataFrame(_leaderboard_rows(all_results), columns=["Agent", *BENCHMARKS])

        for benchmark in BENCHMARKS:
            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
            df[benchmark] = df[benchmark].astype(str)

        search_query = st.text_input("Search agents", "", key="search_main")

        if search_query:
            # regex=False: the query is plain user text, and characters such
            # as "(" would otherwise raise a regex error.
            matches = df['Agent'].str.contains(search_query, case=False, regex=False)
            df = df[matches].copy()

        # Keep the plain agent names for the CSV before they become links.
        export_df = df.copy()

        df['Agent'] = df['Agent'].apply(_agent_hyperlink)
        html_table = create_html_table_main(df)
        st.markdown(html_table, unsafe_allow_html=True)

        if st.button("Export to CSV", key="export_main"):
            csv_data = export_df.to_csv(index=False)

            st.download_button(
                label="Download CSV",
                data=csv_data,
                file_name="leaderboard.csv",
                key="download-csv",
                help="Click to download the CSV file",
            )

    with tabs[-1]:
        st.markdown(_ABOUT_MARKDOWN)

    for i, benchmark in enumerate(BENCHMARKS, start=1):
        with tabs[i]:
            df_ = pd.DataFrame(_benchmark_rows(all_results, benchmark), columns=_BENCHMARK_COLUMNS)
            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
            df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
            df_['Score'] = df_['Score'].astype(str)
            html_table = create_html_table_benchmark(df_, benchmark)
            st.markdown(html_table, unsafe_allow_html=True)
                
        
# Build the page only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()