import json
import html
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote

import pandas as pd
import streamlit as st

BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3",
              "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]


def sanitize_agent_name(agent_name):
    """Allow only alphanumeric characters, hyphens, underscores, and non-leading dots."""
    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")
    if not re.match(r"^[a-zA-Z0-9_-][a-zA-Z0-9._-]*$", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name


def safe_path_join(*parts):
    """Join path parts under the results/ directory, rejecting path traversal."""
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
        if not str(path).startswith(str(base)):
            raise ValueError("Path traversal detected")
        return path
    except Exception:
        raise ValueError("Invalid path")


def sanitize_column_name(col: str) -> str:
    """Sanitize a column name for HTML display."""
    return html.escape(str(col))


def sanitize_cell_value(value: Any) -> str:
    """Sanitize a cell value for HTML display, normalizing 'score ± std_err' strings."""
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str) and '±' in value:
        score, std_err = value.split('±', 1)
        return f'{html.escape(score.strip())} ± {html.escape(std_err.strip())}'
    return html.escape(str(value))
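
# Illustrative behaviour of the helpers above (examples only, not executed):
#   sanitize_agent_name("my-agent_v2")              -> "my-agent_v2"
#   sanitize_agent_name("../my-agent")              -> raises ValueError
#   safe_path_join("my-agent_v2", "webarena.json")  -> <cwd>/results/my-agent_v2/webarena.json
#   sanitize_cell_value("23.5 ±1.2")                -> "23.5 ± 1.2"
#   sanitize_cell_value("<script>")                 -> "&lt;script&gt;"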

def create_html_table_main(df):
    """Render the main leaderboard as a sortable HTML table."""
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox("Sort by", df.columns.tolist(),
                                   index=df.columns.tolist().index("WebArena"),
                                   key="main_sort_column")
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1,
                              horizontal=True, key="main_sort_order")

    def get_sort_value(value):
        # Sort numerically when possible; "-" and other non-numeric text sort
        # together below the numbers (tuples keep mixed types comparable).
        try:
            return (1, float(value), "")
        except (TypeError, ValueError):
            return (0, 0.0, str(value))

    df = df.sort_values(by=sort_column, ascending=(sort_order == "Ascending"),
                        key=lambda x: x.apply(get_sort_value))

    # Inline table CSS omitted here.
    table_html = ''
    table_html += '<div>'
    table_html += '<table>'
    table_html += '<thead><tr>'
    for column in df.columns:
        table_html += f'<th>{sanitize_column_name(column)}</th>'
    table_html += '</tr></thead>'
    table_html += '<tbody>'
    for _, row in df.iterrows():
        table_html += '<tr>'
        for col in df.columns:
            if col == "Agent":
                # Agent cells already contain a sanitized <a> link.
                table_html += f'<td>{row[col]}</td>'
            else:
                table_html += f'<td>{sanitize_cell_value(row[col])}</td>'
        table_html += '</tr>'
    table_html += '</tbody></table>'
    table_html += '</div>'
    return table_html
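
# Both table builders return raw HTML strings that main() passes to st.markdown
# with unsafe_allow_html=True. A main-table row renders roughly as (illustrative):
#   <tr><td><a href="...">my-agent</a></td><td>23.50</td><td>-</td></tr>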

def create_html_table_benchmark(df, benchmark):
    """Render a per-benchmark table, folding std_err and reproduced runs into cells."""
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox("Sort by", df.columns.tolist(),
                                   index=df.columns.tolist().index("Score"),
                                   key=f"benchmark_sort_column_{benchmark}")
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], index=1,
                              horizontal=True, key=f"benchmark_sort_order_{benchmark}")

    def get_sort_value(value):
        # Same mixed-type sort key as the main table.
        try:
            return (1, float(value), "")
        except (TypeError, ValueError):
            return (0, 0.0, str(value))

    df = df.sort_values(by=sort_column, ascending=(sort_order == "Ascending"),
                        key=lambda x: x.apply(get_sort_value))

    # Inline table CSS omitted here.
    table_html = ''
    table_html += '<div>'
    table_html += '<table>'
    table_html += '<thead><tr>'
    for column in df.columns:
        # std_err is folded into the Score column; Reproduced_all into Reproduced.
        if column == "Reproduced_all" or column == "std_err":
            continue
        table_html += f'<th>{sanitize_column_name(column)}</th>'
    table_html += '</tr></thead>'
    table_html += '<tbody>'
    for _, row in df.iterrows():
        table_html += '<tr>'
        for column in df.columns:
            if column == "Reproduced":
                if row[column] == "-":
                    table_html += f'<td>{sanitize_cell_value(row[column])}</td>'
                else:
                    # Score range as the summary; individual runs expandable below it.
                    summary = sanitize_cell_value(row[column])
                    details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                    table_html += (f'<td><details><summary>{summary}</summary>'
                                   f'{details}</details></td>')
            elif column == "Reproduced_all" or column == "std_err":
                continue
            elif column == "Score":
                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
                table_html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
            else:
                table_html += f'<td>{sanitize_cell_value(row[column])}</td>'
        table_html += '</tr>'
    table_html += '</tbody></table>'
    table_html += '</div>'
    return table_html

def check_sanity(agent):
    """Check that an agent's result files are well-formed before rendering them."""
    required_keys = [
        "agent_name", "benchmark", "original_or_reproduced", "score", "std_err",
        "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
        "reproducible", "comments", "study_id", "date_time",
    ]
    try:
        safe_agent = sanitize_agent_name(agent)
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            original_count = 0
            with open(file_path) as f:
                results = json.load(f)
            for result in results:
                if not all(key in result for key in required_keys):
                    return False
                if result["agent_name"] != agent:
                    return False
                if result["benchmark"] != benchmark:
                    return False
                if result["original_or_reproduced"] == "Original":
                    original_count += 1
            # Every benchmark file must contain exactly one "Original" entry.
            if original_count != 1:
                return False
        return True
    except ValueError:
        return False


def main():
    st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide",
                       initial_sidebar_state="expanded")
    # Custom page CSS omitted here.
    st.markdown("""
    """, unsafe_allow_html=True)

    all_agents = os.listdir("results")
    all_results = {}
    for agent in all_agents:
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            with open(file_path) as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results

    st.title("🏆 BrowserGym Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")

    tabs = st.tabs(["🏆 Main Leaderboard"] + BENCHMARKS + ["📝 About"])

    with tabs[0]:  # Main leaderboard tab
        def get_leaderboard_dict(results):
            leaderboard_dict = []
            for agent, values in results.items():
                result_dict = {"Agent": agent}
                for benchmark in BENCHMARKS:
                    original_scores = [
                        value["score"] for value in values
                        if value["benchmark"] == benchmark
                        and value["original_or_reproduced"] == "Original"
                    ]
                    result_dict[benchmark] = original_scores[0] if original_scores else "-"
                leaderboard_dict.append(result_dict)
            return leaderboard_dict

        df = pd.DataFrame(get_leaderboard_dict(all_results))

        for benchmark in BENCHMARKS:
            df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
            df[benchmark] = df[benchmark].astype(str)

        # Filter rows by agent name.
        search_query = st.text_input("Search agents", "", key="search_main")
        if search_query:
            df = df[df['Agent'].str.contains(search_query, case=False)]

        def make_hyperlink(agent_name):
            # Link each agent to its README in the leaderboard repository.
            try:
                safe_name = sanitize_agent_name(agent_name)
                safe_url = ("https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard"
                            f"/blob/main/results/{quote(safe_name)}/README.md")
                return f'<a href="{safe_url}">{html.escape(safe_name)}</a>'
            except ValueError:
                return ""

        df['Agent'] = df['Agent'].apply(make_hyperlink)

        st.markdown(create_html_table_main(df), unsafe_allow_html=True)

        if st.button("Export to CSV", key="export_main"):
            # Export the displayed DataFrame and offer it as a download.
            st.download_button(
                label="Download CSV",
                data=df.to_csv(index=False),
                file_name="leaderboard.csv",
                key="download-csv",
                help="Click to download the CSV file",
            )

    with tabs[-1]:  # About tab
        st.markdown('''
# BrowserGym Leaderboard

This leaderboard tracks the performance of various agents on web navigation tasks.

## How to Submit Results for New Agents

### 1. Create Results Directory

Create a new folder in the `results` directory with your agent's name:

```bash
results/
└── your-agent-name/
    ├── README.md
    ├── webarena.json
    ├── workarena-l1.json
    ├── workarena-l2.json
    ├── workarena-l3.json
    └── miniwob.json
```

### 2. Add Agent Details

Create a `README.md` in your agent's folder with the following details:

#### Required Information
- **Model Name**: Base model used (e.g., GPT-4, Claude-2)
- **Model Architecture**: Architecture details and any modifications
- **Input/Output Format**: How inputs are processed and outputs generated
- **Training Details**: Training configuration, if applicable
  - Dataset used
  - Number of training steps
  - Hardware used
  - Training time

#### Optional Information
- **Paper Link**: Link to a published paper or preprint, if available
- **Code Repository**: Link to a public code implementation
- **Additional Notes**: Any special configurations or requirements
- **License**: License information for your agent

Make sure to organize the information into clear sections using Markdown.
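
For reference, a minimal `README.md` might look like this (all values illustrative):

```markdown
# your-agent-name

## Model Details
- **Model Name**: GPT-4o (example)
- **Model Architecture**: LLM with accessibility-tree observations and text actions
- **Input/Output Format**: AXTree observation in, BrowserGym action string out

## Training Details
- No fine-tuning; prompted off-the-shelf model

## Links
- **Paper Link**: https://example.com/paper (placeholder)
- **Code Repository**: https://example.com/repo (placeholder)

## License
MIT (example)
```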

### 3. Add Benchmark Results

Create a separate JSON file for each benchmark, following this format:

```json
[
    {
        "agent_name": "your-agent-name",
        "study_id": "unique-study-identifier-from-agentlab",
        "date_time": "YYYY-MM-DD HH:MM:SS",
        "benchmark": "WebArena",
        "score": 0.0,
        "std_err": 0.0,
        "benchmark_specific": "Yes/No",
        "benchmark_tuned": "Yes/No",
        "followed_evaluation_protocol": "Yes/No",
        "reproducible": "Yes/No",
        "comments": "Additional details",
        "original_or_reproduced": "Original"
    }
]
```

Please add the results for each benchmark in separate JSON files, named as follows:
- `webarena.json`
- `workarena-l1.json`
- `workarena-l2.json`
- `workarena-l3.json`
- `miniwob.json`

Each file must contain a JSON array with a single object following the format above. The `benchmark` field in each file must exactly match one of the benchmark names (`WebArena`, `WorkArena-L1`, `WorkArena-L2`, `WorkArena-L3`, `MiniWoB`), and the filename must be the lowercased benchmark name followed by `.json`.
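
Before opening a PR, you can sanity-check a results file with a short script along these lines (a minimal sketch mirroring the leaderboard's own format checks; the path is a placeholder):

```python
import json

REQUIRED_KEYS = {
    "agent_name", "benchmark", "original_or_reproduced", "score", "std_err",
    "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
    "reproducible", "comments", "study_id", "date_time",
}

def validate_results_file(path, benchmark):
    with open(path) as f:
        entries = json.load(f)
    assert isinstance(entries, list), "file must contain a JSON array"
    originals = 0
    for entry in entries:
        missing = REQUIRED_KEYS - entry.keys()
        assert not missing, f"missing keys: {missing}"
        assert entry["benchmark"] == benchmark, "benchmark field mismatch"
        originals += entry["original_or_reproduced"] == "Original"
    assert originals == 1, "exactly one Original entry is required"

validate_results_file("results/your-agent-name/webarena.json", "WebArena")
```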

### 4. Submit PR

1. Open the community tab and press "New Pull Request".
2. Give the PR a descriptive title and follow the steps shown.
3. Publish the branch.

## How to Submit Reproducibility Results for Existing Agents

Open the results file for the agent and benchmark whose results you reproduced.

### 1. Add Reproduced Results

Append an entry like the following to the JSON array in that file, making sure `original_or_reproduced` is set to `Reproduced`:

```json
[
    {
        "agent_name": "your-agent-name",
        "study_id": "unique-study-identifier-from-agentlab",
        "date_time": "YYYY-MM-DD HH:MM:SS",
        "benchmark": "WebArena",
        "score": 0.0,
        "std_err": 0.0,
        "benchmark_specific": "Yes/No",
        "benchmark_tuned": "Yes/No",
        "followed_evaluation_protocol": "Yes/No",
        "reproducible": "Yes/No",
        "comments": "Additional details",
        "original_or_reproduced": "Reproduced"
    }
]
```

### 2. Submit PR

1. Open the community tab and press "New Pull Request".
2. Give the PR a descriptive title and follow the steps shown.
3. Publish the branch.

## License

MIT
''')

    for i, benchmark in enumerate(BENCHMARKS, start=1):
        with tabs[i]:
            def get_benchmark_dict(results, benchmark):
                date_format = "%B %d, %Y %I:%M %p"
                benchmark_dict = []
                for agent, values in results.items():
                    # Start from placeholders; overwrite if an "Original" entry exists.
                    result_dict = {
                        "Agent": agent, "Score": "-", "std_err": "-",
                        "Benchmark Specific": "-", "Benchmark Tuned": "-",
                        "Followed Evaluation Protocol": "-", "Reproducible": "-",
                        "Comments": "-", "Study ID": "-", "Date": "-",
                        "Reproduced": [], "Reproduced_all": [],
                    }
                    for value in values:
                        if value["benchmark"] != benchmark:
                            continue
                        date = datetime.strptime(value["date_time"],
                                                 "%Y-%m-%d %H:%M:%S").strftime(date_format)
                        if value["original_or_reproduced"] == "Original":
                            result_dict["Score"] = value["score"]
                            result_dict["std_err"] = value["std_err"]
                            result_dict["Benchmark Specific"] = value["benchmark_specific"]
                            result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                            result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                            result_dict["Reproducible"] = value["reproducible"]
                            result_dict["Comments"] = value["comments"]
                            result_dict["Study ID"] = value["study_id"]
                            result_dict["Date"] = date
                        elif value["original_or_reproduced"] == "Reproduced":
                            result_dict["Reproduced"].append(value["score"])
                            result_dict["Reproduced_all"].append(f'{value["score"]}, {date}')
                    if result_dict["Reproduced"]:
                        # Summarize reproduced runs as a "min - max" score range.
                        result_dict["Reproduced"] = (f'{min(result_dict["Reproduced"])} - '
                                                     f'{max(result_dict["Reproduced"])}')
                    else:
                        result_dict["Reproduced"] = "-"
                    benchmark_dict.append(result_dict)
                return benchmark_dict

            df_ = pd.DataFrame(get_benchmark_dict(all_results, benchmark=benchmark))
            df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
            df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
            df_['Score'] = df_['Score'].astype(str)
            st.markdown(create_html_table_benchmark(df_, benchmark), unsafe_allow_html=True)


if __name__ == "__main__":
    main()