import json
import html
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any
from urllib.parse import quote

import pandas as pd
import streamlit as st
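# Benchmarks shown as leaderboard tabs; each maps to results/<agent>/<benchmark lowercase>.json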
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3", "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]
def sanitize_agent_name(agent_name):
    """Allow alphanumerics, hyphens, underscores, and non-leading dots only."""
    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")
    if not re.match(r"^[a-zA-Z0-9_-][a-zA-Z0-9_.-]*$", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name
def safe_path_join(*parts):
    """Join path parts under the results directory, rejecting path traversal."""
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
        # relative_to raises ValueError if the resolved path escapes the base
        # directory, which is stricter than a string-prefix check.
        path.relative_to(base)
        return path
    except Exception:
        raise ValueError("Invalid path")
def sanitize_column_name(col: str) -> str:
"""Sanitize column names for HTML display"""
return html.escape(str(col))
def sanitize_cell_value(value: Any) -> str:
    """Escape a cell value for HTML display, preserving "score ± std_err" pairs."""
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str) and '±' in value:
        # Escape both halves so neither can smuggle markup into the table.
        score, std_err = value.split('±', 1)
        return f'{html.escape(score.strip())} ± {html.escape(std_err.strip())}'
    return html.escape(str(value))
def create_html_table_main(df):
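    """Render the main leaderboard as a sortable raw-HTML table."""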
col1, col2 = st.columns([2,6])
with col1:
sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("WebArena"), key="main_sort_column")
with col2:
sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key="main_sort_order")
    def get_sort_value(cell):
        # "-" (no result) sorts below every real score.
        if cell == "-":
            return float('-inf')
        try:
            return float(cell)
        except ValueError:
            return cell

    # Sort on the selected column, coercing scores to floats where possible.
    df = df.sort_values(
        by=sort_column,
        ascending=(sort_order == "Ascending"),
        key=lambda col: col.apply(get_sort_value),
    )
    # Build the table as raw HTML so the Agent column can carry hyperlinks.
    table_html = '<table>'
    table_html += '<thead><tr>'
    for column in df.columns:
        table_html += f'<th>{sanitize_column_name(column)}</th>'
    table_html += '</tr></thead>'
    table_html += '<tbody>'
    for _, row in df.iterrows():
        table_html += '<tr>'
        for col in df.columns:
            if col == "Agent":
                # Already rendered as a sanitized <a> tag by make_hyperlink.
                table_html += f'<td>{row[col]}</td>'
            else:
                table_html += f'<td>{sanitize_cell_value(row[col])}</td>'
        table_html += '</tr>'
    table_html += '</tbody></table>'
    return table_html
def create_html_table_benchmark(df, benchmark):
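    """Render one benchmark's results as a sortable raw-HTML table."""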
col1, col2 = st.columns([2,6])
with col1:
sort_column = st.selectbox("Sort by", df.columns.tolist(), index=df.columns.tolist().index("Score"), key=f"benchmark_sort_column_{benchmark}")
with col2:
sort_order = st.radio("Order", ["Ascending", "Descending"], index=1, horizontal=True, key=f"benchmark_sort_order_{benchmark}")
    def get_sort_value(cell):
        # "-" (no result) sorts below every real score.
        if cell == "-":
            return float('-inf')
        try:
            return float(cell)
        except ValueError:
            return cell

    # Sort on the selected column, coercing scores to floats where possible.
    df = df.sort_values(
        by=sort_column,
        ascending=(sort_order == "Ascending"),
        key=lambda col: col.apply(get_sort_value),
    )
    # Build the table as raw HTML; reproduced scores expand into a <details> widget.
    table_html = '<table>'
    table_html += '<thead><tr>'
    for column in df.columns:
        if column in ("Reproduced_all", "std_err"):
            continue  # helper columns, folded into other cells below
        table_html += f'<th>{sanitize_column_name(column)}</th>'
    table_html += '</tr></thead>'
    table_html += '<tbody>'
    for _, row in df.iterrows():
        table_html += '<tr>'
        for column in df.columns:
            if column == "Reproduced":
                if row[column] == "-":
                    table_html += f'<td>{sanitize_cell_value(row[column])}</td>'
                else:
                    summary = sanitize_cell_value(row[column])
                    details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
                    table_html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
            elif column in ("Reproduced_all", "std_err"):
                continue
            elif column == "Score":
                score_with_std_err = f'{row[column]} ± {row["std_err"]}'
                table_html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
            else:
                table_html += f'<td>{sanitize_cell_value(row[column])}</td>'
        table_html += '</tr>'
    table_html += '</tbody></table>'
    return table_html
def check_sanity(agent):
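    """Validate an agent's result files: every entry must carry the full key set,
    name the agent and benchmark it is filed under, and each benchmark file must
    contain exactly one "Original" entry."""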
try:
safe_agent = sanitize_agent_name(agent)
for benchmark in BENCHMARKS:
file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
if not file_path.is_file():
continue
original_count = 0
with open(file_path) as f:
results = json.load(f)
for result in results:
if not all(key in result for key in ["agent_name", "benchmark", "original_or_reproduced", "score", "std_err", "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol", "reproducible", "comments", "study_id", "date_time"]):
return False
if result["agent_name"] != agent:
return False
if result["benchmark"] != benchmark:
return False
if result["original_or_reproduced"] == "Original":
original_count += 1
if original_count != 1:
return False
return True
except ValueError:
return False
def main():
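    """Load and validate every agent's results, then render the leaderboard tabs."""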
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
st.markdown("""
""", unsafe_allow_html=True)
st.markdown("""
""", unsafe_allow_html=True)
    all_agents = [d for d in os.listdir("results") if os.path.isdir(os.path.join("results", d))]
all_results = {}
for agent in all_agents:
if not check_sanity(agent):
st.error(f"Results for {agent} are not in the correct format.")
continue
agent_results = []
for benchmark in BENCHMARKS:
file_path = safe_path_join(agent, f"{benchmark.lower()}.json")
if not file_path.is_file():
continue
with open(file_path) as f:
agent_results.extend(json.load(f))
all_results[agent] = agent_results
st.title("🏆 BrowserGym Leaderboard")
st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
tabs = st.tabs(["🏆 Main Leaderboard",] + BENCHMARKS + ["📝 About"])
with tabs[0]:
# Leaderboard tab
def get_leaderboard_dict(results):
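            """Build one row per agent with its "Original" score for each benchmark ("-" if absent)."""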
leaderboard_dict = []
            for agent, values in results.items():
                result_dict = {"Agent": agent}
                for benchmark in BENCHMARKS:
                    # Use the unique "Original" entry for this benchmark, or "-" if absent.
                    result_dict[benchmark] = next(
                        (v["score"] for v in values
                         if v["benchmark"] == benchmark and v["original_or_reproduced"] == "Original"),
                        "-",
                    )
                leaderboard_dict.append(result_dict)
return leaderboard_dict
leaderboard_dict = get_leaderboard_dict(all_results)
        df = pd.DataFrame.from_dict(leaderboard_dict)
for benchmark in BENCHMARKS:
df[benchmark] = df[benchmark].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
df[benchmark] = df[benchmark].astype(str)
# Add a search bar
search_query = st.text_input("Search agents", "", key="search_main")
# Filter the DataFrame based on the search query
if search_query:
df = df[df['Agent'].str.contains(search_query, case=False)]
# Display the filtered DataFrame or the entire leaderboard
def make_hyperlink(agent_name):
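            """Link a validated agent name to its README in the Hugging Face Space."""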
try:
safe_name = sanitize_agent_name(agent_name)
safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
                return f'<a href="{safe_url}">{html.escape(safe_name)}</a>'
except ValueError:
return ""
df['Agent'] = df['Agent'].apply(make_hyperlink)
html_table = create_html_table_main(df)
st.markdown(html_table, unsafe_allow_html=True)
if st.button("Export to CSV", key="export_main"):
# Export the DataFrame to CSV
csv_data = df.to_csv(index=False)
# Create a link to download the CSV file
st.download_button(
label="Download CSV",
data=csv_data,
file_name="leaderboard.csv",
key="download-csv",
help="Click to download the CSV file",
)
with tabs[-1]:
st.markdown('''
# BrowserGym Leaderboard
This leaderboard tracks performance of various agents on web navigation tasks.
## How to Submit Results for New Agents
### 1. Create Results Directory
Create a new folder in the `results` directory with your agent's name:
```bash
results/
└── your-agent-name/
    ├── README.md
    ├── webarena.json
    ├── workarena-l1.json
    ├── workarena-l2.json
    ├── workarena-l3.json
    ├── miniwob.json
    ├── weblinx.json
    ├── visualwebarena.json
    └── assistantbench.json
```
### 2. Add Agent Details
Create a `README.md` in your agent's folder with the following details:
#### Required Information
- **Model Name**: Base model used (e.g., GPT-4, Claude-2)
- **Model Architecture**: Architecture details and any modifications
- **Input/Output Format**: How inputs are processed and outputs generated
- **Training Details**: Training configuration if applicable
- Dataset used
- Number of training steps
- Hardware used
- Training time
#### Optional Information
- **Paper Link**: Link to published paper/preprint if available
- **Code Repository**: Link to public code implementation
- **Additional Notes**: Any special configurations or requirements
- **License**: License information for your agent
Make sure to organize the information in clear sections using Markdown.
### 3. Add Benchmark Results
Create separate JSON files for each benchmark following this format:
```json
[
{
"agent_name": "your-agent-name",
"study_id": "unique-study-identifier-from-agentlab",
"date_time": "YYYY-MM-DD HH:MM:SS",
"benchmark": "WebArena",
"score": 0.0,
"std_err": 0.0,
"benchmark_specific": "Yes/No",
"benchmark_tuned": "Yes/No",
"followed_evaluation_protocol": "Yes/No",
"reproducible": "Yes/No",
"comments": "Additional details",
"original_or_reproduced": "Original"
}
]
```
Please provide each benchmark's results in its own JSON file, named as follows:
- `webarena.json`
- `workarena-l1.json`
- `workarena-l2.json`
- `workarena-l3.json`
- `miniwob.json`
- `weblinx.json`
- `visualwebarena.json`
- `assistantbench.json`
Each file must contain a JSON array with a single object in the format above (reproduced runs are appended to the same array later). The `benchmark` field must exactly match one of `WebArena`, `WorkArena-L1`, `WorkArena-L2`, `WorkArena-L3`, `MiniWoB`, `WebLINX`, `VisualWebArena`, `AssistantBench`, and the filename must be that benchmark name, lowercased, with a `.json` extension.
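For reference, the leaderboard derives the expected filename directly from each benchmark name, so any mismatch in spelling or casing makes your results invisible. A minimal sketch of the lookup, mirroring this app's `BENCHMARKS` list and path logic:
```python
from pathlib import Path

BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena-L2", "WorkArena-L3",
              "MiniWoB", "WebLINX", "VisualWebArena", "AssistantBench"]

# results/<agent>/<benchmark lowercase>.json is the only path the app reads.
for benchmark in BENCHMARKS:
    print(Path("results") / "your-agent-name" / f"{benchmark.lower()}.json")
```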
### 4. Submit PR
1. Open the community tab and press "New Pull Request"
2. Give the PR a descriptive title and follow the steps shown
3. Publish the branch
## How to Submit Reproducibility Results for Existing Agents
Open the results file for the agent and benchmark whose results you reproduced.
### 1. Add reproduced results
Append the following entry to the JSON file. Ensure `original_or_reproduced` is set to `Reproduced`.
```json
[
{
"agent_name": "your-agent-name",
"study_id": "unique-study-identifier-from-agentlab",
"date_time": "YYYY-MM-DD HH:MM:SS",
"benchmark": "WebArena",
"score": 0.0,
"std_err": 0.0,
"benchmark_specific": "Yes/No",
"benchmark_tuned": "Yes/No",
"followed_evaluation_protocol": "Yes/No",
"reproducible": "Yes/No",
"comments": "Additional details",
"original_or_reproduced": "Reproduced"
}
]
```
### 2. Submit PR
1. Open the community tab and press "New Pull Request"
2. Give the PR a descriptive title and follow the steps shown
3. Publish the branch
## License
MIT
''')
for i, benchmark in enumerate(BENCHMARKS, start=1):
with tabs[i]:
            def get_benchmark_dict(results, benchmark):
                """Build one row per agent for this benchmark's tab, folding in reproduced runs."""
                benchmark_dict = []
                for agent, values in results.items():
                    result_dict = {"Agent": agent}
                    reproduced_scores = []
                    reproduced_all = []
                    has_original = False
                    for value in values:
                        if value["benchmark"] != benchmark:
                            continue
                        date = datetime.strptime(value["date_time"], "%Y-%m-%d %H:%M:%S").strftime("%B %d, %Y %I:%M %p")
                        if value["original_or_reproduced"] == "Original":
                            result_dict["Score"] = value["score"]
                            result_dict["std_err"] = value["std_err"]
                            result_dict["Benchmark Specific"] = value["benchmark_specific"]
                            result_dict["Benchmark Tuned"] = value["benchmark_tuned"]
                            result_dict["Followed Evaluation Protocol"] = value["followed_evaluation_protocol"]
                            result_dict["Reproducible"] = value["reproducible"]
                            result_dict["Comments"] = value["comments"]
                            result_dict["Study ID"] = value["study_id"]
                            result_dict["Date"] = date
                            has_original = True
                        elif value["original_or_reproduced"] == "Reproduced":
                            reproduced_scores.append(value["score"])
                            reproduced_all.append(f"{value['score']}, {date}")
                    if not has_original:
                        # No "Original" entry for this benchmark: fill the row with placeholders.
                        for field in ("Score", "std_err", "Benchmark Specific", "Benchmark Tuned",
                                      "Followed Evaluation Protocol", "Reproducible", "Comments",
                                      "Study ID", "Date"):
                            result_dict[field] = "-"
                    # Summarize reproduced scores as a "min - max" range; details go in Reproduced_all.
                    if reproduced_scores:
                        result_dict["Reproduced"] = f"{min(reproduced_scores)} - {max(reproduced_scores)}"
                    else:
                        result_dict["Reproduced"] = "-"
                    result_dict["Reproduced_all"] = reproduced_all
                    benchmark_dict.append(result_dict)
                return benchmark_dict
benchmark_dict = get_benchmark_dict(all_results, benchmark=benchmark)
            df_ = pd.DataFrame.from_dict(benchmark_dict)
df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
df_['Score'] = df_['Score'].astype(str)
html_table = create_html_table_benchmark(df_, benchmark)
st.markdown(html_table, unsafe_allow_html=True)
if __name__ == "__main__":
main()