Spaces:
Sleeping
Sleeping
Init push
Browse files- .gitignore +2 -0
- Dockerfile +11 -0
- app.py +389 -0
- requirements.txt +6 -0
- results.json +178 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.pyc
|
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Container image that serves app.py with Streamlit.
FROM python:3.9

WORKDIR /code

# Install dependencies before copying the application files so that editing
# app.py or results.json does not invalidate the cached pip layer.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Application code and the data file it reads at start-up.
COPY ./app.py /code/app.py
COPY ./results.json /code/results.json

# Bind to all interfaces on port 7860 so the server is reachable from outside
# the container.
CMD ["streamlit", "run", "/code/app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
|
app.py
ADDED
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import re
|
3 |
+
import streamlit as st
|
4 |
+
import requests
|
5 |
+
import pandas as pd
|
6 |
+
from io import StringIO
|
7 |
+
import plotly.graph_objs as go
|
8 |
+
from huggingface_hub import HfApi
|
9 |
+
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
10 |
+
|
11 |
+
@st.cache_data
def get_model_info(df):
    """Return a copy of *df* with Hugging Face Hub 'Likes' and 'Tags' columns.

    Every value of the 'Model' column is stripped of surrounding whitespace
    and looked up as a Hub repo id through ``HfApi.model_info``.

    Args:
        df: DataFrame with a 'Model' column holding Hub repo ids.

    Returns:
        A new DataFrame with the columns of *df* plus:
        'Likes' - the like count reported by the Hub, or -1 when the
        repository or revision cannot be found;
        'Tags' - the model tags joined with ', ', or '' when the repository
        or revision cannot be found.
    """
    api = HfApi()

    likes = []
    tags = []
    for model in df['Model']:
        try:
            model_info = api.model_info(repo_id=str(model).strip())
        except (RepositoryNotFoundError, RevisionNotFoundError):
            # Unknown repo: keep the row, flagged with sentinel values.
            likes.append(-1)
            tags.append('')
        else:
            likes.append(model_info.likes)
            # `tags` is guarded against None so the join cannot raise.
            tags.append(', '.join(model_info.tags or []))

    # Work on a copy instead of mutating the argument: with st.cache_data the
    # body is presumably skipped on a cache hit (see Streamlit caching docs),
    # so an in-place side effect would only happen on some calls.
    result = df.copy()
    # Assign positionally so duplicate index labels cannot cross-write rows.
    result['Likes'] = likes
    result['Tags'] = tags
    return result
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
def create_bar_chart(df, category):
    """Render a heading and a horizontal bar chart for one score column.

    Args:
        df: DataFrame containing a 'Model' column and a *category* column.
        category: Name of the score column to plot; also used in the heading.
    """
    st.write(f"### {category} Scores")

    # One bar per model, ordered by the category score.
    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)

    # Bars are coloured by their own value on the 'Inferno' colour scale.
    fig = go.Figure(go.Bar(
        x=sorted_df[category],
        y=sorted_df['Model'],
        orientation='h',
        marker=dict(color=sorted_df[category], colorscale='Inferno'),
    ))

    # Grow the figure with the number of bars. The original passed a constant
    # `height=35` to st.plotly_chart, which did not depend on the row count.
    # 35 px per bar plus the 20 px top and bottom margins, with a floor so a
    # one-row chart still has room for its axis.
    chart_height = max(200, len(sorted_df) * 35 + 40)
    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        height=chart_height,
    )

    st.plotly_chart(fig, use_container_width=True)
|
57 |
+
|
58 |
+
|
59 |
+
def _centered(columns):
    """Build a ``column_config`` mapping that centre-aligns each named column."""
    return {column: {'alignment': 'center'} for column in columns}


def _filter_models(df, query):
    """Return rows whose 'Model' contains *query* (literal, case-insensitive)."""
    if not query:
        return df
    # regex=False: the query is free text typed by a visitor, so characters
    # such as '+' or '(' must be matched literally instead of raising
    # re.error. na=False keeps rows with a missing model name out of the mask.
    mask = df['Model'].str.contains(query, case=False, regex=False, na=False)
    return df[mask]


def _show_scores(df, display_columns, centered_columns):
    """Display 'Model' plus *display_columns* as a full-width table."""
    st.dataframe(
        df[['Model'] + list(display_columns)],
        use_container_width=True,
        column_config=_centered(centered_columns),
        hide_index=True,
    )


def _render_agent_leaderboard(df, score_columns, key_suffix, extra_columns=()):
    """Render search box, score table, comparison table and CSV export.

    Args:
        df: Leaderboard rows, already sorted for display.
        score_columns: Score columns to show, centre and format.
        key_suffix: Suffix that makes every widget key in this section unique,
            so the same section can be rendered in several tabs.
        extra_columns: Further numeric columns shown after *score_columns*
            (not centred and not given the two-decimal format).
    """
    score_columns = list(score_columns)
    numeric_columns = score_columns + list(extra_columns)

    search_query = st.text_input("Search models", "", key=f"search_{key_suffix}")
    df = _filter_models(df, search_query)

    # Filtered leaderboard (or the whole one when the search box is empty).
    _show_scores(df, numeric_columns, score_columns)

    # Side-by-side comparison of the models picked by the user. The explicit
    # key avoids an identical auto-generated widget ID when two tabs end up
    # offering the same list of models.
    selected_models = st.multiselect(
        'Select models to compare',
        df['Model'].unique(),
        key=f"compare_{key_suffix}",
    )
    comparison_df = df[df['Model'].isin(selected_models)]
    two_decimals = {column: "{:.2f}".format for column in score_columns}
    st.dataframe(
        comparison_df.style.highlight_max(axis=0, subset=numeric_columns).format(two_decimals),
        use_container_width=True,
        hide_index=True,
    )

    # Two-step export: the download button is only rendered on the run
    # triggered by clicking "Export to CSV".
    if st.button("Export to CSV", key=f"export_{key_suffix}"):
        st.download_button(
            label="Download CSV",
            data=df.to_csv(index=False),
            file_name="leaderboard.csv",
            key=f"download-csv-{key_suffix}",
            help="Click to download the CSV file",
        )


def main():
    """Build the WebAgent Leaderboard page from ``results.json``.

    Reads ``results.json`` from the current working directory and renders
    four tabs: the overall leaderboard, the WorkArena++ L2 and L3
    leaderboards (each followed by a human-subset table) and an About tab.

    Raises:
        FileNotFoundError: If ``results.json`` is not in the working directory.
        KeyError: If an expected result set is missing from the file.
    """
    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")

    with open("results.json", encoding="utf-8") as f:
        all_results = json.load(f)

    st.title("🏆 WebAgent Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
    tab1, tab2, tab3, tab4 = st.tabs(["🏆 WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "📝 About"])

    # Overall leaderboard tab
    with tab1:
        score_columns = ['WorkArena-L1', 'WorkArena++-L2', 'WorkArena++-L3', 'MiniWoB', 'WebArena']
        df = pd.DataFrame(all_results["workarena_agent_curriculum"])

        # Unweighted mean over the benchmark columns; best average first.
        df['Average'] = sum(df[column] for column in score_columns) / len(score_columns)
        df = df.sort_values(by='Average', ascending=False)

        _render_agent_leaderboard(df, score_columns, "main", extra_columns=['Average'])
        # NOTE(review): a human-subset table for this tab was previously kept
        # here as commented-out code reading "workarena_human_curriculum";
        # restore it once that result set exists in results.json.

    # WorkArena++ L2 / L3 tabs share one layout and differ only in the data.
    curriculum_columns = ['Overall', 'Contextual Understanding', 'Data-driven Decision Making', 'Planning and Problem Solving', 'Information Retrieval', 'Sophisticated Memorization']
    for tab, level in ((tab2, "l2"), (tab3, "l3")):
        with tab:
            agents_df = pd.DataFrame(all_results[f"workarena_{level}_agent_curriculum"])
            agents_df = agents_df.sort_values(by='Overall', ascending=False)
            _render_agent_leaderboard(agents_df, curriculum_columns, level)

            # Human curriculum
            st.markdown("### Human subset results")
            humans_df = pd.DataFrame(all_results[f"workarena_{level}_human_curriculum"])
            humans_df = humans_df.sort_values(by='Overall', ascending=False)
            _show_scores(humans_df, curriculum_columns, curriculum_columns)

    # About tab
    with tab4:
        st.markdown("### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")


if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.23
|
2 |
+
pandas
|
3 |
+
requests
|
4 |
+
plotly
|
5 |
+
gistyc
|
6 |
+
huggingface_hub
|
results.json
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"workarena_agent_curriculum": [
|
3 |
+
{
|
4 |
+
"Model": "GPT-3.5",
|
5 |
+
"WorkArena-L1": 6.1,
|
6 |
+
"WorkArena++-L2": 0.0,
|
7 |
+
"WorkArena++-L3": 0.0,
|
8 |
+
"MiniWoB": 43.4,
|
9 |
+
"WebArena": 6.7
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"Model": "GPT-4o",
|
13 |
+
"WorkArena-L1": 42.7,
|
14 |
+
"WorkArena++-L2": 3.0,
|
15 |
+
"WorkArena++-L3": 0.0,
|
16 |
+
"MiniWoB": 71.3,
|
17 |
+
"WebArena": 23.5
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"Model": "GPT-4o-V",
|
21 |
+
"WorkArena-L1": 41.8,
|
22 |
+
"WorkArena++-L2": 3.8,
|
23 |
+
"WorkArena++-L3": 0.0,
|
24 |
+
"MiniWoB": 72.5,
|
25 |
+
"WebArena": 24.0
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"Model": "LLaMA-3-70b",
|
29 |
+
"WorkArena-L1": 17.9,
|
30 |
+
"WorkArena++-L2": 0.0,
|
31 |
+
"WorkArena++-L3": 0.0,
|
32 |
+
"MiniWoB": 68.2,
|
33 |
+
"WebArena": 11.0
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"Model": "Mixtral-8x22b",
|
37 |
+
"WorkArena-L1": 12.4,
|
38 |
+
"WorkArena++-L2": 0.0,
|
39 |
+
"WorkArena++-L3": 0.0,
|
40 |
+
"MiniWoB": 62.4,
|
41 |
+
"WebArena": 12.6
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"workarena_l2_agent_curriculum": [
|
45 |
+
{
|
46 |
+
"Model": "GPT-3.5",
|
47 |
+
"Overall": 0.0,
|
48 |
+
"Contextual Understanding": 0.0,
|
49 |
+
"Data-driven Decision Making": 0.0,
|
50 |
+
"Planning and Problem Solving": 0.0,
|
51 |
+
"Information Retrieval": 0.0,
|
52 |
+
"Sophisticated Memorization": 0.0
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"Model": "GPT-4o",
|
56 |
+
"Overall": 3.0,
|
57 |
+
"Contextual Understanding": 0.0,
|
58 |
+
"Data-driven Decision Making": 0.0,
|
59 |
+
"Planning and Problem Solving": 0.0,
|
60 |
+
"Information Retrieval": 0.0,
|
61 |
+
"Sophisticated Memorization": 14.6
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"Model": "GPT-4o-V",
|
65 |
+
"Overall": 3.8,
|
66 |
+
"Contextual Understanding": 0.0,
|
67 |
+
"Data-driven Decision Making": 0.0,
|
68 |
+
"Planning and Problem Solving": 0.0,
|
69 |
+
"Information Retrieval": 3.6,
|
70 |
+
"Sophisticated Memorization": 14.6
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"Model": "LLaMA-3-70b",
|
74 |
+
"Overall": 0.0,
|
75 |
+
"Contextual Understanding": 0.0,
|
76 |
+
"Data-driven Decision Making": 0.0,
|
77 |
+
"Planning and Problem Solving": 0.0,
|
78 |
+
"Information Retrieval": 0.0,
|
79 |
+
"Sophisticated Memorization": 0.0
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"Model": "Mixtral-8x22b",
|
83 |
+
"Overall": 0.0,
|
84 |
+
"Contextual Understanding": 0.0,
|
85 |
+
"Data-driven Decision Making": 0.0,
|
86 |
+
"Planning and Problem Solving": 0.0,
|
87 |
+
"Information Retrieval": 0.0,
|
88 |
+
"Sophisticated Memorization": 0.0
|
89 |
+
}
|
90 |
+
],
|
91 |
+
"workarena_l2_human_curriculum": [
|
92 |
+
{
|
93 |
+
"Model": "Human",
|
94 |
+
"Overall": 93.9,
|
95 |
+
"Contextual Understanding": 100.0,
|
96 |
+
"Data-driven Decision Making": 84.6,
|
97 |
+
"Planning and Problem Solving": 100.0,
|
98 |
+
"Information Retrieval": 100.0,
|
99 |
+
"Sophisticated Memorization": 91.7
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"Model": "GPT-4o",
|
103 |
+
"Overall": 2.1,
|
104 |
+
"Contextual Understanding": 0.0,
|
105 |
+
"Data-driven Decision Making": 0.0,
|
106 |
+
"Planning and Problem Solving": 0.0,
|
107 |
+
"Information Retrieval": 0.0,
|
108 |
+
"Sophisticated Memorization": 8.3
|
109 |
+
}
|
110 |
+
],
|
111 |
+
"workarena_l3_agent_curriculum": [
|
112 |
+
{
|
113 |
+
"Model": "GPT-3.5",
|
114 |
+
"Overall": 0.0,
|
115 |
+
"Contextual Understanding": 0.0,
|
116 |
+
"Data-driven Decision Making": 0.0,
|
117 |
+
"Planning and Problem Solving": 0.0,
|
118 |
+
"Information Retrieval": 0.0,
|
119 |
+
"Sophisticated Memorization": 0.0
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"Model": "GPT-4o",
|
123 |
+
"Overall": 0.0,
|
124 |
+
"Contextual Understanding": 0.0,
|
125 |
+
"Data-driven Decision Making": 0.0,
|
126 |
+
"Planning and Problem Solving": 0.0,
|
127 |
+
"Information Retrieval": 0.0,
|
128 |
+
"Sophisticated Memorization": 0.0
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"Model": "GPT-4o-V",
|
132 |
+
"Overall": 0.0,
|
133 |
+
"Contextual Understanding": 0.0,
|
134 |
+
"Data-driven Decision Making": 0.0,
|
135 |
+
"Planning and Problem Solving": 0.0,
|
136 |
+
"Information Retrieval": 0.0,
|
137 |
+
"Sophisticated Memorization": 0.0
|
138 |
+
},
|
139 |
+
{
|
140 |
+
"Model": "LLaMA-3-70b",
|
141 |
+
"Overall": 0.0,
|
142 |
+
"Contextual Understanding": 0.0,
|
143 |
+
"Data-driven Decision Making": 0.0,
|
144 |
+
"Planning and Problem Solving": 0.0,
|
145 |
+
"Information Retrieval": 0.0,
|
146 |
+
"Sophisticated Memorization": 0.0
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"Model": "Mixtral-8x22b",
|
150 |
+
"Overall": 0.0,
|
151 |
+
"Contextual Understanding": 0.0,
|
152 |
+
"Data-driven Decision Making": 0.0,
|
153 |
+
"Planning and Problem Solving": 0.0,
|
154 |
+
"Information Retrieval": 0.0,
|
155 |
+
"Sophisticated Memorization": 0.0
|
156 |
+
}
|
157 |
+
],
|
158 |
+
"workarena_l3_human_curriculum": [
|
159 |
+
{
|
160 |
+
"Model": "Human",
|
161 |
+
"Overall": 93.9,
|
162 |
+
"Contextual Understanding": 87.5,
|
163 |
+
"Data-driven Decision Making": 100.0,
|
164 |
+
"Planning and Problem Solving": 87.5,
|
165 |
+
"Information Retrieval": 100.0,
|
166 |
+
"Sophisticated Memorization": 91.7
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"Model": "GPT-4o",
|
170 |
+
"Overall": 0.0,
|
171 |
+
"Contextual Understanding": 0.0,
|
172 |
+
"Data-driven Decision Making": 0.0,
|
173 |
+
"Planning and Problem Solving": 0.0,
|
174 |
+
"Information Retrieval": 0.0,
|
175 |
+
"Sophisticated Memorization": 0.0
|
176 |
+
}
|
177 |
+
]
|
178 |
+
}
|