megh1211 committed on
Commit a59bcfa
1 Parent(s): 29ea9a5
Files changed (5)
  1. .gitignore +2 -0
  2. Dockerfile +11 -0
  3. app.py +389 -0
  4. requirements.txt +6 -0
  5. results.json +178 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+ COPY ./app.py /code/app.py
+ COPY ./results.json /code/results.json
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ CMD ["streamlit", "run", "/code/app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
app.py ADDED
@@ -0,0 +1,389 @@
+ import json
+ import re
+ import streamlit as st
+ import requests
+ import pandas as pd
+ from io import StringIO
+ import plotly.graph_objs as go
+ from huggingface_hub import HfApi
+ from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
+
+ @st.cache_data
+ def get_model_info(df):
+     api = HfApi()
+
+     # Initialize new columns for likes and tags
+     df['Likes'] = None
+     df['Tags'] = None
+
+     # Iterate through DataFrame rows
+     for index, row in df.iterrows():
+         model = row['Model'].strip()
+         try:
+             model_info = api.model_info(repo_id=str(model))
+             df.loc[index, 'Likes'] = model_info.likes
+             df.loc[index, 'Tags'] = ', '.join(model_info.tags)
+
+         except (RepositoryNotFoundError, RevisionNotFoundError):
+             df.loc[index, 'Likes'] = -1
+             df.loc[index, 'Tags'] = ''
+
+     return df
+
+
+
+ def create_bar_chart(df, category):
+     """Create and display a bar chart for a given category."""
+     st.write(f"### {category} Scores")
+
+     # Sort the DataFrame based on the category score
+     sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
+
+     # Create the bar chart with a color gradient (using 'Viridis' color scale as an example)
+     fig = go.Figure(go.Bar(
+         x=sorted_df[category],
+         y=sorted_df['Model'],
+         orientation='h',
+         marker=dict(color=sorted_df[category], colorscale='Inferno')
+     ))
+
+     # Update layout for better readability
+     fig.update_layout(
+         margin=dict(l=20, r=20, t=20, b=20)
+     )
+
+     # Adjust the height of the chart based on the number of rows in the DataFrame
+     st.plotly_chart(fig, use_container_width=True, height=35)
+
+
+ def main():
+     st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")
+
+     with open("results.json") as f:
+         all_results = json.load(f)
+
+     st.title("🏆 WebAgent Leaderboard")
+     st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
+     # content = create_yall()
+     tab1, tab2, tab3, tab4 = st.tabs(["🏆 WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "📝 About"])
+
+     # Leaderboard tab
+     with tab1:
+         score_columns = ['WorkArena-L1', 'WorkArena++-L2', 'WorkArena++-L3', 'MiniWoB', 'WebArena']
+         full_df = pd.DataFrame.from_dict(all_results["workarena_agent_curriculum"])
+
+         df = pd.DataFrame(columns=full_df.columns)
+         dfs_to_concat = []
+         dfs_to_concat.append(full_df)
+
+         # Concatenate the DataFrames
+         if dfs_to_concat:
+             df = pd.concat(dfs_to_concat, ignore_index=True)
+
+         df['Average'] = sum(df[column] for column in score_columns)/len(score_columns)
+         # Sort values
+         df = df.sort_values(by='Average', ascending=False)
+
+         # Add a search bar
+         search_query = st.text_input("Search models", "", key="search_main")
+
+         # Filter the DataFrame based on the search query
+         if search_query:
+             df = df[df['Model'].str.contains(search_query, case=False)]
+
+         # Display the filtered DataFrame or the entire leaderboard
+         st.dataframe(
+             df[['Model'] + score_columns + ['Average']],
+             use_container_width=True,
+             column_config={
+                 "WorkArena-L1": {'alignment': 'center'},
+                 "WorkArena++-L2": {'alignment': 'center'},
+                 "WorkArena++-L3": {'alignment': 'center'},
+                 "MiniWoB": {'alignment': 'center'},
+                 "WebArena": {'alignment': 'center'},
+             },
+             hide_index=True,
+             # height=int(len(df) * 36.2),
+         )
+
+         # Comparison between models
+         selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+         comparison_df = df[df['Model'].isin(selected_models)]
+         st.dataframe(
+             comparison_df.style.highlight_max(axis=0, subset=df.columns[1:]).format({"WorkArena-L1": "{:.2f}".format, "MiniWoB": "{:.2f}".format, "WorkArena++-L2": "{:.2f}".format, "WorkArena++-L3": "{:.2f}".format, "WebArena": "{:.2f}".format}),
+             use_container_width=True,
+             # column_config={
+             #     "L1": {'alignment': 'center'},
+             #     "L2-Memory": {'alignment': 'center'},
+             #     "L2-Retrieval": {'alignment': 'center'},
+             #     "L3-Memory": {'alignment': 'center'},
+             #     "L3-Retrieval": {'alignment': 'center'},
+             # },
+             hide_index=True,
+         )
+
+         # Add a button to export data to CSV
+         if st.button("Export to CSV", key="export_main"):
+             # Export the DataFrame to CSV
+             csv_data = df.to_csv(index=False)
+
+             # Create a link to download the CSV file
+             st.download_button(
+                 label="Download CSV",
+                 data=csv_data,
+                 file_name="leaderboard.csv",
+                 key="download-csv",
+                 help="Click to download the CSV file",
+             )
+
+         # # Human curriculum
+         # score_columns = ['WorkArena++-L2', 'WorkArena++-L3']
+         # st.markdown('''
+         # ### Human subset results
+         # ''')
+         # full_df = pd.DataFrame.from_dict(all_results["workarena_human_curriculum"])
+
+         # df = pd.DataFrame(columns=full_df.columns)
+
+         # # Create a DataFrame based on selected filters
+         # dfs_to_concat = []
+         # dfs_to_concat.append(full_df)
+
+         # # Concatenate the DataFrames
+         # if dfs_to_concat:
+         #     df = pd.concat(dfs_to_concat, ignore_index=True)
+
+         # # Sort values
+         # df = df.sort_values(by='WorkArena++-L2', ascending=False)
+
+         # # Display the filtered DataFrame or the entire leaderboard
+         # st.dataframe(
+         #     df[['Model'] + score_columns],
+         #     use_container_width=True,
+         #     column_config={
+         #         "WorkArena-L1": {'alignment': 'center'},
+         #         "WorkArena++-L2": {'alignment': 'center'},
+         #         "WorkArena++-L3": {'alignment': 'center'},
+         #         "MiniWoB": {'alignment': 'center'},
+         #         "WebArena": {'alignment': 'center'},
+         #     },
+         #     hide_index=True,
+         #     # height=int(len(df) * 36.2),
+         # )
+
+     with tab2:
+         score_columns = ['Overall', 'Contextual Understanding', 'Data-driven Decision Making', 'Planning and Problem Solving', 'Information Retrieval', 'Sophisticated Memorization']
+         full_df = pd.DataFrame.from_dict(all_results["workarena_l2_agent_curriculum"])
+
+         df = pd.DataFrame(columns=full_df.columns)
+         dfs_to_concat = []
+         dfs_to_concat.append(full_df)
+
+         # Concatenate the DataFrames
+         if dfs_to_concat:
+             df = pd.concat(dfs_to_concat, ignore_index=True)
+
+         # Sort values
+         df = df.sort_values(by='Overall', ascending=False)
+
+         # Add a search bar
+         search_query = st.text_input("Search models", "", key="search_l2")
+
+         # Filter the DataFrame based on the search query
+         if search_query:
+             df = df[df['Model'].str.contains(search_query, case=False)]
+
+         # Display the filtered DataFrame or the entire leaderboard
+         st.dataframe(
+             df[['Model'] + score_columns],
+             use_container_width=True,
+             column_config={
+                 "Overall": {'alignment': 'center'},
+                 "Contextual Understanding": {'alignment': 'center'},
+                 "Data-driven Decision Making": {'alignment': 'center'},
+                 "Planning and Problem Solving": {'alignment': 'center'},
+                 "Information Retrieval": {'alignment': 'center'},
+                 "Sophisticated Memorization": {'alignment': 'center'},
+             },
+             hide_index=True,
+             # height=int(len(df) * 36.2),
+         )
+
+         # Comparison between models
+         selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+         comparison_df = df[df['Model'].isin(selected_models)]
+         st.dataframe(
+             comparison_df.style.highlight_max(axis=0, subset=df.columns[1:]).format({"Overall": "{:.2f}".format, "Contextual Understanding": "{:.2f}".format, "Data-driven Decision Making": "{:.2f}".format, "Planning and Problem Solving": "{:.2f}".format, "Information Retrieval": "{:.2f}".format, "Sophisticated Memorization": "{:.2f}".format}),
+             use_container_width=True,
+             # column_config={
+             #     "Overall": {'alignment': 'center'},
+             #     "Contextual Understanding": {'alignment': 'center'},
+             #     "Data-driven Decision Making": {'alignment': 'center'},
+             #     "Planning and Problem Solving": {'alignment': 'center'},
+             #     "Information Retrieval": {'alignment': 'center'},
+             #     "Sophisticated Memorization": {'alignment': 'center'},
+             # },
+             hide_index=True,
+         )
+
+         # Add a button to export data to CSV
+         if st.button("Export to CSV", key="export_l2"):
+             # Export the DataFrame to CSV
+             csv_data = df.to_csv(index=False)
+
+             # Create a link to download the CSV file
+             st.download_button(
+                 label="Download CSV",
+                 data=csv_data,
+                 file_name="leaderboard.csv",
+                 key="download-csv",
+                 help="Click to download the CSV file",
+             )
+
+         # Human curriculum
+         st.markdown('''
+         ### Human subset results
+         ''')
+         full_df = pd.DataFrame.from_dict(all_results["workarena_l2_human_curriculum"])
+
+         df = pd.DataFrame(columns=full_df.columns)
+
+         # Create a DataFrame based on selected filters
+         dfs_to_concat = []
+         dfs_to_concat.append(full_df)
+
+         # Concatenate the DataFrames
+         if dfs_to_concat:
+             df = pd.concat(dfs_to_concat, ignore_index=True)
+
+         # Sort values
+         df = df.sort_values(by='Overall', ascending=False)
+
+         # Display the filtered DataFrame or the entire leaderboard
+         st.dataframe(
+             df[['Model'] + score_columns],
+             use_container_width=True,
+             column_config={
+                 "Overall": {'alignment': 'center'},
+                 "Contextual Understanding": {'alignment': 'center'},
+                 "Data-driven Decision Making": {'alignment': 'center'},
+                 "Planning and Problem Solving": {'alignment': 'center'},
+                 "Information Retrieval": {'alignment': 'center'},
+                 "Sophisticated Memorization": {'alignment': 'center'},
+             },
+             hide_index=True,
+             # height=int(len(df) * 36.2),
+         )
+
+     with tab3:
+         score_columns = ['Overall', 'Contextual Understanding', 'Data-driven Decision Making', 'Planning and Problem Solving', 'Information Retrieval', 'Sophisticated Memorization']
+         full_df = pd.DataFrame.from_dict(all_results["workarena_l3_agent_curriculum"])
+
+         df = pd.DataFrame(columns=full_df.columns)
+         dfs_to_concat = []
+         dfs_to_concat.append(full_df)
+
+         # Concatenate the DataFrames
+         if dfs_to_concat:
+             df = pd.concat(dfs_to_concat, ignore_index=True)
+
+         # Sort values
+         df = df.sort_values(by='Overall', ascending=False)
+
+         # Add a search bar
+         search_query = st.text_input("Search models", "", key="search_l3")
+
+         # Filter the DataFrame based on the search query
+         if search_query:
+             df = df[df['Model'].str.contains(search_query, case=False)]
+
+         # Display the filtered DataFrame or the entire leaderboard
+         st.dataframe(
+             df[['Model'] + score_columns],
+             use_container_width=True,
+             column_config={
+                 "Overall": {'alignment': 'center'},
+                 "Contextual Understanding": {'alignment': 'center'},
+                 "Data-driven Decision Making": {'alignment': 'center'},
+                 "Planning and Problem Solving": {'alignment': 'center'},
+                 "Information Retrieval": {'alignment': 'center'},
+                 "Sophisticated Memorization": {'alignment': 'center'},
+             },
+             hide_index=True,
+             # height=int(len(df) * 36.2),
+         )
+
+         # Comparison between models
+         selected_models = st.multiselect('Select models to compare', df['Model'].unique())
+         comparison_df = df[df['Model'].isin(selected_models)]
+         st.dataframe(
+             comparison_df.style.highlight_max(axis=0, subset=df.columns[1:]).format({"Overall": "{:.2f}".format, "Contextual Understanding": "{:.2f}".format, "Data-driven Decision Making": "{:.2f}".format, "Planning and Problem Solving": "{:.2f}".format, "Information Retrieval": "{:.2f}".format, "Sophisticated Memorization": "{:.2f}".format}),
+             use_container_width=True,
+             # column_config={
+             #     "Overall": {'alignment': 'center'},
+             #     "Contextual Understanding": {'alignment': 'center'},
+             #     "Data-driven Decision Making": {'alignment': 'center'},
+             #     "Planning and Problem Solving": {'alignment': 'center'},
+             #     "Information Retrieval": {'alignment': 'center'},
+             #     "Sophisticated Memorization": {'alignment': 'center'},
+             # },
+             hide_index=True,
+         )
+
+         # Add a button to export data to CSV
+         if st.button("Export to CSV", key="export_l3"):
+             # Export the DataFrame to CSV
+             csv_data = df.to_csv(index=False)
+
+             # Create a link to download the CSV file
+             st.download_button(
+                 label="Download CSV",
+                 data=csv_data,
+                 file_name="leaderboard.csv",
+                 key="download-csv",
+                 help="Click to download the CSV file",
+             )
+
+         # Human curriculum
+         st.markdown('''
+         ### Human subset results
+         ''')
+         full_df = pd.DataFrame.from_dict(all_results["workarena_l3_human_curriculum"])
+
+         df = pd.DataFrame(columns=full_df.columns)
+
+         # Create a DataFrame based on selected filters
+         dfs_to_concat = []
+         dfs_to_concat.append(full_df)
+
+         # Concatenate the DataFrames
+         if dfs_to_concat:
+             df = pd.concat(dfs_to_concat, ignore_index=True)
+
+         # Sort values
+         df = df.sort_values(by='Overall', ascending=False)
+
+         # Display the filtered DataFrame or the entire leaderboard
+         st.dataframe(
+             df[['Model'] + score_columns],
+             use_container_width=True,
+             column_config={
+                 "Overall": {'alignment': 'center'},
+                 "Contextual Understanding": {'alignment': 'center'},
+                 "Data-driven Decision Making": {'alignment': 'center'},
+                 "Planning and Problem Solving": {'alignment': 'center'},
+                 "Information Retrieval": {'alignment': 'center'},
+                 "Sophisticated Memorization": {'alignment': 'center'},
+             },
+             hide_index=True,
+             # height=int(len(df) * 36.2),
+         )
+
+     # About tab
+     with tab4:
+         st.markdown('''
+         ### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.
+         ''')
+
+ if __name__ == "__main__":
+     main()
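
Note on the scoring above: the main leaderboard's Average column is the plain arithmetic mean of the five benchmark scores. A minimal sketch of the same computation, assuming only the records shipped in results.json in this commit (it is an illustration, not part of the app):

    import json
    import pandas as pd

    # Rebuild the Average column the way main() computes it.
    with open("results.json") as f:
        all_results = json.load(f)

    score_columns = ['WorkArena-L1', 'WorkArena++-L2', 'WorkArena++-L3', 'MiniWoB', 'WebArena']
    df = pd.DataFrame.from_dict(all_results["workarena_agent_curriculum"])

    # Equivalent to sum(df[column] for column in score_columns) / len(score_columns).
    df['Average'] = df[score_columns].mean(axis=1)

    # Worked check for the GPT-4o row: (42.7 + 3.0 + 0.0 + 71.3 + 23.5) / 5 = 28.1
    print(df.sort_values(by='Average', ascending=False)[['Model', 'Average']])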
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit==1.23
+ pandas
+ requests
+ plotly
+ gistyc
+ huggingface_hub
results.json ADDED
@@ -0,0 +1,178 @@
+ {
+     "workarena_agent_curriculum": [
+         {
+             "Model": "GPT-3.5",
+             "WorkArena-L1": 6.1,
+             "WorkArena++-L2": 0.0,
+             "WorkArena++-L3": 0.0,
+             "MiniWoB": 43.4,
+             "WebArena": 6.7
+         },
+         {
+             "Model": "GPT-4o",
+             "WorkArena-L1": 42.7,
+             "WorkArena++-L2": 3.0,
+             "WorkArena++-L3": 0.0,
+             "MiniWoB": 71.3,
+             "WebArena": 23.5
+         },
+         {
+             "Model": "GPT-4o-V",
+             "WorkArena-L1": 41.8,
+             "WorkArena++-L2": 3.8,
+             "WorkArena++-L3": 0.0,
+             "MiniWoB": 72.5,
+             "WebArena": 24.0
+         },
+         {
+             "Model": "LLaMA-3-70b",
+             "WorkArena-L1": 17.9,
+             "WorkArena++-L2": 0.0,
+             "WorkArena++-L3": 0.0,
+             "MiniWoB": 68.2,
+             "WebArena": 11.0
+         },
+         {
+             "Model": "Mixtral-8x22b",
+             "WorkArena-L1": 12.4,
+             "WorkArena++-L2": 0.0,
+             "WorkArena++-L3": 0.0,
+             "MiniWoB": 62.4,
+             "WebArena": 12.6
+         }
+     ],
+     "workarena_l2_agent_curriculum": [
+         {
+             "Model": "GPT-3.5",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         },
+         {
+             "Model": "GPT-4o",
+             "Overall": 3.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 14.6
+         },
+         {
+             "Model": "GPT-4o-V",
+             "Overall": 3.8,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 3.6,
+             "Sophisticated Memorization": 14.6
+         },
+         {
+             "Model": "LLaMA-3-70b",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         },
+         {
+             "Model": "Mixtral-8x22b",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         }
+     ],
+     "workarena_l2_human_curriculum": [
+         {
+             "Model": "Human",
+             "Overall": 93.9,
+             "Contextual Understanding": 100.0,
+             "Data-driven Decision Making": 84.6,
+             "Planning and Problem Solving": 100.0,
+             "Information Retrieval": 100.0,
+             "Sophisticated Memorization": 91.7
+         },
+         {
+             "Model": "GPT-4o",
+             "Overall": 2.1,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 8.3
+         }
+     ],
+     "workarena_l3_agent_curriculum": [
+         {
+             "Model": "GPT-3.5",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         },
+         {
+             "Model": "GPT-4o",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         },
+         {
+             "Model": "GPT-4o-V",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         },
+         {
+             "Model": "LLaMA-3-70b",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         },
+         {
+             "Model": "Mixtral-8x22b",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         }
+     ],
+     "workarena_l3_human_curriculum": [
+         {
+             "Model": "Human",
+             "Overall": 93.9,
+             "Contextual Understanding": 87.5,
+             "Data-driven Decision Making": 100.0,
+             "Planning and Problem Solving": 87.5,
+             "Information Retrieval": 100.0,
+             "Sophisticated Memorization": 91.7
+         },
+         {
+             "Model": "GPT-4o",
+             "Overall": 0.0,
+             "Contextual Understanding": 0.0,
+             "Data-driven Decision Making": 0.0,
+             "Planning and Problem Solving": 0.0,
+             "Information Retrieval": 0.0,
+             "Sophisticated Memorization": 0.0
+         }
+     ]
+ }
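
Note: app.py only reads the field names shown above, so new results.json entries must carry exactly those keys ("Model" plus the five benchmark scores for the main leaderboard; "Model", "Overall" and the five skill categories for the L2/L3 tables). A small, hypothetical pre-submission check along those lines (the script and its REQUIRED_KEYS mapping are illustrative, not part of this commit):

    import json

    # Hypothetical sanity check: every record must carry the keys app.py reads.
    MAIN_KEYS = {"Model", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"}
    SKILL_KEYS = {"Model", "Overall", "Contextual Understanding", "Data-driven Decision Making",
                  "Planning and Problem Solving", "Information Retrieval", "Sophisticated Memorization"}
    REQUIRED_KEYS = {
        "workarena_agent_curriculum": MAIN_KEYS,
        "workarena_l2_agent_curriculum": SKILL_KEYS,
        "workarena_l2_human_curriculum": SKILL_KEYS,
        "workarena_l3_agent_curriculum": SKILL_KEYS,
        "workarena_l3_human_curriculum": SKILL_KEYS,
    }

    with open("results.json") as f:
        all_results = json.load(f)

    for section, records in all_results.items():
        expected = REQUIRED_KEYS.get(section, set())
        for record in records:
            missing = expected - set(record)
            if missing:
                print(f"{section} / {record.get('Model', '?')}: missing {sorted(missing)}")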