meghsn committed on
Commit
51b9b31
·
1 Parent(s): f4d95d8

Updated latest results

Browse files
Files changed (32) hide show
  1. app.py +20 -5
  2. results/Bgym-Claude-3.5-Sonnet/assistantbench.json +16 -0
  3. results/Bgym-Claude-3.5-Sonnet/miniwob.json +16 -0
  4. results/Bgym-Claude-3.5-Sonnet/webarena.json +16 -0
  5. results/Bgym-Claude-3.5-Sonnet/weblinx.json +16 -0
  6. results/Bgym-GPT-4o-mini/assistantbench.json +16 -0
  7. results/Bgym-GPT-4o-mini/miniwob.json +2 -2
  8. results/Bgym-GPT-4o-mini/webarena.json +16 -0
  9. results/Bgym-GPT-4o-mini/weblinx.json +16 -0
  10. results/Bgym-GPT-4o-mini/workarena-l3.json +16 -0
  11. results/Bgym-GPT-4o/assistantbench.json +16 -0
  12. results/Bgym-GPT-4o/miniwob.json +1 -1
  13. results/Bgym-GPT-4o/webarena.json +2 -2
  14. results/Bgym-GPT-4o/weblinx.json +16 -0
  15. results/Bgym-GPT-o1-mini/assistantbench.json +16 -0
  16. results/Bgym-GPT-o1-mini/miniwob.json +16 -0
  17. results/Bgym-GPT-o1-mini/webarena.json +16 -0
  18. results/Bgym-GPT-o1-mini/weblinx.json +16 -0
  19. results/Bgym-GPT-o1-mini/workarena-l3.json +16 -0
  20. results/Bgym-Llama-3.1-405b/README.md +1 -0
  21. results/Bgym-Llama-3.1-405b/assistantbench.json +16 -0
  22. results/Bgym-Llama-3.1-405b/miniwob.json +16 -0
  23. results/Bgym-Llama-3.1-405b/webarena.json +16 -0
  24. results/Bgym-Llama-3.1-405b/weblinx.json +16 -0
  25. results/Bgym-Llama-3.1-405b/workarena-l1.json +16 -0
  26. results/Bgym-Llama-3.1-405b/workarena-l2.json +16 -0
  27. results/Bgym-Llama-3.1-405b/workarena-l3.json +16 -0
  28. results/Bgym-Llama-3.1-70b/assistantbench.json +16 -0
  29. results/Bgym-Llama-3.1-70b/miniwob.json +16 -0
  30. results/Bgym-Llama-3.1-70b/webarena.json +16 -0
  31. results/Bgym-Llama-3.1-70b/weblinx.json +16 -0
  32. results/Bgym-Llama-3.1-70b/workarena-l3.json +16 -0
app.py CHANGED
@@ -44,9 +44,11 @@ def sanitize_column_name(col: str) -> str:
44
  return html.escape(str(col))
45
 
46
  def sanitize_cell_value(value: Any) -> str:
47
- """Sanitize cell values for HTML display"""
48
  if isinstance(value, (int, float)):
49
  return str(value)
 
 
 
50
  return html.escape(str(value))
51
 
52
  def create_html_table_main(df):
@@ -169,8 +171,9 @@ def create_html_table_benchmark(df, benchmark):
169
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
170
  elif column == "Reproduced_all":
171
  continue
172
- # elif column == "Score":
173
- # html += f'<td>{row[column]}</td>'
 
174
  else:
175
  html += f'<td>{sanitize_cell_value(row[column])}</td>'
176
  html += '</tr>'
@@ -205,6 +208,19 @@ def check_sanity(agent):
205
 
206
  def main():
207
  st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  st.markdown("""
209
  <head>
210
  <meta http-equiv="Content-Security-Policy"
@@ -477,9 +493,8 @@ MIT
477
  if dfs_to_concat:
478
  df_ = pd.concat(dfs_to_concat, ignore_index=True)
479
  df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
 
480
  df_['Score'] = df_['Score'].astype(str)
481
- df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
482
- df_ = df_.drop(columns=['std_err'])
483
  html_table = create_html_table_benchmark(df_, benchmark)
484
  st.markdown(html_table, unsafe_allow_html=True)
485
 
 
44
  return html.escape(str(col))
45
 
46
  def sanitize_cell_value(value: Any) -> str:
 
47
  if isinstance(value, (int, float)):
48
  return str(value)
49
+ if isinstance(value, str) and '±' in value:
50
+ score, std_err = value.split('±')
51
+ return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err.strip()}</span>'
52
  return html.escape(str(value))
53
 
54
  def create_html_table_main(df):
 
171
  html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
172
  elif column == "Reproduced_all":
173
  continue
174
+ elif column == "Score":
175
+ score_with_std_err = f'{row[column]} ± {row["std_err"]}'
176
+ html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
177
  else:
178
  html += f'<td>{sanitize_cell_value(row[column])}</td>'
179
  html += '</tr>'
 
208
 
209
  def main():
210
  st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
211
+ st.markdown("""
212
+ <style>
213
+ :root {
214
+ --lighter-color: #888; /* Default for light theme */
215
+ }
216
+ @media (prefers-color-scheme: dark) {
217
+ :root {
218
+ --lighter-color: #ccc; /* Default for dark theme */
219
+ }
220
+ }
221
+ </style>
222
+ """, unsafe_allow_html=True)
223
+
224
  st.markdown("""
225
  <head>
226
  <meta http-equiv="Content-Security-Policy"
 
493
  if dfs_to_concat:
494
  df_ = pd.concat(dfs_to_concat, ignore_index=True)
495
  df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
496
+ df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
497
  df_['Score'] = df_['Score'].astype(str)
 
 
498
  html_table = create_html_table_benchmark(df_, benchmark)
499
  st.markdown(html_table, unsafe_allow_html=True)
500
 
results/Bgym-Claude-3.5-Sonnet/assistantbench.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "benchmark": "AssistantBench",
6
+ "score": 5.2,
7
+ "std_err": 1.5,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Claude-3.5-Sonnet/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "benchmark": "MiniWoB",
6
+ "score": 69.8,
7
+ "std_err": 1.8,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Claude-3.5-Sonnet/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "benchmark": "WebArena",
6
+ "score": 36.2,
7
+ "std_err": 1.7,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Claude-3.5-Sonnet/weblinx.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Claude-3.5-Sonnet",
4
+ "study_id": "study_id",
5
+ "benchmark": "WebLINX",
6
+ "score": 13.7,
7
+ "std_err": 0.6,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-GPT-4o-mini/assistantbench.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "AssistantBench",
7
+ "score": 2.1,
8
+ "std_err": 1.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-mini/miniwob.json CHANGED
@@ -4,8 +4,8 @@
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
- "score": 58.8,
8
- "std_err": 1.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
+ "score": 56.6,
8
+ "std_err": 2.0,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o-mini/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 17.4,
8
+ "std_err": 1.3,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-mini/weblinx.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebLINX",
7
+ "score": 11.6,
8
+ "std_err": 0.6,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-mini/workarena-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/assistantbench.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "AssistantBench",
7
+ "score": 4.8,
8
+ "std_err": 2.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/miniwob.json CHANGED
@@ -4,7 +4,7 @@
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
- "score": 65.6,
8
  "std_err": 1.9,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
 
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "MiniWoB",
7
+ "score": 63.8,
8
  "std_err": 1.9,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
results/Bgym-GPT-4o/webarena.json CHANGED
@@ -4,8 +4,8 @@
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
7
- "score": 23.5,
8
- "std_err": 0.4,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
 
4
  "study_id": "study_id",
5
  "date_time": "2021-01-01 12:00:00",
6
  "benchmark": "WebArena",
7
+ "score": 31.4,
8
+ "std_err": 1.6,
9
  "benchmark_specific": "No",
10
  "benchmark_tuned": "No",
11
  "followed_evaluation_protocol": "Yes",
results/Bgym-GPT-4o/weblinx.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebLINX",
7
+ "score": 12.5,
8
+ "std_err": 0.6,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-o1-mini/assistantbench.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "AssistantBench",
7
+ "score": 6.9,
8
+ "std_err": 2.2,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-o1-mini/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 67.8,
8
+ "std_err": 1.9,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-o1-mini/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 28.6,
8
+ "std_err": 1.6,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-o1-mini/weblinx.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebLINX",
7
+ "score": 12.5,
8
+ "std_err": 0.6,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-o1-mini/workarena-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-o1-mini",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ### Llama-3.1-405B
results/Bgym-Llama-3.1-405b/assistantbench.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "benchmark": "AssistantBench",
6
+ "score": 3.9,
7
+ "std_err": 1.0,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "benchmark": "MiniWoB",
6
+ "score": 64.6,
7
+ "std_err": 1.9,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WebArena",
6
+ "score": 24.0,
7
+ "std_err": 1.5,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/weblinx.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WebLINX",
6
+ "score": 7.9,
7
+ "std_err": 0.5,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 43.3,
7
+ "std_err": 2.7,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/workarena-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L2",
7
+ "score": 7.2,
8
+ "std_err": 1.7,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-405b/workarena-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-405b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L3",
6
+ "score": 0.0,
7
+ "std_err": 0.0,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-70b/assistantbench.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "AssistantBench",
6
+ "score": 2.8,
7
+ "std_err": 1.1,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-70b/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "MiniWoB",
6
+ "score": 57.6,
7
+ "std_err": 2.0,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-70b/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WebArena",
6
+ "score": 18.4,
7
+ "std_err": 1.4,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-70b/weblinx.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WebLINX",
6
+ "score": 8.9,
7
+ "std_err": 0.5,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]
results/Bgym-Llama-3.1-70b/workarena-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3.1-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L3",
6
+ "score": 0.0,
7
+ "std_err": 0.0,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ }
16
+ ]