Updated latest results
Browse files- app.py +20 -5
- results/Bgym-Claude-3.5-Sonnet/assistantbench.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/miniwob.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/webarena.json +16 -0
- results/Bgym-Claude-3.5-Sonnet/weblinx.json +16 -0
- results/Bgym-GPT-4o-mini/assistantbench.json +16 -0
- results/Bgym-GPT-4o-mini/miniwob.json +2 -2
- results/Bgym-GPT-4o-mini/webarena.json +16 -0
- results/Bgym-GPT-4o-mini/weblinx.json +16 -0
- results/Bgym-GPT-4o-mini/workarena-l3.json +16 -0
- results/Bgym-GPT-4o/assistantbench.json +16 -0
- results/Bgym-GPT-4o/miniwob.json +1 -1
- results/Bgym-GPT-4o/webarena.json +2 -2
- results/Bgym-GPT-4o/weblinx.json +16 -0
- results/Bgym-GPT-o1-mini/assistantbench.json +16 -0
- results/Bgym-GPT-o1-mini/miniwob.json +16 -0
- results/Bgym-GPT-o1-mini/webarena.json +16 -0
- results/Bgym-GPT-o1-mini/weblinx.json +16 -0
- results/Bgym-GPT-o1-mini/workarena-l3.json +16 -0
- results/Bgym-Llama-3.1-405b/README.md +1 -0
- results/Bgym-Llama-3.1-405b/assistantbench.json +16 -0
- results/Bgym-Llama-3.1-405b/miniwob.json +16 -0
- results/Bgym-Llama-3.1-405b/webarena.json +16 -0
- results/Bgym-Llama-3.1-405b/weblinx.json +16 -0
- results/Bgym-Llama-3.1-405b/workarena-l1.json +16 -0
- results/Bgym-Llama-3.1-405b/workarena-l2.json +16 -0
- results/Bgym-Llama-3.1-405b/workarena-l3.json +16 -0
- results/Bgym-Llama-3.1-70b/assistantbench.json +16 -0
- results/Bgym-Llama-3.1-70b/miniwob.json +16 -0
- results/Bgym-Llama-3.1-70b/webarena.json +16 -0
- results/Bgym-Llama-3.1-70b/weblinx.json +16 -0
- results/Bgym-Llama-3.1-70b/workarena-l3.json +16 -0
app.py
CHANGED
@@ -44,9 +44,11 @@ def sanitize_column_name(col: str) -> str:
|
|
44 |
return html.escape(str(col))
|
45 |
|
46 |
def sanitize_cell_value(value: Any) -> str:
|
47 |
-
"""Sanitize cell values for HTML display"""
|
48 |
if isinstance(value, (int, float)):
|
49 |
return str(value)
|
|
|
|
|
|
|
50 |
return html.escape(str(value))
|
51 |
|
52 |
def create_html_table_main(df):
|
@@ -169,8 +171,9 @@ def create_html_table_benchmark(df, benchmark):
|
|
169 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
170 |
elif column == "Reproduced_all":
|
171 |
continue
|
172 |
-
|
173 |
-
|
|
|
174 |
else:
|
175 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
176 |
html += '</tr>'
|
@@ -205,6 +208,19 @@ def check_sanity(agent):
|
|
205 |
|
206 |
def main():
|
207 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
208 |
st.markdown("""
|
209 |
<head>
|
210 |
<meta http-equiv="Content-Security-Policy"
|
@@ -477,9 +493,8 @@ MIT
|
|
477 |
if dfs_to_concat:
|
478 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
479 |
df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
|
|
|
480 |
df_['Score'] = df_['Score'].astype(str)
|
481 |
-
df_['Score'] = df_.apply(lambda row: f"{row['Score']} ± {row['std_err']}", axis=1)
|
482 |
-
df_ = df_.drop(columns=['std_err'])
|
483 |
html_table = create_html_table_benchmark(df_, benchmark)
|
484 |
st.markdown(html_table, unsafe_allow_html=True)
|
485 |
|
|
|
44 |
return html.escape(str(col))
|
45 |
|
46 |
def sanitize_cell_value(value: Any) -> str:
|
|
|
47 |
if isinstance(value, (int, float)):
|
48 |
return str(value)
|
49 |
+
if isinstance(value, str) and '±' in value:
|
50 |
+
score, std_err = value.split('±')
|
51 |
+
return f'{score.strip()} <span style="font-size: smaller; color: var(--lighter-color);">±{std_err.strip()}</span>'
|
52 |
return html.escape(str(value))
|
53 |
|
54 |
def create_html_table_main(df):
|
|
|
171 |
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
172 |
elif column == "Reproduced_all":
|
173 |
continue
|
174 |
+
elif column == "Score":
|
175 |
+
score_with_std_err = f'{row[column]} ± {row["std_err"]}'
|
176 |
+
html += f'<td>{sanitize_cell_value(score_with_std_err)}</td>'
|
177 |
else:
|
178 |
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
179 |
html += '</tr>'
|
|
|
208 |
|
209 |
def main():
|
210 |
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
211 |
+
st.markdown("""
|
212 |
+
<style>
|
213 |
+
:root {
|
214 |
+
--lighter-color: #888; /* Default for light theme */
|
215 |
+
}
|
216 |
+
@media (prefers-color-scheme: dark) {
|
217 |
+
:root {
|
218 |
+
--lighter-color: #ccc; /* Default for dark theme */
|
219 |
+
}
|
220 |
+
}
|
221 |
+
</style>
|
222 |
+
""", unsafe_allow_html=True)
|
223 |
+
|
224 |
st.markdown("""
|
225 |
<head>
|
226 |
<meta http-equiv="Content-Security-Policy"
|
|
|
493 |
if dfs_to_concat:
|
494 |
df_ = pd.concat(dfs_to_concat, ignore_index=True)
|
495 |
df_['Score'] = df_['Score'].apply(lambda x: f"{x:.2f}" if x != "-" else "-")
|
496 |
+
df_['std_err'] = df_['std_err'].apply(lambda x: f"{x:.1f}" if x != "-" else "-")
|
497 |
df_['Score'] = df_['Score'].astype(str)
|
|
|
|
|
498 |
html_table = create_html_table_benchmark(df_, benchmark)
|
499 |
st.markdown(html_table, unsafe_allow_html=True)
|
500 |
|
results/Bgym-Claude-3.5-Sonnet/assistantbench.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "AssistantBench",
|
6 |
+
"score": 5.2,
|
7 |
+
"std_err": 1.5,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Claude-3.5-Sonnet/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 69.8,
|
7 |
+
"std_err": 1.8,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Claude-3.5-Sonnet/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 36.2,
|
7 |
+
"std_err": 1.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Claude-3.5-Sonnet/weblinx.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Claude-3.5-Sonnet",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WebLINX",
|
6 |
+
"score": 13.7,
|
7 |
+
"std_err": 0.6,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o-mini/assistantbench.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "AssistantBench",
|
7 |
+
"score": 2.1,
|
8 |
+
"std_err": 1.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o-mini/miniwob.json
CHANGED
@@ -4,8 +4,8 @@
|
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
+
"score": 56.6,
|
8 |
+
"std_err": 2.0,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-4o-mini/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 17.4,
|
8 |
+
"std_err": 1.3,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o-mini/weblinx.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebLINX",
|
7 |
+
"score": 11.6,
|
8 |
+
"std_err": 0.6,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o-mini/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L3",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o/assistantbench.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "AssistantBench",
|
7 |
+
"score": 4.8,
|
8 |
+
"std_err": 2.4,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-4o/miniwob.json
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
-
"score":
|
8 |
"std_err": 1.9,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
|
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "MiniWoB",
|
7 |
+
"score": 63.8,
|
8 |
"std_err": 1.9,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
results/Bgym-GPT-4o/webarena.json
CHANGED
@@ -4,8 +4,8 @@
|
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebArena",
|
7 |
-
"score":
|
8 |
-
"std_err":
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
|
|
4 |
"study_id": "study_id",
|
5 |
"date_time": "2021-01-01 12:00:00",
|
6 |
"benchmark": "WebArena",
|
7 |
+
"score": 31.4,
|
8 |
+
"std_err": 1.6,
|
9 |
"benchmark_specific": "No",
|
10 |
"benchmark_tuned": "No",
|
11 |
"followed_evaluation_protocol": "Yes",
|
results/Bgym-GPT-4o/weblinx.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-4o",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebLINX",
|
7 |
+
"score": 12.5,
|
8 |
+
"std_err": 0.6,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-o1-mini/assistantbench.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "AssistantBench",
|
7 |
+
"score": 6.9,
|
8 |
+
"std_err": 2.2,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-o1-mini/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "MiniWoB",
|
7 |
+
"score": 67.8,
|
8 |
+
"std_err": 1.9,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-o1-mini/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebArena",
|
7 |
+
"score": 28.6,
|
8 |
+
"std_err": 1.6,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-o1-mini/weblinx.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WebLINX",
|
7 |
+
"score": 12.5,
|
8 |
+
"std_err": 0.6,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-GPT-o1-mini/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-GPT-o1-mini",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L3",
|
7 |
+
"score": 0.0,
|
8 |
+
"std_err": 0.0,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
### Llama-3.1-405B
|
results/Bgym-Llama-3.1-405b/assistantbench.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "AssistantBench",
|
6 |
+
"score": 3.9,
|
7 |
+
"std_err": 1.0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 64.6,
|
7 |
+
"std_err": 1.9,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 24.0,
|
7 |
+
"std_err": 1.5,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/weblinx.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WebLINX",
|
6 |
+
"score": 7.9,
|
7 |
+
"std_err": 0.5,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/workarena-l1.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WorkArena-L1",
|
6 |
+
"score": 43.3,
|
7 |
+
"std_err": 2.7,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/workarena-l2.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"date_time": "2021-01-01 12:00:00",
|
6 |
+
"benchmark": "WorkArena-L2",
|
7 |
+
"score": 7.2,
|
8 |
+
"std_err": 1.7,
|
9 |
+
"benchmark_specific": "No",
|
10 |
+
"benchmark_tuned": "No",
|
11 |
+
"followed_evaluation_protocol": "Yes",
|
12 |
+
"reproducible": "Yes",
|
13 |
+
"comments": "NA",
|
14 |
+
"original_or_reproduced": "Original"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-405b/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-405b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WorkArena-L3",
|
6 |
+
"score": 0.0,
|
7 |
+
"std_err": 0.0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-70b/assistantbench.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "AssistantBench",
|
6 |
+
"score": 2.8,
|
7 |
+
"std_err": 1.1,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-70b/miniwob.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "MiniWoB",
|
6 |
+
"score": 57.6,
|
7 |
+
"std_err": 2.0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-70b/webarena.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WebArena",
|
6 |
+
"score": 18.4,
|
7 |
+
"std_err": 1.4,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-70b/weblinx.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WebLINX",
|
6 |
+
"score": 8.9,
|
7 |
+
"std_err": 0.5,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|
results/Bgym-Llama-3.1-70b/workarena-l3.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"agent_name": "Bgym-Llama-3.1-70b",
|
4 |
+
"study_id": "study_id",
|
5 |
+
"benchmark": "WorkArena-L3",
|
6 |
+
"score": 0.0,
|
7 |
+
"std_err": 0.0,
|
8 |
+
"benchmark_specific": "No",
|
9 |
+
"benchmark_tuned": "No",
|
10 |
+
"followed_evaluation_protocol": "Yes",
|
11 |
+
"reproducible": "Yes",
|
12 |
+
"comments": "NA",
|
13 |
+
"original_or_reproduced": "Original",
|
14 |
+
"date_time": "2021-01-01 12:00:00"
|
15 |
+
}
|
16 |
+
]
|