test-agent
#1
by
meghsn
- opened
results/test-agent/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
### Test agent
|
results/test-agent/miniwob.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "test-agent",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "MiniWoB",
|
| 7 |
+
"score": 43.4,
|
| 8 |
+
"std_err": 0.1,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/test-agent/webarena.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "test-agent",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WebArena",
|
| 7 |
+
"score": 6.7,
|
| 8 |
+
"std_err": 0.2,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/test-agent/workarena-l1.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "test-agent",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L1",
|
| 7 |
+
"score": 6.1,
|
| 8 |
+
"std_err": 0.3,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/test-agent/workarena-l2.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "test-agent",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L2",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|
results/test-agent/workarena-l3.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"agent_name": "test-agent",
|
| 4 |
+
"study_id": "study_id",
|
| 5 |
+
"date_time": "2021-01-01 12:00:00",
|
| 6 |
+
"benchmark": "WorkArena-L3",
|
| 7 |
+
"score": 0.0,
|
| 8 |
+
"std_err": 0.0,
|
| 9 |
+
"benchmark_specific": "No",
|
| 10 |
+
"benchmark_tuned": "No",
|
| 11 |
+
"followed_evaluation_protocol": "Yes",
|
| 12 |
+
"reproducible": "Yes",
|
| 13 |
+
"comments": "NA",
|
| 14 |
+
"original_or_reproduced": "Original"
|
| 15 |
+
}
|
| 16 |
+
]
|