Spaces:
Sleeping
Sleeping
{ | |
"workarena_agent_curriculum": [ | |
{ | |
"Model": "GPT-3.5", | |
"WorkArena-L1": 6.1, | |
"WorkArena++-L2": 0.0, | |
"WorkArena++-L3": 0.0, | |
"MiniWoB": 43.4, | |
"WebArena": 6.7 | |
}, | |
{ | |
"Model": "GPT-4o", | |
"WorkArena-L1": 42.7, | |
"WorkArena++-L2": 3.0, | |
"WorkArena++-L3": 0.0, | |
"MiniWoB": 71.3, | |
"WebArena": 23.5 | |
}, | |
{ | |
"Model": "GPT-4o-V", | |
"WorkArena-L1": 41.8, | |
"WorkArena++-L2": 3.8, | |
"WorkArena++-L3": 0.0, | |
"MiniWoB": 72.5, | |
"WebArena": 24.0 | |
}, | |
{ | |
"Model": "LLaMA-3-70b", | |
"WorkArena-L1": 17.9, | |
"WorkArena++-L2": 0.0, | |
"WorkArena++-L3": 0.0, | |
"MiniWoB": 68.2, | |
"WebArena": 11.0 | |
}, | |
{ | |
"Model": "Mixtral-8x22b", | |
"WorkArena-L1": 12.4, | |
"WorkArena++-L2": 0.0, | |
"WorkArena++-L3": 0.0, | |
"MiniWoB": 62.4, | |
"WebArena": 12.6 | |
} | |
], | |
"workarena_l2_agent_curriculum": [ | |
{ | |
"Model": "GPT-3.5", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
}, | |
{ | |
"Model": "GPT-4o", | |
"Overall": 3.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 14.6 | |
}, | |
{ | |
"Model": "GPT-4o-V", | |
"Overall": 3.8, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 3.6, | |
"Sophisticated Memorization": 14.6 | |
}, | |
{ | |
"Model": "LLaMA-3-70b", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
}, | |
{ | |
"Model": "Mixtral-8x22b", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
} | |
], | |
"workarena_l2_human_curriculum": [ | |
{ | |
"Model": "Human", | |
"Overall": 93.9, | |
"Contextual Understanding": 100.0, | |
"Data-driven Decision Making": 84.6, | |
"Planning and Problem Solving": 100.0, | |
"Information Retrieval": 100.0, | |
"Sophisticated Memorization": 91.7 | |
}, | |
{ | |
"Model": "GPT-4o", | |
"Overall": 2.1, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 8.3 | |
} | |
], | |
"workarena_l3_agent_curriculum": [ | |
{ | |
"Model": "GPT-3.5", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
}, | |
{ | |
"Model": "GPT-4o", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
}, | |
{ | |
"Model": "GPT-4o-V", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
}, | |
{ | |
"Model": "LLaMA-3-70b", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
}, | |
{ | |
"Model": "Mixtral-8x22b", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
} | |
], | |
"workarena_l3_human_curriculum": [ | |
{ | |
"Model": "Human", | |
"Overall": 93.9, | |
"Contextual Understanding": 87.5, | |
"Data-driven Decision Making": 100.0, | |
"Planning and Problem Solving": 87.5, | |
"Information Retrieval": 100.0, | |
"Sophisticated Memorization": 91.7 | |
}, | |
{ | |
"Model": "GPT-4o", | |
"Overall": 0.0, | |
"Contextual Understanding": 0.0, | |
"Data-driven Decision Making": 0.0, | |
"Planning and Problem Solving": 0.0, | |
"Information Retrieval": 0.0, | |
"Sophisticated Memorization": 0.0 | |
} | |
] | |
} |