WebAgent-Leaderboard / results.json
megh1211's picture
Init push
a59bcfa
{
"workarena_agent_curriculum": [
{
"Model": "GPT-3.5",
"WorkArena-L1": 6.1,
"WorkArena++-L2": 0.0,
"WorkArena++-L3": 0.0,
"MiniWoB": 43.4,
"WebArena": 6.7
},
{
"Model": "GPT-4o",
"WorkArena-L1": 42.7,
"WorkArena++-L2": 3.0,
"WorkArena++-L3": 0.0,
"MiniWoB": 71.3,
"WebArena": 23.5
},
{
"Model": "GPT-4o-V",
"WorkArena-L1": 41.8,
"WorkArena++-L2": 3.8,
"WorkArena++-L3": 0.0,
"MiniWoB": 72.5,
"WebArena": 24.0
},
{
"Model": "LLaMA-3-70b",
"WorkArena-L1": 17.9,
"WorkArena++-L2": 0.0,
"WorkArena++-L3": 0.0,
"MiniWoB": 68.2,
"WebArena": 11.0
},
{
"Model": "Mixtral-8x22b",
"WorkArena-L1": 12.4,
"WorkArena++-L2": 0.0,
"WorkArena++-L3": 0.0,
"MiniWoB": 62.4,
"WebArena": 12.6
}
],
"workarena_l2_agent_curriculum": [
{
"Model": "GPT-3.5",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
},
{
"Model": "GPT-4o",
"Overall": 3.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 14.6
},
{
"Model": "GPT-4o-V",
"Overall": 3.8,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 3.6,
"Sophisticated Memorization": 14.6
},
{
"Model": "LLaMA-3-70b",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
},
{
"Model": "Mixtral-8x22b",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
}
],
"workarena_l2_human_curriculum": [
{
"Model": "Human",
"Overall": 93.9,
"Contextual Understanding": 100.0,
"Data-driven Decision Making": 84.6,
"Planning and Problem Solving": 100.0,
"Information Retrieval": 100.0,
"Sophisticated Memorization": 91.7
},
{
"Model": "GPT-4o",
"Overall": 2.1,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 8.3
}
],
"workarena_l3_agent_curriculum": [
{
"Model": "GPT-3.5",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
},
{
"Model": "GPT-4o",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
},
{
"Model": "GPT-4o-V",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
},
{
"Model": "LLaMA-3-70b",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
},
{
"Model": "Mixtral-8x22b",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
}
],
"workarena_l3_human_curriculum": [
{
"Model": "Human",
"Overall": 93.9,
"Contextual Understanding": 87.5,
"Data-driven Decision Making": 100.0,
"Planning and Problem Solving": 87.5,
"Information Retrieval": 100.0,
"Sophisticated Memorization": 91.7
},
{
"Model": "GPT-4o",
"Overall": 0.0,
"Contextual Understanding": 0.0,
"Data-driven Decision Making": 0.0,
"Planning and Problem Solving": 0.0,
"Information Retrieval": 0.0,
"Sophisticated Memorization": 0.0
}
]
}