Spaces:

meghsn
/

WebAgent-Leaderboard

Sleeping

App Files Files Community

WebAgent-Leaderboard / results.json

megh1211

Init push

a59bcfa 4 months ago

raw

history blame contribute delete

5.7 kB

	{
	"workarena_agent_curriculum": [
	{
	"Model": "GPT-3.5",
	"WorkArena-L1": 6.1,
	"WorkArena++-L2": 0.0,
	"WorkArena++-L3": 0.0,
	"MiniWoB": 43.4,
	"WebArena": 6.7
	},
	{
	"Model": "GPT-4o",
	"WorkArena-L1": 42.7,
	"WorkArena++-L2": 3.0,
	"WorkArena++-L3": 0.0,
	"MiniWoB": 71.3,
	"WebArena": 23.5
	},
	{
	"Model": "GPT-4o-V",
	"WorkArena-L1": 41.8,
	"WorkArena++-L2": 3.8,
	"WorkArena++-L3": 0.0,
	"MiniWoB": 72.5,
	"WebArena": 24.0
	},
	{
	"Model": "LLaMA-3-70b",
	"WorkArena-L1": 17.9,
	"WorkArena++-L2": 0.0,
	"WorkArena++-L3": 0.0,
	"MiniWoB": 68.2,
	"WebArena": 11.0
	},
	{
	"Model": "Mixtral-8x22b",
	"WorkArena-L1": 12.4,
	"WorkArena++-L2": 0.0,
	"WorkArena++-L3": 0.0,
	"MiniWoB": 62.4,
	"WebArena": 12.6
	}
	],
	"workarena_l2_agent_curriculum": [
	{
	"Model": "GPT-3.5",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	},
	{
	"Model": "GPT-4o",
	"Overall": 3.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 14.6
	},
	{
	"Model": "GPT-4o-V",
	"Overall": 3.8,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 3.6,
	"Sophisticated Memorization": 14.6
	},
	{
	"Model": "LLaMA-3-70b",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	},
	{
	"Model": "Mixtral-8x22b",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	}
	],
	"workarena_l2_human_curriculum": [
	{
	"Model": "Human",
	"Overall": 93.9,
	"Contextual Understanding": 100.0,
	"Data-driven Decision Making": 84.6,
	"Planning and Problem Solving": 100.0,
	"Information Retrieval": 100.0,
	"Sophisticated Memorization": 91.7
	},
	{
	"Model": "GPT-4o",
	"Overall": 2.1,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 8.3
	}
	],
	"workarena_l3_agent_curriculum": [
	{
	"Model": "GPT-3.5",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	},
	{
	"Model": "GPT-4o",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	},
	{
	"Model": "GPT-4o-V",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	},
	{
	"Model": "LLaMA-3-70b",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	},
	{
	"Model": "Mixtral-8x22b",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	}
	],
	"workarena_l3_human_curriculum": [
	{
	"Model": "Human",
	"Overall": 93.9,
	"Contextual Understanding": 87.5,
	"Data-driven Decision Making": 100.0,
	"Planning and Problem Solving": 87.5,
	"Information Retrieval": 100.0,
	"Sophisticated Memorization": 91.7
	},
	{
	"Model": "GPT-4o",
	"Overall": 0.0,
	"Contextual Understanding": 0.0,
	"Data-driven Decision Making": 0.0,
	"Planning and Problem Solving": 0.0,
	"Information Retrieval": 0.0,
	"Sophisticated Memorization": 0.0
	}
	]
	}