---
# This file contains information about verified agent results for different benchmarks.
# Format:
# benchmark_name:
#   - agent_name: "Name of the agent"
#     verification_date: YYYY-MM-DD
# Verified agents for the `usaco` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
# Agent names are quoted uniformly; dates are left as plain scalars so their
# parsed type is unchanged for the consumer.
usaco:
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Reflexion (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Episodic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-20
  - agent_name: "USACO Zero-shot (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-11
  - agent_name: "USACO Semantic (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-12
  - agent_name: "USACO Reflexion + Episodic + Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "USACO Reflexion + Episodic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "USACO Reflexion + Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (2x) (gpt-4o-2024-05-13)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (3x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (2x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Retrial (5x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-25
  - agent_name: "Episodic Warming (3 Steps) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "USACO Episodic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
  - agent_name: "USACO Semantic (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (2x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (3x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "Zero-shot Retrial (5x) (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-24
  - agent_name: "USACO Zero-shot (gpt-4o-2024-05-13)"
    verification_date: 2024-08-24
# Verified agents for the `swebench_verified_mini` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
swebench_verified_mini:
  - agent_name: "Agentless (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-17
  - agent_name: "SWE-agent (gpt-4o-mini-2024-07-18) (Cost Limit: $1)"
    verification_date: 2024-08-19
  - agent_name: "Moatless (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-10-30
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
    verification_date: 2024-10-30
  - agent_name: "Moatless (claude-3-5-sonnet-20241022)"
    verification_date: 2024-10-30
  - agent_name: "Agentless (o1-mini-2024-09-12)"
    verification_date: 2024-10-30
# Verified agents for the `swebench_verified` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
swebench_verified:
  - agent_name: "Moatless (gpt-4o-2024-08-06)"
    verification_date: 2024-10-30
# Verified agents for the `mlagentbench` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
mlagentbench:
  - agent_name: "MLAgentBench ResearchAgent (gpt-4o-mini-2024-07-18)"
    verification_date: 2024-08-19
# Verified agents for the `corebench_easy` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
corebench_easy:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28
# Verified agents for the `corebench_medium` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
corebench_medium:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28
# Verified agents for the `corebench_hard` benchmark.
# Each list item pairs an agent_name with its verification_date (YYYY-MM-DD).
corebench_hard:
  - agent_name: "AutoGPT (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "AutoGPT (GPT-4o-mini)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o)"
    verification_date: 2024-09-28
  - agent_name: "CORE-Agent (GPT-4o-mini)"
    verification_date: 2024-09-28