import { Benchmark } from "./types";

/**
 * Benchmark scores and list prices for OpenAI models.
 *
 * Each entry records the model identifier, its provider, input/output prices,
 * the URL the figures were taken from (`source`), and a `benchmark` map of
 * benchmark key to score. Benchmarks that are not listed for a model are
 * simply omitted from its map.
 *
 * NOTE(review): prices look like USD per 1M tokens — confirm against the
 * `Benchmark` type declaration.
 */
export const openaiBenchmarks: Benchmark[] = [
  // --- GPT-4o family ---
  {
    model: "GPT-4o-2024-11-20",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.7,
      gpqa: 46.0,
      humaneval: 90.2,
      simpleqa: 38.8,
    },
  },
  {
    model: "GPT-4o-2024-08-06",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 88.7,
      gpqa: 53.1,
      humaneval: 90.2,
      simpleqa: 40.1,
    },
  },
  {
    model: "GPT-4o-2024-05-13",
    provider: "OpenAI",
    inputPrice: 5.0,
    outputPrice: 15.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 87.2,
      gpqa: 49.9,
      humaneval: 91.0,
      simpleqa: 39.0,
    },
  },
  {
    model: "GPT-4o-mini-2024-07-18",
    provider: "OpenAI",
    inputPrice: 0.15,
    outputPrice: 0.6,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 82.0,
      gpqa: 40.2,
      humaneval: 87.2,
      mmmu: 59.4,
      simpleqa: 9.5,
    },
  },

  // --- GPT-4.1 family ---
  {
    model: "GPT-4.1-2025-04-14",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 90.2,
      gpqa: 66.3,
      gpqa_diamond: 66.3,
      humaneval: 94.5,
      simpleqa: 41.6,
      swe_bench_verified: 54.6,
      aider_polyglot: 52.9,
      // Multilingual MMLU is a separate benchmark from MMLU; this field was
      // previously a copy of `mmlu` (90.2).
      mmmlu: 87.3,
      video_mme: 72.0,
      aime_24: 48.1,
      mrcr_v2_avg_128k: 57.2,
      mrcr_v2_pointwise_1m: 46.3,
    },
  },
  {
    model: "GPT-4.1-mini-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.4,
    outputPrice: 1.6,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 87.5,
      gpqa: 65.0,
      gpqa_diamond: 65.0,
      humaneval: 93.8,
      simpleqa: 16.8,
      swe_bench_verified: 23.6,
      aider_polyglot: 31.6,
      // Multilingual MMLU; previously a copy of `mmlu` (87.5).
      mmmlu: 78.5,
      aime_24: 49.6,
      mrcr_v2_avg_128k: 47.2,
      mrcr_v2_pointwise_1m: 33.3,
    },
  },
  {
    model: "GPT-4.1-nano-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.1,
    outputPrice: 0.4,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 80.1,
      gpqa: 50.3,
      gpqa_diamond: 50.3,
      humaneval: 87.0,
      simpleqa: 7.6,
      // NOTE(review): 9.8 looks like the Aider polyglot "whole" score for this
      // model rather than a SWE-bench Verified score — confirm against `source`.
      swe_bench_verified: 9.8,
      aider_polyglot: 6.2,
      // Multilingual MMLU; previously a copy of `mmlu` (80.1).
      mmmlu: 66.9,
      aime_24: 29.4,
      mrcr_v2_avg_128k: 36.6,
      mrcr_v2_pointwise_1m: 12.0,
    },
  },

  // --- GPT-4.5 and GPT-4 previews ---
  {
    model: "GPT-4.5-preview-2025-02-27",
    provider: "OpenAI",
    inputPrice: 75.0,
    outputPrice: 150.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 90.8,
      gpqa: 69.5,
      simpleqa: 62.5,
      humaneval: 88.6,
    },
  },
  {
    model: "GPT-4-turbo-2024-04-09",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 86.7,
      gpqa: 49.3,
      humaneval: 88.2,
      simpleqa: 24.2,
    },
  },
  {
    model: "GPT-4-0125-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.4,
      gpqa: 41.4,
      humaneval: 86.6,
    },
  },
  {
    model: "GPT-4-1106-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 84.7,
      gpqa: 42.5,
      humaneval: 83.7,
    },
  },

  // --- o-series reasoning models ---
  {
    model: "OpenAI o3",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 91.6,
      aime_2025: 88.9,
      gpqa_diamond: 83.3,
      humanitys_last_exam: 20.32,
      mmmu: 82.9,
      swe_bench_verified: 69.1,
      aider_polyglot: 81.3,
      tau_bench_airline: 52.0,
      tau_bench_retail: 73.9,
    },
  },
  {
    model: "OpenAI o3-pro",
    provider: "OpenAI",
    inputPrice: 20.0,
    outputPrice: 80.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      // NOTE(review): identical to the "OpenAI o3" entry's value — confirm
      // this figure was reported for o3-pro specifically.
      gpqa_diamond: 83.3,
      humanitys_last_exam: 24.9,
    },
  },
  {
    model: "OpenAI o4-mini",
    provider: "OpenAI",
    inputPrice: 1.1,
    outputPrice: 4.4,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 93.4,
      aime_2025: 92.7,
      gpqa_diamond: 81.4,
      humanitys_last_exam: 14.28,
      mmmu: 81.6,
      swe_bench_verified: 68.1,
      aider_polyglot: 68.9,
      tau_bench_airline: 49.2,
      tau_bench_retail: 71.8,
    },
  },
  {
    model: "OpenAI o1",
    provider: "OpenAI",
    inputPrice: 15.0,
    outputPrice: 60.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 74.3,
      aime_2025: 79.2,
      gpqa_diamond: 78.0,
      humanitys_last_exam: 8.12,
      mmmu: 77.6,
      swe_bench_verified: 48.9,
      aider_polyglot: 64.4,
      tau_bench_airline: 50.0,
      tau_bench_retail: 70.8,
    },
  },
  {
    model: "OpenAI o3-mini",
    provider: "OpenAI",
    inputPrice: 1.1,
    outputPrice: 4.4,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 87.3,
      aime_2025: 86.5,
      gpqa_diamond: 77.0,
      humanitys_last_exam: 13.4,
      swe_bench_verified: 49.3,
      // NOTE(review): the sibling o-series entries appear to use the "whole"
      // edit-format score; 61.7 may be a "diff" score from another model's
      // row — confirm against `source`.
      aider_polyglot: 61.7,
      tau_bench_airline: 32.4,
      tau_bench_retail: 57.6,
    },
  },
];
|
|