BiGGen-Bench-Leaderboard / data /eval_by_gpt-4-turbo-2024-04-09.csv
scottsuk0306's picture
Update
0c155df
raw
history blame contribute delete
No virus
11.2 kB
model_name,grounding,instruction_following,planning,reasoning,refinement,safety,theory_of_mind,tool_usage,multilingual
phi-1,1.1125,1.01,1.0,1.0,1.4342105263157894,1.5072463768115942,1.0,1.0125,
phi-1_5,2.475,2.89,2.5,2.24,2.526315789473684,2.869565217391304,2.95,1.525,
phi-2,3.1375,2.92,2.857142857142857,2.8,2.763157894736842,3.4057971014492754,3.2,1.7875,
Qwen1.5-0.5B,2.025,2.12,1.7,1.58,2.1578947368421053,2.0144927536231885,1.8,1.275,
Qwen1.5-1.8B,2.5375,2.85,2.3857142857142857,1.98,2.6052631578947367,2.4782608695652173,2.55,1.525,
Qwen1.5-4B,2.8875,2.94,2.7285714285714286,2.45,2.6973684210526314,3.3333333333333335,2.73,1.9,
gemma-2b,2.3375,2.72,2.357142857142857,2.16,2.0933333333333333,2.6231884057971016,2.32,1.4875,
OLMo-1B,1.7625,1.8,1.4428571428571428,1.33,1.9473684210526316,2.1884057971014492,1.59,1.125,
Qwen1.5-0.5B-Chat,2.2,2.61,2.057142857142857,1.76,2.0,2.391304347826087,2.38,1.4625,1.1594202898550725
Qwen1.5-1.8B-Chat,2.8125,3.27,2.914285714285714,2.28,2.8552631578947367,2.681159420289855,3.13,1.9875,1.3
Qwen1.5-4B-Chat,2.9,3.19,3.085714285714286,2.83,3.0,3.3333333333333335,3.07,2.4,1.4714285714285715
Phi-3-mini-4k-instruct,3.725,3.88,3.8,3.81,3.973684210526316,4.144927536231884,3.9,3.3375,1.9142857142857144
Phi-3-mini-128k-instruct,3.7125,3.8,3.7,3.82,3.513157894736842,3.9565217391304346,3.83,3.1,1.8285714285714285
gemma-2b-it,2.875,3.24,3.1142857142857143,2.48,2.8815789473684212,3.753623188405797,3.15,1.9625,1.6571428571428573
gemma-1.1-2b-it,2.9,3.34,3.2285714285714286,2.74,3.0526315789473686,3.971014492753623,3.37,1.975,1.4714285714285715
gemma-7b,1.325,1.49,1.1857142857142857,1.34,1.5789473684210527,2.1594202898550723,1.2,1.0125,
Mistral-7B-v0.1,3.225,3.3,3.242857142857143,2.86,2.763157894736842,3.4057971014492754,3.09,2.1625,
Mistral-7B-v0.2,3.15,3.33,3.1,2.78,2.891891891891892,3.3768115942028984,3.29,2.275,
Qwen1.5-7B,2.9875,3.14,3.0142857142857142,2.65,2.8266666666666667,3.101449275362319,2.77,2.4875,
Yi-6B,2.9375,2.97,2.657142857142857,2.36,2.486842105263158,3.2318840579710146,2.89,1.55,
Llama-2-7b-hf,2.6125,2.87,2.5142857142857142,2.18,2.210526315789474,3.217391304347826,2.6,1.45,
CodeLlama-7b-hf,1.9625,2.25,1.7714285714285714,1.72,2.1184210526315788,2.347826086956522,1.9,1.5625,
Meta-Llama-3-8B,3.2625,2.94,2.657142857142857,2.39,3.039473684210526,2.898550724637681,2.82,1.9375,
llemma_7b,2.4125,2.57,2.085714285714286,2.24,2.3026315789473686,2.5217391304347827,2.19,1.8375,
OLMo-7B,2.3875,2.26,1.9285714285714286,1.84,2.1052631578947367,2.652173913043478,2.16,1.3125,
gemma-7b-it,3.3125,3.43,3.0714285714285716,2.97,3.026315789473684,3.7681159420289854,3.15,2.325,1.7857142857142858
gemma-1.1-7b-it,3.5875,3.53,3.3714285714285714,3.25,3.25,4.043478260869565,3.44,2.7875,2.0
Mistral-7B-Instruct-v0.2,3.7,3.87,3.8,3.18,3.4473684210526314,3.8260869565217392,3.77,3.3625,2.2857142857142856
Qwen1.5-7B-Chat,3.5875,3.88,3.7142857142857144,3.3,3.3947368421052633,3.7246376811594204,3.7,3.15,2.057142857142857
Yi-6B-Chat,3.275,3.52,3.414285714285714,2.85,3.08,3.4782608695652173,3.676767676767677,2.3375,1.457142857142857
Llama-2-7b-chat-hf,3.3875,3.58,3.585714285714286,2.85,2.960526315789474,4.144927536231884,3.65,2.3,2.0285714285714285
CodeLlama-7b-Instruct-hf,3.2125,3.36,3.2857142857142856,2.75,2.960526315789474,3.753623188405797,3.22,2.575,1.7714285714285714
Meta-Llama-3-8B-Instruct,4.125,3.94,3.9285714285714284,3.47,3.506666666666667,3.7246376811594204,3.83,3.5,2.914285714285714
OLMo-7B-SFT,2.95,3.27,2.9571428571428573,2.4,2.6842105263157894,3.3333333333333335,2.93,2.0875,1.1857142857142857
OLMo-7B-Instruct,3.1125,3.54,3.2714285714285714,2.47,2.776315789473684,3.101449275362319,3.31,2.2125,1.4142857142857144
tulu-2-7b,2.8625,3.34,3.2285714285714286,2.81,2.973684210526316,3.63768115942029,3.26,2.2125,1.7142857142857142
tulu-2-dpo-7b,3.2375,3.76,3.5,2.79,3.0789473684210527,3.753623188405797,3.68,2.4375,1.9714285714285715
codetulu-2-7b,3.1125,3.41,3.1142857142857143,2.73,2.9078947368421053,3.246376811594203,3.25,2.7875,1.8
Orca-2-7b,2.425,2.27,1.3714285714285714,1.85,2.3157894736842106,2.5942028985507246,2.24,1.6,1.7285714285714286
openchat-3.5-0106,3.6375,3.84,3.757142857142857,3.34,3.5657894736842106,3.7246376811594204,3.66,3.125,2.157142857142857
OpenHermes-2-Mistral-7B,3.525,3.66,3.8,3.28,3.28,3.2318840579710146,3.45,2.925,1.9142857142857144
OpenHermes-2.5-Mistral-7B,3.6875,3.66,3.7285714285714286,3.28,3.276315789473684,3.4347826086956523,3.57,3.0625,2.1
Nous-Hermes-2-Mistral-7B-DPO,3.6625,3.74,3.8,3.26,3.3552631578947367,3.3768115942028984,3.69,3.0625,2.1714285714285713
Starling-LM-7B-alpha,3.7125,3.72,3.8285714285714287,3.33,3.223684210526316,3.9130434782608696,3.54,3.025,2.2285714285714286
Starling-LM-7B-beta,3.8,3.84,4.0,3.56,3.546666666666667,3.869565217391304,3.87,3.5625,2.2714285714285714
mistral-orpo-alpha,3.525,3.7,3.6,3.11,3.1710526315789473,3.971014492753623,3.5,2.95,2.085714285714286
mistral-orpo-beta,3.6125,3.8,3.6857142857142855,3.12,3.263157894736842,3.6956521739130435,3.58,3.025,2.1
zephyr-7b-beta,3.55,3.72,3.7285714285714286,3.23,3.3815789473684212,3.550724637681159,3.73,3.2875,1.9428571428571428
Qwen1.5-14B,3.5375,3.41,3.157142857142857,3.0,3.0921052631578947,2.579710144927536,3.16,2.9125,
Llama-2-13b-hf,2.85,3.09,2.7857142857142856,2.28,2.5789473684210527,3.347826086956522,2.88,1.8125,
CodeLlama-13b-hf,2.3,2.3,1.957142857142857,2.01,2.0921052631578947,2.449275362318841,2.15,1.8125,
SOLAR-10.7B-v1.0,3.25,3.56,3.3714285714285714,2.96,3.1973684210526314,3.6666666666666665,3.42,2.5625,
Qwen1.5-14B-Chat,3.625,3.9,3.857142857142857,3.36,3.263157894736842,3.8550724637681157,3.52,3.2,2.3857142857142857
SOLAR-10.7B-Instruct-v1.0,3.8125,3.77,3.857142857142857,3.42,3.3815789473684212,3.8260869565217392,3.9,3.4125,2.442857142857143
aya-101,1.2875,1.45,1.4714285714285715,1.25,1.9078947368421053,1.6666666666666667,1.38,1.1625,1.1285714285714286
Llama-2-13b-chat-hf,3.6625,3.92,3.6857142857142855,2.76,3.0789473684210527,4.318840579710145,3.71,2.6,2.1142857142857143
CodeLlama-13b-Instruct-hf,3.2625,3.34,3.357142857142857,2.77,2.8947368421052633,4.043478260869565,3.38,2.6,1.8857142857142857
tulu-2-13b,3.15,3.38,3.4,2.8,3.026666666666667,3.7681159420289854,3.39,2.775,2.0285714285714285
tulu-2-dpo-13b,3.45,3.77,3.6,2.9,3.1842105263157894,3.8405797101449277,3.59,3.05,2.142857142857143
codetulu-2-13b,3.225,3.5,3.4,2.8,3.1973684210526314,3.289855072463768,3.38,3.2375,1.8857142857142857
Orca-2-13b,2.9375,2.49,1.7857142857142858,2.24,2.486842105263158,2.8115942028985508,2.8,2.3625,2.0428571428571427
Yi-34B,3.5125,3.54,3.5285714285714285,3.27,3.24,3.579710144927536,3.39,2.5125,
llemma_34b,2.9875,2.97,2.742857142857143,2.75,2.8157894736842106,2.971014492753623,2.84,2.0875,
Qwen1.5-32B,3.325,3.64,3.5142857142857142,3.31,3.1184210526315788,3.3333333333333335,3.33,2.925,
CodeLlama-34b-hf,2.8125,2.66,2.4857142857142858,2.17,2.5657894736842106,2.7246376811594204,2.59,2.0625,
Mixtral-8x7B-v0.1,3.7125,3.58,3.5,3.3,3.236842105263158,3.869565217391304,3.59,2.775,
Yi-34B-Chat,3.7375,3.83,3.914285714285714,3.57,3.675675675675676,3.8840579710144927,3.96,3.0375,2.1857142857142855
Nous-Hermes-2-Yi-34B,3.3375,3.65,3.642857142857143,3.53,3.3733333333333335,3.536231884057971,3.56,3.175,2.0714285714285716
CodeLlama-34b-Instruct-hf,3.5,3.5,3.4571428571428573,3.04,3.0789473684210527,4.130434782608695,3.46,2.7375,2.1142857142857143
codetulu-2-34b,3.45,3.51,3.6857142857142855,3.01,3.210526315789474,3.652173913043478,3.5,3.35,2.0
Qwen1.5-32B-Chat,3.7875,3.85,4.0285714285714285,3.62,3.3947368421052633,4.217391304347826,3.87,3.7375,2.7142857142857144
Mixtral-8x7B-Instruct-v0.1,3.9,3.88,3.6,3.71,3.4342105263157894,3.8115942028985508,3.81,3.4125,2.7142857142857144
Nous-Hermes-2-Mixtral-8x7B-SFT,3.65,3.78,3.7142857142857144,3.39,3.460526315789474,3.608695652173913,3.63,3.5375,2.4
Nous-Hermes-2-Mixtral-8x7B-DPO,3.8125,4.06,3.9571428571428573,3.53,3.3421052631578947,3.739130434782609,3.79,3.6625,2.557142857142857
c4ai-command-r-v01,3.8125,3.88,3.9,3.39,3.4473684210526314,3.898550724637681,3.9,3.1875,2.1857142857142855
Llama-2-70b-hf,3.425,3.56,3.3857142857142857,3.06,3.1333333333333333,3.869565217391304,3.48,2.625,
CodeLlama-70b-hf,2.9375,2.62,2.557142857142857,2.44,2.506666666666667,2.8405797101449277,2.44,2.4,
Mixtral-8x22B-v0.1-AWQ,3.6875,3.7,3.742857142857143,3.5,3.539473684210526,4.0,3.49,3.1875,
Meta-Llama-3-70B,3.35,3.33,3.1142857142857143,3.04,3.3421052631578947,3.260869565217391,3.04,2.5,
Qwen1.5-72B,3.4875,3.6,3.5,3.25,3.2266666666666666,3.9420289855072466,3.38,2.9875,
Llama-2-70b-chat-hf,3.6625,3.88,3.9285714285714284,3.22,3.36,4.3768115942028984,3.73,3.1875,2.3857142857142857
CodeLlama-70b-Instruct-hf,2.85,2.7,2.6714285714285713,2.83,2.7466666666666666,4.101449275362318,2.55,1.9875,1.9285714285714286
tulu-2-dpo-70b,3.7,3.89,3.9,3.36,3.4210526315789473,3.753623188405797,3.83,3.6125,2.3142857142857145
c4ai-command-r-plus-GPTQ,3.9875,4.0,4.185714285714286,3.64,3.460526315789474,3.971014492753623,3.94,3.525,2.757142857142857
Meta-Llama-3-70B-Instruct,4.125,4.18,4.185714285714286,3.87,3.9066666666666667,4.0144927536231885,4.04,3.775,3.3142857142857145
Mixtral-8x22B-Instruct-v0.1-AWQ,4.0125,4.0,4.0,3.96,3.8421052631578947,4.086956521739131,3.87,3.7125,2.7142857142857144
zephyr-orpo-141b-A35b-v0.1-AWQ,3.55,3.62,3.9571428571428573,3.52,3.6184210526315788,3.449275362318841,3.58,3.2875,2.585714285714286
Qwen1.5-72B-Chat,3.8875,3.99,4.0285714285714285,3.68,3.6315789473684212,3.9565217391304346,3.96,3.525,2.914285714285714
qwen-110b-chat,4.15,4.01,4.228571428571429,3.94,3.8815789473684212,4.043478260869565,3.99,3.5875,2.7714285714285714
gpt-3.5-turbo-1106,4.025,3.79,3.8285714285714287,3.51,3.4342105263157894,4.0,3.67,3.1625,2.557142857142857
gpt-3.5-turbo-0125,3.925,3.85,3.842857142857143,3.65,3.4342105263157894,3.8840579710144927,3.79,3.1375,2.6142857142857143
gpt-4-1106-preview,4.2875,4.23,4.271428571428571,4.22,4.171052631578948,4.565217391304348,4.24,3.775,3.6
gpt-4-0125-preview,4.3,4.2,4.357142857142857,4.16,4.144736842105263,4.173913043478261,4.26,3.925,3.5428571428571427
gpt-4-turbo-2024-04-09,4.3125,4.13,4.3,4.2,4.105263157894737,4.086956521739131,4.12,3.8,3.4714285714285715
gpt-4o-2024-05-13,4.2375,4.26,4.357142857142857,4.21,4.078947368421052,4.057971014492754,4.08,3.85,3.642857142857143
mistral-medium,3.9625,3.94,4.0285714285714285,3.95,3.776315789473684,4.057971014492754,3.9,3.8625,2.9285714285714284
mistral-large,4.025,3.99,4.0285714285714285,3.93,3.776315789473684,3.9130434782608696,3.93,3.825,2.8857142857142857
gemini-1.0-pro,3.6,3.84,3.8714285714285714,3.62,3.3733333333333335,3.9420289855072466,3.75,3.125,3.1857142857142855
gemini-pro-1.5,4.05,4.04,4.128571428571429,4.06,3.6710526315789473,4.115942028985507,4.07,3.4875,3.257142857142857
gemini-flash-1.5,4.1375,3.91,3.9714285714285715,3.92,3.453333333333333,4.217391304347826,3.96,3.625,2.6714285714285713
claude-3-haiku-20240307,4.1375,4.01,4.128571428571429,3.69,3.6315789473684212,4.304347826086956,3.98,3.75,3.0714285714285716
claude-3-sonnet-20240229,4.25,3.92,4.171428571428572,3.91,3.723684210526316,4.36231884057971,4.0,3.75,3.1857142857142855
claude-3-opus-20240229,4.2875,4.06,4.185714285714286,3.97,3.9078947368421053,4.536231884057971,4.09,3.7875,3.5714285714285716