mcp-bench / data.json
ztwang's picture
Upload 22 files
4966301 verified
raw
history blame
7.81 kB
{
"lastUpdated": "2025-09-05",
"models": [
{
"name": "llama-3-1-8b-instruct",
"overall_score": 0.428,
"valid_tool_schema": 96.1,
"compliance": 89.4,
"task_success": 90.9,
"schema_understanding": 0.261,
"task_completion": 0.295,
"tool_usage": 0.352,
"planning_effectiveness": 0.310,
"task_information": 0.221,
"tool_parameter": 0.141,
"dependency": 0.428
},
{
"name": "llama-3-2-90b-vision-instruct",
"overall_score": 0.495,
"valid_tool_schema": 99.6,
"compliance": 85.0,
"task_success": 90.9,
"schema_understanding": 0.293,
"task_completion": 0.444,
"tool_usage": 0.515,
"planning_effectiveness": 0.427,
"task_information": 0.267,
"tool_parameter": 0.173,
"dependency": 0.495
},
{
"name": "nova-micro-v1",
"overall_score": 0.508,
"valid_tool_schema": 96.0,
"compliance": 93.1,
"task_success": 87.8,
"schema_understanding": 0.339,
"task_completion": 0.419,
"tool_usage": 0.504,
"planning_effectiveness": 0.428,
"task_information": 0.315,
"tool_parameter": 0.212,
"dependency": 0.508
},
{
"name": "llama-3-1-70b-instruct",
"overall_score": 0.510,
"valid_tool_schema": 99.2,
"compliance": 90.5,
"task_success": 92.5,
"schema_understanding": 0.314,
"task_completion": 0.432,
"tool_usage": 0.523,
"planning_effectiveness": 0.451,
"task_information": 0.287,
"tool_parameter": 0.191,
"dependency": 0.510
},
{
"name": "mistral-small-2503",
"overall_score": 0.530,
"valid_tool_schema": 96.4,
"compliance": 95.6,
"task_success": 86.2,
"schema_understanding": 0.373,
"task_completion": 0.445,
"tool_usage": 0.537,
"planning_effectiveness": 0.446,
"task_information": 0.349,
"tool_parameter": 0.232,
"dependency": 0.530
},
{
"name": "gpt-4o-mini",
"overall_score": 0.557,
"valid_tool_schema": 97.5,
"compliance": 98.1,
"task_success": 93.9,
"schema_understanding": 0.374,
"task_completion": 0.500,
"tool_usage": 0.555,
"planning_effectiveness": 0.544,
"task_information": 0.352,
"tool_parameter": 0.201,
"dependency": 0.557
},
{
"name": "llama-3-3-70b-instruct",
"overall_score": 0.558,
"valid_tool_schema": 99.5,
"compliance": 93.8,
"task_success": 91.6,
"schema_understanding": 0.349,
"task_completion": 0.493,
"tool_usage": 0.583,
"planning_effectiveness": 0.525,
"task_information": 0.355,
"tool_parameter": 0.262,
"dependency": 0.558
},
{
"name": "gemma-3-27b-it",
"overall_score": 0.582,
"valid_tool_schema": 98.8,
"compliance": 97.6,
"task_success": 94.4,
"schema_understanding": 0.378,
"task_completion": 0.530,
"tool_usage": 0.608,
"planning_effectiveness": 0.572,
"task_information": 0.383,
"tool_parameter": 0.249,
"dependency": 0.582
},
{
"name": "gpt-4o",
"overall_score": 0.595,
"valid_tool_schema": 98.9,
"compliance": 98.3,
"task_success": 92.8,
"schema_understanding": 0.394,
"task_completion": 0.542,
"tool_usage": 0.627,
"planning_effectiveness": 0.587,
"task_information": 0.405,
"tool_parameter": 0.272,
"dependency": 0.595
},
{
"name": "gemini-2.5-flash-lite",
"overall_score": 0.598,
"valid_tool_schema": 99.4,
"compliance": 97.8,
"task_success": 94.3,
"schema_understanding": 0.412,
"task_completion": 0.577,
"tool_usage": 0.627,
"planning_effectiveness": 0.597,
"task_information": 0.404,
"tool_parameter": 0.226,
"dependency": 0.598
},
{
"name": "qwen3-30b-a3b-instruct-2507",
"overall_score": 0.627,
"valid_tool_schema": 99.0,
"compliance": 98.4,
"task_success": 92.3,
"schema_understanding": 0.481,
"task_completion": 0.530,
"tool_usage": 0.658,
"planning_effectiveness": 0.638,
"task_information": 0.473,
"tool_parameter": 0.303,
"dependency": 0.627
},
{
"name": "kimi-k2",
"overall_score": 0.629,
"valid_tool_schema": 98.8,
"compliance": 98.1,
"task_success": 94.5,
"schema_understanding": 0.502,
"task_completion": 0.577,
"tool_usage": 0.631,
"planning_effectiveness": 0.623,
"task_information": 0.448,
"tool_parameter": 0.307,
"dependency": 0.629
},
{
"name": "gpt-oss-20b",
"overall_score": 0.654,
"valid_tool_schema": 98.8,
"compliance": 99.1,
"task_success": 93.6,
"schema_understanding": 0.547,
"task_completion": 0.623,
"tool_usage": 0.661,
"planning_effectiveness": 0.638,
"task_information": 0.509,
"tool_parameter": 0.309,
"dependency": 0.654
},
{
"name": "glm-4.5",
"overall_score": 0.668,
"valid_tool_schema": 99.7,
"compliance": 99.7,
"task_success": 97.4,
"schema_understanding": 0.525,
"task_completion": 0.682,
"tool_usage": 0.680,
"planning_effectiveness": 0.661,
"task_information": 0.523,
"tool_parameter": 0.297,
"dependency": 0.668
},
{
"name": "qwen3-235b-a22b-2507",
"overall_score": 0.678,
"valid_tool_schema": 99.1,
"compliance": 99.3,
"task_success": 94.8,
"schema_understanding": 0.549,
"task_completion": 0.625,
"tool_usage": 0.688,
"planning_effectiveness": 0.712,
"task_information": 0.542,
"tool_parameter": 0.355,
"dependency": 0.678
},
{
"name": "claude-sonnet-4",
"overall_score": 0.681,
"valid_tool_schema": 100.0,
"compliance": 99.8,
"task_success": 98.8,
"schema_understanding": 0.554,
"task_completion": 0.676,
"tool_usage": 0.689,
"planning_effectiveness": 0.671,
"task_information": 0.541,
"tool_parameter": 0.328,
"dependency": 0.681
},
{
"name": "gemini-2.5-pro",
"overall_score": 0.690,
"valid_tool_schema": 99.4,
"compliance": 99.6,
"task_success": 96.9,
"schema_understanding": 0.562,
"task_completion": 0.725,
"tool_usage": 0.717,
"planning_effectiveness": 0.670,
"task_information": 0.541,
"tool_parameter": 0.329,
"dependency": 0.690
},
{
"name": "gpt-oss-120b",
"overall_score": 0.692,
"valid_tool_schema": 97.7,
"compliance": 98.8,
"task_success": 94.0,
"schema_understanding": 0.636,
"task_completion": 0.705,
"tool_usage": 0.691,
"planning_effectiveness": 0.661,
"task_information": 0.576,
"tool_parameter": 0.329,
"dependency": 0.692
},
{
"name": "o3",
"overall_score": 0.715,
"valid_tool_schema": 99.3,
"compliance": 99.9,
"task_success": 97.1,
"schema_understanding": 0.641,
"task_completion": 0.706,
"tool_usage": 0.724,
"planning_effectiveness": 0.726,
"task_information": 0.592,
"tool_parameter": 0.359,
"dependency": 0.715
},
{
"name": "gpt-5",
"overall_score": 0.749,
"valid_tool_schema": 100.0,
"compliance": 99.3,
"task_success": 99.1,
"schema_understanding": 0.677,
"task_completion": 0.828,
"tool_usage": 0.767,
"planning_effectiveness": 0.749,
"task_information": 0.649,
"tool_parameter": 0.339,
"dependency": 0.749
}
]
}