| { | |
| "lastUpdated": "2025-09-05", | |
| "models": [ | |
| { | |
| "name": "llama-3-1-8b-instruct", | |
| "overall_score": 0.428, | |
| "valid_tool_schema": 96.1, | |
| "compliance": 89.4, | |
| "task_success": 90.9, | |
| "schema_understanding": 0.261, | |
| "task_completion": 0.295, | |
| "tool_usage": 0.352, | |
| "planning_effectiveness": 0.310, | |
| "task_information": 0.221, | |
| "tool_parameter": 0.141, | |
| "dependency": 0.428 | |
| }, | |
| { | |
| "name": "llama-3-2-90b-vision-instruct", | |
| "overall_score": 0.495, | |
| "valid_tool_schema": 99.6, | |
| "compliance": 85.0, | |
| "task_success": 90.9, | |
| "schema_understanding": 0.293, | |
| "task_completion": 0.444, | |
| "tool_usage": 0.515, | |
| "planning_effectiveness": 0.427, | |
| "task_information": 0.267, | |
| "tool_parameter": 0.173, | |
| "dependency": 0.495 | |
| }, | |
| { | |
| "name": "nova-micro-v1", | |
| "overall_score": 0.508, | |
| "valid_tool_schema": 96.0, | |
| "compliance": 93.1, | |
| "task_success": 87.8, | |
| "schema_understanding": 0.339, | |
| "task_completion": 0.419, | |
| "tool_usage": 0.504, | |
| "planning_effectiveness": 0.428, | |
| "task_information": 0.315, | |
| "tool_parameter": 0.212, | |
| "dependency": 0.508 | |
| }, | |
| { | |
| "name": "llama-3-1-70b-instruct", | |
| "overall_score": 0.510, | |
| "valid_tool_schema": 99.2, | |
| "compliance": 90.5, | |
| "task_success": 92.5, | |
| "schema_understanding": 0.314, | |
| "task_completion": 0.432, | |
| "tool_usage": 0.523, | |
| "planning_effectiveness": 0.451, | |
| "task_information": 0.287, | |
| "tool_parameter": 0.191, | |
| "dependency": 0.510 | |
| }, | |
| { | |
| "name": "mistral-small-2503", | |
| "overall_score": 0.530, | |
| "valid_tool_schema": 96.4, | |
| "compliance": 95.6, | |
| "task_success": 86.2, | |
| "schema_understanding": 0.373, | |
| "task_completion": 0.445, | |
| "tool_usage": 0.537, | |
| "planning_effectiveness": 0.446, | |
| "task_information": 0.349, | |
| "tool_parameter": 0.232, | |
| "dependency": 0.530 | |
| }, | |
| { | |
| "name": "gpt-4o-mini", | |
| "overall_score": 0.557, | |
| "valid_tool_schema": 97.5, | |
| "compliance": 98.1, | |
| "task_success": 93.9, | |
| "schema_understanding": 0.374, | |
| "task_completion": 0.500, | |
| "tool_usage": 0.555, | |
| "planning_effectiveness": 0.544, | |
| "task_information": 0.352, | |
| "tool_parameter": 0.201, | |
| "dependency": 0.557 | |
| }, | |
| { | |
| "name": "llama-3-3-70b-instruct", | |
| "overall_score": 0.558, | |
| "valid_tool_schema": 99.5, | |
| "compliance": 93.8, | |
| "task_success": 91.6, | |
| "schema_understanding": 0.349, | |
| "task_completion": 0.493, | |
| "tool_usage": 0.583, | |
| "planning_effectiveness": 0.525, | |
| "task_information": 0.355, | |
| "tool_parameter": 0.262, | |
| "dependency": 0.558 | |
| }, | |
| { | |
| "name": "gemma-3-27b-it", | |
| "overall_score": 0.582, | |
| "valid_tool_schema": 98.8, | |
| "compliance": 97.6, | |
| "task_success": 94.4, | |
| "schema_understanding": 0.378, | |
| "task_completion": 0.530, | |
| "tool_usage": 0.608, | |
| "planning_effectiveness": 0.572, | |
| "task_information": 0.383, | |
| "tool_parameter": 0.249, | |
| "dependency": 0.582 | |
| }, | |
| { | |
| "name": "gpt-4o", | |
| "overall_score": 0.595, | |
| "valid_tool_schema": 98.9, | |
| "compliance": 98.3, | |
| "task_success": 92.8, | |
| "schema_understanding": 0.394, | |
| "task_completion": 0.542, | |
| "tool_usage": 0.627, | |
| "planning_effectiveness": 0.587, | |
| "task_information": 0.405, | |
| "tool_parameter": 0.272, | |
| "dependency": 0.595 | |
| }, | |
| { | |
| "name": "gemini-2.5-flash-lite", | |
| "overall_score": 0.598, | |
| "valid_tool_schema": 99.4, | |
| "compliance": 97.8, | |
| "task_success": 94.3, | |
| "schema_understanding": 0.412, | |
| "task_completion": 0.577, | |
| "tool_usage": 0.627, | |
| "planning_effectiveness": 0.597, | |
| "task_information": 0.404, | |
| "tool_parameter": 0.226, | |
| "dependency": 0.598 | |
| }, | |
| { | |
| "name": "qwen3-30b-a3b-instruct-2507", | |
| "overall_score": 0.627, | |
| "valid_tool_schema": 99.0, | |
| "compliance": 98.4, | |
| "task_success": 92.3, | |
| "schema_understanding": 0.481, | |
| "task_completion": 0.530, | |
| "tool_usage": 0.658, | |
| "planning_effectiveness": 0.638, | |
| "task_information": 0.473, | |
| "tool_parameter": 0.303, | |
| "dependency": 0.627 | |
| }, | |
| { | |
| "name": "kimi-k2", | |
| "overall_score": 0.629, | |
| "valid_tool_schema": 98.8, | |
| "compliance": 98.1, | |
| "task_success": 94.5, | |
| "schema_understanding": 0.502, | |
| "task_completion": 0.577, | |
| "tool_usage": 0.631, | |
| "planning_effectiveness": 0.623, | |
| "task_information": 0.448, | |
| "tool_parameter": 0.307, | |
| "dependency": 0.629 | |
| }, | |
| { | |
| "name": "gpt-oss-20b", | |
| "overall_score": 0.654, | |
| "valid_tool_schema": 98.8, | |
| "compliance": 99.1, | |
| "task_success": 93.6, | |
| "schema_understanding": 0.547, | |
| "task_completion": 0.623, | |
| "tool_usage": 0.661, | |
| "planning_effectiveness": 0.638, | |
| "task_information": 0.509, | |
| "tool_parameter": 0.309, | |
| "dependency": 0.654 | |
| }, | |
| { | |
| "name": "glm-4.5", | |
| "overall_score": 0.668, | |
| "valid_tool_schema": 99.7, | |
| "compliance": 99.7, | |
| "task_success": 97.4, | |
| "schema_understanding": 0.525, | |
| "task_completion": 0.682, | |
| "tool_usage": 0.680, | |
| "planning_effectiveness": 0.661, | |
| "task_information": 0.523, | |
| "tool_parameter": 0.297, | |
| "dependency": 0.668 | |
| }, | |
| { | |
| "name": "qwen3-235b-a22b-2507", | |
| "overall_score": 0.678, | |
| "valid_tool_schema": 99.1, | |
| "compliance": 99.3, | |
| "task_success": 94.8, | |
| "schema_understanding": 0.549, | |
| "task_completion": 0.625, | |
| "tool_usage": 0.688, | |
| "planning_effectiveness": 0.712, | |
| "task_information": 0.542, | |
| "tool_parameter": 0.355, | |
| "dependency": 0.678 | |
| }, | |
| { | |
| "name": "claude-sonnet-4", | |
| "overall_score": 0.681, | |
| "valid_tool_schema": 100.0, | |
| "compliance": 99.8, | |
| "task_success": 98.8, | |
| "schema_understanding": 0.554, | |
| "task_completion": 0.676, | |
| "tool_usage": 0.689, | |
| "planning_effectiveness": 0.671, | |
| "task_information": 0.541, | |
| "tool_parameter": 0.328, | |
| "dependency": 0.681 | |
| }, | |
| { | |
| "name": "gemini-2.5-pro", | |
| "overall_score": 0.690, | |
| "valid_tool_schema": 99.4, | |
| "compliance": 99.6, | |
| "task_success": 96.9, | |
| "schema_understanding": 0.562, | |
| "task_completion": 0.725, | |
| "tool_usage": 0.717, | |
| "planning_effectiveness": 0.670, | |
| "task_information": 0.541, | |
| "tool_parameter": 0.329, | |
| "dependency": 0.690 | |
| }, | |
| { | |
| "name": "gpt-oss-120b", | |
| "overall_score": 0.692, | |
| "valid_tool_schema": 97.7, | |
| "compliance": 98.8, | |
| "task_success": 94.0, | |
| "schema_understanding": 0.636, | |
| "task_completion": 0.705, | |
| "tool_usage": 0.691, | |
| "planning_effectiveness": 0.661, | |
| "task_information": 0.576, | |
| "tool_parameter": 0.329, | |
| "dependency": 0.692 | |
| }, | |
| { | |
| "name": "o3", | |
| "overall_score": 0.715, | |
| "valid_tool_schema": 99.3, | |
| "compliance": 99.9, | |
| "task_success": 97.1, | |
| "schema_understanding": 0.641, | |
| "task_completion": 0.706, | |
| "tool_usage": 0.724, | |
| "planning_effectiveness": 0.726, | |
| "task_information": 0.592, | |
| "tool_parameter": 0.359, | |
| "dependency": 0.715 | |
| }, | |
| { | |
| "name": "gpt-5", | |
| "overall_score": 0.749, | |
| "valid_tool_schema": 100.0, | |
| "compliance": 99.3, | |
| "task_success": 99.1, | |
| "schema_understanding": 0.677, | |
| "task_completion": 0.828, | |
| "tool_usage": 0.767, | |
| "planning_effectiveness": 0.749, | |
| "task_information": 0.649, | |
| "tool_parameter": 0.339, | |
| "dependency": 0.749 | |
| } | |
| ] | |
| } |