/*
 * mcp-bench / script.js
 * ztwang's picture
 * Upload 22 files
 * 4966301 verified
 * raw
 * history blame
 * 20.6 kB
 */
/**
 * Static leaderboard dataset consumed by LeaderboardApp.loadData().
 *
 * - `lastUpdated` is a date-only ISO string shown by updateLastUpdated().
 * - Each entry in `models` carries a display `name` plus numeric metrics.
 * - `valid_tool_name_rate`, `schema_compliance` and `execution_success` are
 *   stored on a 0-100 scale and rendered as percentages; every other metric,
 *   including `overall_score`, is stored on a 0-1 scale and rendered with
 *   three decimals.
 * - Entries are listed here in ascending `overall_score` order; the app
 *   re-sorts a copy on load, so this ordering is not relied upon.
 */
const LEADERBOARD_DATA = {
"lastUpdated": "2025-09-05",
"models": [
{
"name": "llama-3-1-8b-instruct",
"valid_tool_name_rate": 96.1,
"schema_compliance": 89.4,
"execution_success": 90.9,
"task_fulfillment": 0.261,
"information_grounding": 0.295,
"tool_appropriateness": 0.352,
"parameter_accuracy": 0.310,
"dependency_awareness": 0.221,
"parallelism_efficiency": 0.141,
"overall_score": 0.428
},
{
"name": "llama-3-2-90b-vision-instruct",
"valid_tool_name_rate": 99.6,
"schema_compliance": 85.0,
"execution_success": 90.9,
"task_fulfillment": 0.293,
"information_grounding": 0.444,
"tool_appropriateness": 0.515,
"parameter_accuracy": 0.427,
"dependency_awareness": 0.267,
"parallelism_efficiency": 0.173,
"overall_score": 0.495
},
{
"name": "nova-micro-v1",
"valid_tool_name_rate": 96.0,
"schema_compliance": 93.1,
"execution_success": 87.8,
"task_fulfillment": 0.339,
"information_grounding": 0.419,
"tool_appropriateness": 0.504,
"parameter_accuracy": 0.428,
"dependency_awareness": 0.315,
"parallelism_efficiency": 0.212,
"overall_score": 0.508
},
{
"name": "llama-3-1-70b-instruct",
"valid_tool_name_rate": 99.2,
"schema_compliance": 90.5,
"execution_success": 92.5,
"task_fulfillment": 0.314,
"information_grounding": 0.432,
"tool_appropriateness": 0.523,
"parameter_accuracy": 0.451,
"dependency_awareness": 0.287,
"parallelism_efficiency": 0.191,
"overall_score": 0.510
},
{
"name": "mistral-small-2503",
"valid_tool_name_rate": 96.4,
"schema_compliance": 95.6,
"execution_success": 86.2,
"task_fulfillment": 0.373,
"information_grounding": 0.445,
"tool_appropriateness": 0.537,
"parameter_accuracy": 0.446,
"dependency_awareness": 0.349,
"parallelism_efficiency": 0.232,
"overall_score": 0.530
},
{
"name": "gpt-4o-mini",
"valid_tool_name_rate": 97.5,
"schema_compliance": 98.1,
"execution_success": 93.9,
"task_fulfillment": 0.374,
"information_grounding": 0.500,
"tool_appropriateness": 0.555,
"parameter_accuracy": 0.544,
"dependency_awareness": 0.352,
"parallelism_efficiency": 0.201,
"overall_score": 0.557
},
{
"name": "llama-3-3-70b-instruct",
"valid_tool_name_rate": 99.5,
"schema_compliance": 93.8,
"execution_success": 91.6,
"task_fulfillment": 0.349,
"information_grounding": 0.493,
"tool_appropriateness": 0.583,
"parameter_accuracy": 0.525,
"dependency_awareness": 0.355,
"parallelism_efficiency": 0.262,
"overall_score": 0.558
},
{
"name": "gemma-3-27b-it",
"valid_tool_name_rate": 98.8,
"schema_compliance": 97.6,
"execution_success": 94.4,
"task_fulfillment": 0.378,
"information_grounding": 0.530,
"tool_appropriateness": 0.608,
"parameter_accuracy": 0.572,
"dependency_awareness": 0.383,
"parallelism_efficiency": 0.249,
"overall_score": 0.582
},
{
"name": "gpt-4o",
"valid_tool_name_rate": 98.9,
"schema_compliance": 98.3,
"execution_success": 92.8,
"task_fulfillment": 0.394,
"information_grounding": 0.542,
"tool_appropriateness": 0.627,
"parameter_accuracy": 0.587,
"dependency_awareness": 0.405,
"parallelism_efficiency": 0.272,
"overall_score": 0.595
},
{
"name": "gemini-2.5-flash-lite",
"valid_tool_name_rate": 99.4,
"schema_compliance": 97.8,
"execution_success": 94.3,
"task_fulfillment": 0.412,
"information_grounding": 0.577,
"tool_appropriateness": 0.627,
"parameter_accuracy": 0.597,
"dependency_awareness": 0.404,
"parallelism_efficiency": 0.226,
"overall_score": 0.598
},
{
"name": "qwen3-30b-a3b-instruct-2507",
"valid_tool_name_rate": 99.0,
"schema_compliance": 98.4,
"execution_success": 92.3,
"task_fulfillment": 0.481,
"information_grounding": 0.530,
"tool_appropriateness": 0.658,
"parameter_accuracy": 0.638,
"dependency_awareness": 0.473,
"parallelism_efficiency": 0.303,
"overall_score": 0.627
},
{
"name": "kimi-k2",
"valid_tool_name_rate": 98.8,
"schema_compliance": 98.1,
"execution_success": 94.5,
"task_fulfillment": 0.502,
"information_grounding": 0.577,
"tool_appropriateness": 0.631,
"parameter_accuracy": 0.623,
"dependency_awareness": 0.448,
"parallelism_efficiency": 0.307,
"overall_score": 0.629
},
{
"name": "gpt-oss-20b",
"valid_tool_name_rate": 98.8,
"schema_compliance": 99.1,
"execution_success": 93.6,
"task_fulfillment": 0.547,
"information_grounding": 0.623,
"tool_appropriateness": 0.661,
"parameter_accuracy": 0.638,
"dependency_awareness": 0.509,
"parallelism_efficiency": 0.309,
"overall_score": 0.654
},
{
"name": "glm-4.5",
"valid_tool_name_rate": 99.7,
"schema_compliance": 99.7,
"execution_success": 97.4,
"task_fulfillment": 0.525,
"information_grounding": 0.682,
"tool_appropriateness": 0.680,
"parameter_accuracy": 0.661,
"dependency_awareness": 0.523,
"parallelism_efficiency": 0.297,
"overall_score": 0.668
},
{
"name": "qwen3-235b-a22b-2507",
"valid_tool_name_rate": 99.1,
"schema_compliance": 99.3,
"execution_success": 94.8,
"task_fulfillment": 0.549,
"information_grounding": 0.625,
"tool_appropriateness": 0.688,
"parameter_accuracy": 0.712,
"dependency_awareness": 0.542,
"parallelism_efficiency": 0.355,
"overall_score": 0.678
},
{
"name": "claude-sonnet-4",
"valid_tool_name_rate": 100.0,
"schema_compliance": 99.8,
"execution_success": 98.8,
"task_fulfillment": 0.554,
"information_grounding": 0.676,
"tool_appropriateness": 0.689,
"parameter_accuracy": 0.671,
"dependency_awareness": 0.541,
"parallelism_efficiency": 0.328,
"overall_score": 0.681
},
{
"name": "gemini-2.5-pro",
"valid_tool_name_rate": 99.4,
"schema_compliance": 99.6,
"execution_success": 96.9,
"task_fulfillment": 0.562,
"information_grounding": 0.725,
"tool_appropriateness": 0.717,
"parameter_accuracy": 0.670,
"dependency_awareness": 0.541,
"parallelism_efficiency": 0.329,
"overall_score": 0.690
},
{
"name": "gpt-oss-120b",
"valid_tool_name_rate": 97.7,
"schema_compliance": 98.8,
"execution_success": 94.0,
"task_fulfillment": 0.636,
"information_grounding": 0.705,
"tool_appropriateness": 0.691,
"parameter_accuracy": 0.661,
"dependency_awareness": 0.576,
"parallelism_efficiency": 0.329,
"overall_score": 0.692
},
{
"name": "o3",
"valid_tool_name_rate": 99.3,
"schema_compliance": 99.9,
"execution_success": 97.1,
"task_fulfillment": 0.641,
"information_grounding": 0.706,
"tool_appropriateness": 0.724,
"parameter_accuracy": 0.726,
"dependency_awareness": 0.592,
"parallelism_efficiency": 0.359,
"overall_score": 0.715
},
{
"name": "gpt-5",
"valid_tool_name_rate": 100.0,
"schema_compliance": 99.3,
"execution_success": 99.1,
"task_fulfillment": 0.677,
"information_grounding": 0.828,
"tool_appropriateness": 0.767,
"parameter_accuracy": 0.749,
"dependency_awareness": 0.649,
"parallelism_efficiency": 0.339,
"overall_score": 0.749
}
]
};
/**
 * Controller for the leaderboard page.
 *
 * Loads the static dataset, wires the search box / sort controls / sortable
 * headers, and renders the table body as HTML. All DOM access goes through
 * element ids and selectors that the hosting page is expected to provide
 * (`loading`, `searchInput`, `sortSelect`, `sortOrder`, `tableBody`,
 * `lastUpdated`, `.sortable`, `.sort-icon`).
 */
class LeaderboardApp {
    /**
     * Metric columns in display order. `isPercentage` marks values stored on
     * a 0-100 scale; the rest are stored on a 0-1 scale.
     * @returns {Array<{key: string, isPercentage: boolean}>}
     */
    static get METRIC_COLUMNS() {
        return [
            { key: 'valid_tool_name_rate', isPercentage: true },
            { key: 'schema_compliance', isPercentage: true },
            { key: 'execution_success', isPercentage: true },
            { key: 'task_fulfillment', isPercentage: false },
            { key: 'information_grounding', isPercentage: false },
            { key: 'tool_appropriateness', isPercentage: false },
            { key: 'parameter_accuracy', isPercentage: false },
            { key: 'dependency_awareness', isPercentage: false },
            { key: 'parallelism_efficiency', isPercentage: false }
        ];
    }

    /**
     * Total number of cells in a table row: model name + overall score +
     * one cell per metric. Used for the colspan of full-width message rows.
     * @returns {number}
     */
    static get COLUMN_COUNT() {
        return LeaderboardApp.METRIC_COLUMNS.length + 2;
    }

    /** Creates the app with a default sort (overall score, descending) and starts it. */
    constructor() {
        this.data = null;
        this.filteredData = null;
        this.currentSort = { column: 'overall_score', ascending: false };
        this.init();
    }

    /**
     * Loads data, attaches listeners and performs the first render.
     * Any failure is logged and replaced by an error row in the table.
     * Kept `async` so existing callers that chain on the returned promise
     * keep working.
     * @returns {Promise<void>}
     */
    async init() {
        try {
            this.loadData();
            this.setupEventListeners();
            this.renderTable();
            this.updateLastUpdated();
        } catch (error) {
            console.error('Failed to initialize app:', error);
            this.showError('Failed to load leaderboard data');
        }
    }

    /** Copies the dataset into `filteredData`, sorts it, and toggles the loading indicator. */
    loadData() {
        const loading = document.getElementById('loading');
        loading.classList.add('active');
        try {
            this.data = LEADERBOARD_DATA;
            // Work on a copy so sorting never reorders the source dataset.
            this.filteredData = [...this.data.models];
            this.sortData();
        } finally {
            loading.classList.remove('active');
        }
    }

    /** Wires the search input, sort dropdown, sort-order button and sortable headers. */
    setupEventListeners() {
        const searchInput = document.getElementById('searchInput');
        const sortSelect = document.getElementById('sortSelect');
        const sortOrder = document.getElementById('sortOrder');
        const tableHeaders = document.querySelectorAll('.sortable');

        searchInput.addEventListener('input', (e) => this.handleSearch(e.target.value));
        sortSelect.addEventListener('change', (e) => this.handleSortChange(e.target.value));
        sortOrder.addEventListener('click', () => this.toggleSortOrder());
        tableHeaders.forEach((header) => {
            header.addEventListener('click', () => {
                this.handleColumnSort(header.dataset.column);
            });
        });
    }

    /**
     * Filters models by a case-insensitive substring match on the name.
     * @param {string} query - Raw text from the search box.
     */
    handleSearch(query) {
        const searchTerm = query.toLowerCase().trim();
        if (searchTerm === '') {
            this.filteredData = [...this.data.models];
        } else {
            this.filteredData = this.data.models.filter((model) =>
                model.name.toLowerCase().includes(searchTerm)
            );
        }
        this.sortData();
        this.renderTable();
    }

    /**
     * Switches the sort column (dropdown), keeping the current direction.
     * @param {string} column - Property name to sort by.
     */
    handleSortChange(column) {
        this.currentSort.column = column;
        this.sortData();
        this.renderTable();
        this.updateSortIndicators();
    }

    /**
     * Handles a header click: the active column flips direction, a new
     * column starts descending.
     * @param {string} column - Property name to sort by.
     */
    handleColumnSort(column) {
        if (this.currentSort.column === column) {
            this.currentSort.ascending = !this.currentSort.ascending;
        } else {
            this.currentSort.column = column;
            this.currentSort.ascending = false;
        }
        document.getElementById('sortSelect').value = column;
        this.sortData();
        this.renderTable();
        this.updateSortIndicators();
        this.updateSortOrderButton();
    }

    /** Flips the sort direction and refreshes the table and both direction indicators. */
    toggleSortOrder() {
        this.currentSort.ascending = !this.currentSort.ascending;
        this.sortData();
        this.renderTable();
        // The header arrow shows the direction too, so it must be refreshed
        // here or it goes stale after the button is used.
        this.updateSortIndicators();
        this.updateSortOrderButton();
    }

    /** Sorts `filteredData` in place by the current column and direction; strings compare case-insensitively. */
    sortData() {
        const { column, ascending } = this.currentSort;
        this.filteredData.sort((a, b) => {
            let aValue = a[column];
            let bValue = b[column];
            if (typeof aValue === 'string') {
                aValue = aValue.toLowerCase();
                bValue = bValue.toLowerCase();
            }
            let comparison = 0;
            if (aValue > bValue) comparison = 1;
            if (aValue < bValue) comparison = -1;
            return ascending ? comparison : -comparison;
        });
    }

    /** Renders `filteredData` into the table body, or a full-width "no results" row when empty. */
    renderTable() {
        const tableBody = document.getElementById('tableBody');
        if (this.filteredData.length === 0) {
            tableBody.innerHTML = `
<tr>
<td colspan="${LeaderboardApp.COLUMN_COUNT}" class="no-results">
<i class="fas fa-search"></i>
No models found matching your search criteria
</td>
</tr>
`;
            return;
        }
        tableBody.innerHTML = this.filteredData
            .map((model) => this.createTableRow(model))
            .join('');
    }

    /**
     * Builds the HTML for one model row.
     * @param {Object} model - Entry from the dataset's `models` array.
     * @returns {string} `<tr>` markup with one cell per column.
     */
    createTableRow(model) {
        const metricCells = LeaderboardApp.METRIC_COLUMNS
            .map(({ key, isPercentage }) => `
<td class="metric-col">
${this.createMetricCell(model[key], isPercentage)}
</td>`)
            .join('');
        return `
<tr>
<td class="model-col">
<span class="model-name">${model.name}</span>
</td>
<td class="score-col">
<span class="score ${this.getScoreClass(model.overall_score)}">
${model.overall_score.toFixed(3)}
</span>
</td>${metricCells}
</tr>
`;
    }

    /**
     * Builds the bar + label markup for one metric value.
     * @param {number} value - Metric value (0-100 if `isPercentage`, else 0-1).
     * @param {boolean} [isPercentage=false] - Whether `value` is on a 0-100 scale.
     * @returns {string} HTML fragment for the cell content.
     */
    createMetricCell(value, isPercentage = false) {
        const displayValue = isPercentage ?
            `${value.toFixed(1)}%` :
            value.toFixed(3);
        // Score classes are defined on the 0-1 scale; bar widths are CSS percentages.
        const normalizedValue = isPercentage ? value / 100 : value;
        const scoreClass = this.getScoreClass(normalizedValue);
        const barWidth = isPercentage ? value : (value * 100);
        return `
<div class="metric" data-tooltip="${displayValue}">
<div class="metric-bar ${scoreClass}" style="width: ${barWidth}%"></div>
<span>${displayValue}</span>
</div>
`;
    }

    /**
     * Maps a 1-based rank to a CSS class.
     * @param {number} rank
     * @returns {string} 'top-1', 'top-3', 'top-5' or ''.
     */
    getRankClass(rank) {
        if (rank === 1) return 'top-1';
        if (rank <= 3) return 'top-3';
        if (rank <= 5) return 'top-5';
        return '';
    }

    /**
     * Maps a 0-1 score to a CSS class.
     * @param {number} score
     * @returns {string} 'excellent' (>= 0.7), 'good' (>= 0.6), 'average' (>= 0.5) or 'poor'.
     */
    getScoreClass(score) {
        if (score >= 0.7) return 'excellent';
        if (score >= 0.6) return 'good';
        if (score >= 0.5) return 'average';
        return 'poor';
    }

    /** Resets every sortable header, then marks the active column with a direction arrow. */
    updateSortIndicators() {
        const headers = document.querySelectorAll('.sortable');
        headers.forEach((header) => {
            header.classList.remove('active');
            const icon = header.querySelector('.sort-icon');
            // A header without an icon is tolerated rather than aborting the update.
            if (icon) {
                icon.className = 'fas fa-sort sort-icon';
            }
        });
        const activeHeader = document.querySelector(`[data-column="${this.currentSort.column}"]`);
        if (activeHeader) {
            activeHeader.classList.add('active');
            const icon = activeHeader.querySelector('.sort-icon');
            if (icon) {
                icon.className = this.currentSort.ascending ?
                    'fas fa-sort-up sort-icon' :
                    'fas fa-sort-down sort-icon';
            }
        }
    }

    /** Syncs the sort-order button icon with the current direction; the title names the action a click performs. */
    updateSortOrderButton() {
        const sortOrderButton = document.getElementById('sortOrder');
        const icon = sortOrderButton.querySelector('i');
        if (icon) {
            icon.className = this.currentSort.ascending ?
                'fas fa-sort-amount-up' :
                'fas fa-sort-amount-down';
        }
        sortOrderButton.title = this.currentSort.ascending ?
            'Sort descending' :
            'Sort ascending';
    }

    /** Writes the dataset's `lastUpdated` date (or today's date when absent) into the page. */
    updateLastUpdated() {
        const lastUpdatedElement = document.getElementById('lastUpdated');
        const format = { year: 'numeric', month: 'long', day: 'numeric' };
        if (this.data && this.data.lastUpdated) {
            const raw = this.data.lastUpdated;
            // A date-only ISO string is parsed as UTC midnight. Formatting it
            // in the viewer's local zone would show the previous day west of
            // UTC, so such values are formatted in UTC.
            const isDateOnly = typeof raw === 'string' && /^\d{4}-\d{2}-\d{2}$/.test(raw);
            const options = isDateOnly ? { ...format, timeZone: 'UTC' } : format;
            lastUpdatedElement.textContent = new Date(raw).toLocaleDateString('en-US', options);
        } else {
            lastUpdatedElement.textContent = new Date().toLocaleDateString('en-US', format);
        }
    }

    /**
     * Replaces the table body with a single full-width error row.
     * @param {string} message - Text to display; inserted as HTML, so pass trusted text only.
     */
    showError(message) {
        const tableBody = document.getElementById('tableBody');
        tableBody.innerHTML = `
<tr>
<td colspan="${LeaderboardApp.COLUMN_COUNT}" class="no-results">
<i class="fas fa-exclamation-triangle"></i>
${message}
</td>
</tr>
`;
    }
}
/**
 * Copies the BibTeX citation to the clipboard.
 *
 * Uses the async Clipboard API when it is available in a secure context and
 * falls back to `fallbackCopy` otherwise, or when the Clipboard API rejects.
 * `showCopySuccess` is called once the Clipboard API write resolves.
 */
function copyCitation() {
    // The arXiv identifier 2508.20453 denotes an August 2025 submission, so
    // the year field is 2025. The citation key is left unchanged so existing
    // bibliographies that already use it keep resolving.
    const citationText = `@article{wang2024mcpbench,
title={MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers},
author={Wang, Zhenting and Chang, Qi and Patel, Hemani and Biju, Shashank and Wu, Cheng-En and Liu, Quan and Ding, Aolin and Rezazadeh, Alireza and Shah, Ankit and Bao, Yujia and Siow, Eugene},
journal={arXiv preprint arXiv:2508.20453},
year={2025}
}`;
    if (navigator.clipboard && window.isSecureContext) {
        navigator.clipboard.writeText(citationText).then(() => {
            showCopySuccess();
        }).catch(err => {
            console.error('Failed to copy citation:', err);
            fallbackCopy(citationText);
        });
    } else {
        fallbackCopy(citationText);
    }
}
/**
 * Copies text via a temporary off-screen textarea and `document.execCommand`.
 *
 * Calls `showCopySuccess` only when the copy command reports success.
 * Failures are logged, and the temporary textarea is always removed.
 *
 * @param {string} text - Text to place on the clipboard.
 */
function fallbackCopy(text) {
    const textArea = document.createElement('textarea');
    textArea.value = text;
    // Keep the helper element far off-screen so it never flashes or scrolls the page.
    textArea.style.position = 'fixed';
    textArea.style.left = '-999999px';
    textArea.style.top = '-999999px';
    document.body.appendChild(textArea);
    try {
        textArea.focus();
        textArea.select();
        // execCommand signals an unsupported or disabled command by returning
        // false rather than throwing, so the result has to be checked.
        if (document.execCommand('copy')) {
            showCopySuccess();
        } else {
            console.error('Fallback copy failed:', new Error('copy command was rejected'));
        }
    } catch (err) {
        console.error('Fallback copy failed:', err);
    } finally {
        document.body.removeChild(textArea);
    }
}
/**
 * Shows a temporary "Copied!" state on the `.copy-citation-btn` button and
 * restores its original markup and background colour after two seconds.
 *
 * Safe to call again while the confirmation is still showing: the pending
 * reset is rescheduled and the label captured before the first call is the
 * one restored. Does nothing when the button is not on the page.
 */
function showCopySuccess() {
    const button = document.querySelector('.copy-citation-btn');
    if (!button) {
        return;
    }
    // State lives on the function object so no extra module-level binding is needed.
    const pending = showCopySuccess.pending;
    let originalHtml;
    if (pending && pending.button === button) {
        // The button currently shows "Copied!", so reading innerHTML now
        // would capture the wrong label. Reuse the one saved earlier.
        clearTimeout(pending.timer);
        originalHtml = pending.originalHtml;
    } else {
        originalHtml = button.innerHTML;
    }
    button.innerHTML = '<i class="fas fa-check"></i> Copied!';
    button.style.backgroundColor = '#10b981';
    const timer = setTimeout(() => {
        button.innerHTML = originalHtml;
        button.style.backgroundColor = '';
        showCopySuccess.pending = null;
    }, 2000);
    showCopySuccess.pending = { button, timer, originalHtml };
}
// Start the app once the DOM is parsed. If this script runs after
// DOMContentLoaded has already fired (for example when injected or loaded
// late), the event would never arrive, so start immediately in that case.
if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', () => {
        new LeaderboardApp();
    });
} else {
    new LeaderboardApp();
}
// Register the service worker after the page has finished loading, when the
// browser supports service workers. Both outcomes are only logged.
if ('serviceWorker' in navigator) {
    const onRegistered = (registration) => {
        console.log('SW registered: ', registration);
    };
    const onRegistrationFailed = (registrationError) => {
        console.log('SW registration failed: ', registrationError);
    };
    window.addEventListener('load', function registerServiceWorker() {
        navigator.serviceWorker
            .register('/sw.js')
            .then(onRegistered)
            .catch(onRegistrationFailed);
    });
}