|
/**
 * Static leaderboard payload consumed by LeaderboardApp.
 *
 * `lastUpdated` is a date-only string (YYYY-MM-DD). Each entry in `models`
 * carries the model name, three rate metrics that the table renders as
 * percentages (valid_tool_name_rate, schema_compliance, execution_success),
 * six fractional scores, and the aggregate `overall_score`.
 * Entries are listed in ascending `overall_score` order.
 */
const LEADERBOARD_DATA = {
  lastUpdated: '2025-09-05',
  models: [
    {
      name: 'llama-3-1-8b-instruct',
      valid_tool_name_rate: 96.1, schema_compliance: 89.4, execution_success: 90.9,
      task_fulfillment: 0.261, information_grounding: 0.295, tool_appropriateness: 0.352,
      parameter_accuracy: 0.310, dependency_awareness: 0.221, parallelism_efficiency: 0.141,
      overall_score: 0.428,
    },
    {
      name: 'llama-3-2-90b-vision-instruct',
      valid_tool_name_rate: 99.6, schema_compliance: 85.0, execution_success: 90.9,
      task_fulfillment: 0.293, information_grounding: 0.444, tool_appropriateness: 0.515,
      parameter_accuracy: 0.427, dependency_awareness: 0.267, parallelism_efficiency: 0.173,
      overall_score: 0.495,
    },
    {
      name: 'nova-micro-v1',
      valid_tool_name_rate: 96.0, schema_compliance: 93.1, execution_success: 87.8,
      task_fulfillment: 0.339, information_grounding: 0.419, tool_appropriateness: 0.504,
      parameter_accuracy: 0.428, dependency_awareness: 0.315, parallelism_efficiency: 0.212,
      overall_score: 0.508,
    },
    {
      name: 'llama-3-1-70b-instruct',
      valid_tool_name_rate: 99.2, schema_compliance: 90.5, execution_success: 92.5,
      task_fulfillment: 0.314, information_grounding: 0.432, tool_appropriateness: 0.523,
      parameter_accuracy: 0.451, dependency_awareness: 0.287, parallelism_efficiency: 0.191,
      overall_score: 0.510,
    },
    {
      name: 'mistral-small-2503',
      valid_tool_name_rate: 96.4, schema_compliance: 95.6, execution_success: 86.2,
      task_fulfillment: 0.373, information_grounding: 0.445, tool_appropriateness: 0.537,
      parameter_accuracy: 0.446, dependency_awareness: 0.349, parallelism_efficiency: 0.232,
      overall_score: 0.530,
    },
    {
      name: 'gpt-4o-mini',
      valid_tool_name_rate: 97.5, schema_compliance: 98.1, execution_success: 93.9,
      task_fulfillment: 0.374, information_grounding: 0.500, tool_appropriateness: 0.555,
      parameter_accuracy: 0.544, dependency_awareness: 0.352, parallelism_efficiency: 0.201,
      overall_score: 0.557,
    },
    {
      name: 'llama-3-3-70b-instruct',
      valid_tool_name_rate: 99.5, schema_compliance: 93.8, execution_success: 91.6,
      task_fulfillment: 0.349, information_grounding: 0.493, tool_appropriateness: 0.583,
      parameter_accuracy: 0.525, dependency_awareness: 0.355, parallelism_efficiency: 0.262,
      overall_score: 0.558,
    },
    {
      name: 'gemma-3-27b-it',
      valid_tool_name_rate: 98.8, schema_compliance: 97.6, execution_success: 94.4,
      task_fulfillment: 0.378, information_grounding: 0.530, tool_appropriateness: 0.608,
      parameter_accuracy: 0.572, dependency_awareness: 0.383, parallelism_efficiency: 0.249,
      overall_score: 0.582,
    },
    {
      name: 'gpt-4o',
      valid_tool_name_rate: 98.9, schema_compliance: 98.3, execution_success: 92.8,
      task_fulfillment: 0.394, information_grounding: 0.542, tool_appropriateness: 0.627,
      parameter_accuracy: 0.587, dependency_awareness: 0.405, parallelism_efficiency: 0.272,
      overall_score: 0.595,
    },
    {
      name: 'gemini-2.5-flash-lite',
      valid_tool_name_rate: 99.4, schema_compliance: 97.8, execution_success: 94.3,
      task_fulfillment: 0.412, information_grounding: 0.577, tool_appropriateness: 0.627,
      parameter_accuracy: 0.597, dependency_awareness: 0.404, parallelism_efficiency: 0.226,
      overall_score: 0.598,
    },
    {
      name: 'qwen3-30b-a3b-instruct-2507',
      valid_tool_name_rate: 99.0, schema_compliance: 98.4, execution_success: 92.3,
      task_fulfillment: 0.481, information_grounding: 0.530, tool_appropriateness: 0.658,
      parameter_accuracy: 0.638, dependency_awareness: 0.473, parallelism_efficiency: 0.303,
      overall_score: 0.627,
    },
    {
      name: 'kimi-k2',
      valid_tool_name_rate: 98.8, schema_compliance: 98.1, execution_success: 94.5,
      task_fulfillment: 0.502, information_grounding: 0.577, tool_appropriateness: 0.631,
      parameter_accuracy: 0.623, dependency_awareness: 0.448, parallelism_efficiency: 0.307,
      overall_score: 0.629,
    },
    {
      name: 'gpt-oss-20b',
      valid_tool_name_rate: 98.8, schema_compliance: 99.1, execution_success: 93.6,
      task_fulfillment: 0.547, information_grounding: 0.623, tool_appropriateness: 0.661,
      parameter_accuracy: 0.638, dependency_awareness: 0.509, parallelism_efficiency: 0.309,
      overall_score: 0.654,
    },
    {
      name: 'glm-4.5',
      valid_tool_name_rate: 99.7, schema_compliance: 99.7, execution_success: 97.4,
      task_fulfillment: 0.525, information_grounding: 0.682, tool_appropriateness: 0.680,
      parameter_accuracy: 0.661, dependency_awareness: 0.523, parallelism_efficiency: 0.297,
      overall_score: 0.668,
    },
    {
      name: 'qwen3-235b-a22b-2507',
      valid_tool_name_rate: 99.1, schema_compliance: 99.3, execution_success: 94.8,
      task_fulfillment: 0.549, information_grounding: 0.625, tool_appropriateness: 0.688,
      parameter_accuracy: 0.712, dependency_awareness: 0.542, parallelism_efficiency: 0.355,
      overall_score: 0.678,
    },
    {
      name: 'claude-sonnet-4',
      valid_tool_name_rate: 100.0, schema_compliance: 99.8, execution_success: 98.8,
      task_fulfillment: 0.554, information_grounding: 0.676, tool_appropriateness: 0.689,
      parameter_accuracy: 0.671, dependency_awareness: 0.541, parallelism_efficiency: 0.328,
      overall_score: 0.681,
    },
    {
      name: 'gemini-2.5-pro',
      valid_tool_name_rate: 99.4, schema_compliance: 99.6, execution_success: 96.9,
      task_fulfillment: 0.562, information_grounding: 0.725, tool_appropriateness: 0.717,
      parameter_accuracy: 0.670, dependency_awareness: 0.541, parallelism_efficiency: 0.329,
      overall_score: 0.690,
    },
    {
      name: 'gpt-oss-120b',
      valid_tool_name_rate: 97.7, schema_compliance: 98.8, execution_success: 94.0,
      task_fulfillment: 0.636, information_grounding: 0.705, tool_appropriateness: 0.691,
      parameter_accuracy: 0.661, dependency_awareness: 0.576, parallelism_efficiency: 0.329,
      overall_score: 0.692,
    },
    {
      name: 'o3',
      valid_tool_name_rate: 99.3, schema_compliance: 99.9, execution_success: 97.1,
      task_fulfillment: 0.641, information_grounding: 0.706, tool_appropriateness: 0.724,
      parameter_accuracy: 0.726, dependency_awareness: 0.592, parallelism_efficiency: 0.359,
      overall_score: 0.715,
    },
    {
      name: 'gpt-5',
      valid_tool_name_rate: 100.0, schema_compliance: 99.3, execution_success: 99.1,
      task_fulfillment: 0.677, information_grounding: 0.828, tool_appropriateness: 0.767,
      parameter_accuracy: 0.749, dependency_awareness: 0.649, parallelism_efficiency: 0.339,
      overall_score: 0.749,
    },
  ],
};
|
|
|
/**
 * Page controller for the leaderboard table.
 *
 * Holds the loaded data set, the currently visible (filtered) models and the
 * active sort, wires the search/sort controls, and renders table rows into
 * `#tableBody` as HTML strings.
 */
class LeaderboardApp {
  /**
   * Metric columns in display order, after the model-name and overall-score
   * columns. `isPercentage` marks values stored on a 0-100 scale.
   * Row rendering and the placeholder-row colspan are both derived from this
   * list so they cannot drift apart.
   */
  static get METRIC_COLUMNS() {
    return [
      { key: 'valid_tool_name_rate', isPercentage: true },
      { key: 'schema_compliance', isPercentage: true },
      { key: 'execution_success', isPercentage: true },
      { key: 'task_fulfillment', isPercentage: false },
      { key: 'information_grounding', isPercentage: false },
      { key: 'tool_appropriateness', isPercentage: false },
      { key: 'parameter_accuracy', isPercentage: false },
      { key: 'dependency_awareness', isPercentage: false },
      { key: 'parallelism_efficiency', isPercentage: false },
    ];
  }

  /** Total cells per table row: model name + overall score + metric columns. */
  static get COLUMN_COUNT() {
    return 2 + LeaderboardApp.METRIC_COLUMNS.length;
  }

  /** Sets the default sort (overall score, descending) and starts initialization. */
  constructor() {
    this.data = null;
    this.filteredData = null;
    this.currentSort = { column: 'overall_score', ascending: false };

    this.init();
  }

  /**
   * Loads data, wires the controls and performs the first render.
   * Any failure is logged and replaced by an error row in the table.
   * Declared `async` (it awaits nothing) so existing callers still receive a Promise.
   */
  async init() {
    try {
      this.loadData();
      this.setupEventListeners();
      this.renderTable();
      this.updateLastUpdated();
    } catch (error) {
      console.error('Failed to initialize app:', error);
      this.showError('Failed to load leaderboard data');
    }
  }

  /**
   * Copies LEADERBOARD_DATA into the app state and applies the current sort.
   * The `#loading` indicator, when present, is toggled around the work.
   */
  loadData() {
    const loading = document.getElementById('loading');
    if (loading) {
      loading.classList.add('active');
    }

    try {
      this.data = LEADERBOARD_DATA;
      // Copy the array so sorting never reorders the source data.
      this.filteredData = [...this.data.models];
      this.sortData();
    } finally {
      if (loading) {
        loading.classList.remove('active');
      }
    }
  }

  /** Attaches handlers to the search box, sort select, sort-order button and sortable headers. */
  setupEventListeners() {
    const searchInput = document.getElementById('searchInput');
    const sortSelect = document.getElementById('sortSelect');
    const sortOrder = document.getElementById('sortOrder');
    const tableHeaders = document.querySelectorAll('.sortable');

    searchInput.addEventListener('input', (e) => this.handleSearch(e.target.value));
    sortSelect.addEventListener('change', (e) => this.handleSortChange(e.target.value));
    sortOrder.addEventListener('click', () => this.toggleSortOrder());

    tableHeaders.forEach((header) => {
      header.addEventListener('click', () => {
        this.handleColumnSort(header.dataset.column);
      });
    });
  }

  /**
   * Filters models by case-insensitive substring match on the name, then re-renders.
   * @param {string} query - Raw text from the search box; blank restores all models.
   */
  handleSearch(query) {
    const searchTerm = query.toLowerCase().trim();

    if (searchTerm === '') {
      this.filteredData = [...this.data.models];
    } else {
      this.filteredData = this.data.models.filter((model) =>
        model.name.toLowerCase().includes(searchTerm)
      );
    }

    this.sortData();
    this.renderTable();
  }

  /**
   * Switches the sort column (direction unchanged) and re-renders.
   * @param {string} column - Model property name to sort by.
   */
  handleSortChange(column) {
    this.currentSort.column = column;
    this.sortData();
    this.renderTable();
    this.updateSortIndicators();
  }

  /**
   * Handles a click on a sortable header: the active column flips direction,
   * a new column starts descending. Keeps the sort select in sync.
   * @param {string} column - Model property name taken from the header's data-column.
   */
  handleColumnSort(column) {
    if (this.currentSort.column === column) {
      this.currentSort.ascending = !this.currentSort.ascending;
    } else {
      this.currentSort.column = column;
      this.currentSort.ascending = false;
    }

    document.getElementById('sortSelect').value = column;
    this.sortData();
    this.renderTable();
    this.updateSortIndicators();
    this.updateSortOrderButton();
  }

  /** Flips the sort direction and refreshes the table and both direction indicators. */
  toggleSortOrder() {
    this.currentSort.ascending = !this.currentSort.ascending;
    this.sortData();
    this.renderTable();
    // The header icon also encodes the direction, so it must follow the button.
    this.updateSortIndicators();
    this.updateSortOrderButton();
  }

  /** Sorts `filteredData` in place by the current column and direction; strings compare case-insensitively. */
  sortData() {
    const { column, ascending } = this.currentSort;

    this.filteredData.sort((a, b) => {
      let aValue = a[column];
      let bValue = b[column];

      if (typeof aValue === 'string') {
        aValue = aValue.toLowerCase();
        bValue = bValue.toLowerCase();
      }

      let comparison = 0;
      if (aValue > bValue) comparison = 1;
      if (aValue < bValue) comparison = -1;

      return ascending ? comparison : -comparison;
    });
  }

  /** Renders `filteredData` into `#tableBody`, or a placeholder row when it is empty. */
  renderTable() {
    const tableBody = document.getElementById('tableBody');

    if (this.filteredData.length === 0) {
      tableBody.innerHTML = this.createMessageRow(
        'fa-search',
        'No models found matching your search criteria'
      );
      return;
    }

    tableBody.innerHTML = this.filteredData
      .map((model) => this.createTableRow(model))
      .join('');
  }

  /**
   * Builds the `<tr>` markup for one model.
   * @param {object} model - Entry with `name`, `overall_score` and every METRIC_COLUMNS key.
   * @returns {string} HTML for the row.
   */
  createTableRow(model) {
    const metricCells = LeaderboardApp.METRIC_COLUMNS
      .map(({ key, isPercentage }) => `
        <td class="metric-col">
          ${this.createMetricCell(model[key], isPercentage)}
        </td>`)
      .join('');

    return `
      <tr>
        <td class="model-col">
          <span class="model-name">${this.escapeHtml(model.name)}</span>
        </td>
        <td class="score-col">
          <span class="score ${this.getScoreClass(model.overall_score)}">
            ${model.overall_score.toFixed(3)}
          </span>
        </td>${metricCells}
      </tr>
    `;
  }

  /**
   * Builds a single full-width row carrying an icon and a message.
   * @param {string} iconClass - Font Awesome icon class, e.g. 'fa-search'.
   * @param {string} message - Plain text to display; it is HTML-escaped.
   * @returns {string} HTML for the row.
   */
  createMessageRow(iconClass, message) {
    return `
      <tr>
        <td colspan="${LeaderboardApp.COLUMN_COUNT}" class="no-results">
          <i class="fas ${iconClass}"></i>
          ${this.escapeHtml(message)}
        </td>
      </tr>
    `;
  }

  /**
   * Escapes text for safe interpolation into HTML content or attribute values.
   * @param {*} text - Value to escape; coerced to a string.
   * @returns {string} The escaped text.
   */
  escapeHtml(text) {
    return String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  /**
   * Builds the bar-plus-label markup for one metric value.
   * @param {number} value - Metric value: 0-100 when `isPercentage`, otherwise 0-1.
   * @param {boolean} [isPercentage=false] - Whether `value` is on the 0-100 scale.
   * @returns {string} HTML for the cell contents.
   */
  createMetricCell(value, isPercentage = false) {
    const displayValue = isPercentage ? `${value.toFixed(1)}%` : value.toFixed(3);

    const normalizedValue = isPercentage ? value / 100 : value;
    const scoreClass = this.getScoreClass(normalizedValue);
    // Clamp so an out-of-range value cannot produce an invalid or overflowing bar.
    const barWidth = Math.min(100, Math.max(0, isPercentage ? value : value * 100));

    return `
      <div class="metric" data-tooltip="${displayValue}">
        <div class="metric-bar ${scoreClass}" style="width: ${barWidth}%"></div>
        <span>${displayValue}</span>
      </div>
    `;
  }

  /**
   * Maps a 1-based rank to its highlight class.
   * @param {number} rank
   * @returns {string} 'top-1', 'top-3', 'top-5' or '' for lower ranks.
   */
  getRankClass(rank) {
    if (rank === 1) return 'top-1';
    if (rank <= 3) return 'top-3';
    if (rank <= 5) return 'top-5';
    return '';
  }

  /**
   * Maps a 0-1 score to its colour class.
   * @param {number} score
   * @returns {string} 'excellent' (>= 0.7), 'good' (>= 0.6), 'average' (>= 0.5) or 'poor'.
   */
  getScoreClass(score) {
    if (score >= 0.7) return 'excellent';
    if (score >= 0.6) return 'good';
    if (score >= 0.5) return 'average';
    return 'poor';
  }

  /** Resets every sortable header icon, then marks the active column with its direction. */
  updateSortIndicators() {
    const headers = document.querySelectorAll('.sortable');
    headers.forEach((header) => {
      header.classList.remove('active');
      const icon = header.querySelector('.sort-icon');
      if (icon) {
        icon.className = 'fas fa-sort sort-icon';
      }
    });

    const activeHeader = document.querySelector(`[data-column="${this.currentSort.column}"]`);
    if (!activeHeader) {
      return;
    }

    activeHeader.classList.add('active');
    const activeIcon = activeHeader.querySelector('.sort-icon');
    if (activeIcon) {
      activeIcon.className = this.currentSort.ascending
        ? 'fas fa-sort-up sort-icon'
        : 'fas fa-sort-down sort-icon';
    }
  }

  /** Syncs the sort-order button icon with the direction; its title names the action a click performs. */
  updateSortOrderButton() {
    const sortOrderButton = document.getElementById('sortOrder');
    const icon = sortOrderButton.querySelector('i');
    if (icon) {
      icon.className = this.currentSort.ascending
        ? 'fas fa-sort-amount-up'
        : 'fas fa-sort-amount-down';
    }

    sortOrderButton.title = this.currentSort.ascending
      ? 'Sort descending'
      : 'Sort ascending';
  }

  /** Writes the data set's `lastUpdated` date (or today's date when absent) into `#lastUpdated`. */
  updateLastUpdated() {
    const lastUpdatedElement = document.getElementById('lastUpdated');
    const dateFormat = { year: 'numeric', month: 'long', day: 'numeric' };

    if (this.data && this.data.lastUpdated) {
      const raw = this.data.lastUpdated;
      const date = new Date(raw);
      // A date-only ISO string is parsed as UTC midnight; formatting it in the
      // viewer's zone would show the previous day west of UTC, so format in UTC.
      const isDateOnly = typeof raw === 'string' && /^\d{4}-\d{2}-\d{2}$/.test(raw);
      const options = isDateOnly
        ? Object.assign({ timeZone: 'UTC' }, dateFormat)
        : dateFormat;
      lastUpdatedElement.textContent = date.toLocaleDateString('en-US', options);
    } else {
      lastUpdatedElement.textContent = new Date().toLocaleDateString('en-US', dateFormat);
    }
  }

  /**
   * Replaces the table body with a single error row.
   * @param {string} message - Plain-text error message; it is HTML-escaped.
   */
  showError(message) {
    const tableBody = document.getElementById('tableBody');
    // init() calls this from its catch block; never throw from here.
    if (!tableBody) {
      return;
    }
    tableBody.innerHTML = this.createMessageRow('fa-exclamation-triangle', message);
  }
}
|
|
|
|
|
/**
 * Copies the paper's BibTeX entry to the clipboard.
 *
 * Uses the async Clipboard API when it is available in a secure context and
 * falls back to `fallbackCopy` otherwise, or when the API call rejects.
 * `showCopySuccess` is invoked after a successful Clipboard API write.
 */
function copyCitation() {
  // arXiv identifiers are YYMM.NNNNN, so 2508.20453 is an August 2025 preprint;
  // the year field reflects that. The citation key is an arbitrary label and is
  // left unchanged so existing references to it keep resolving.
  const citationText = `@article{wang2024mcpbench,
  title={MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers},
  author={Wang, Zhenting and Chang, Qi and Patel, Hemani and Biju, Shashank and Wu, Cheng-En and Liu, Quan and Ding, Aolin and Rezazadeh, Alireza and Shah, Ankit and Bao, Yujia and Siow, Eugene},
  journal={arXiv preprint arXiv:2508.20453},
  year={2025}
}`;

  if (navigator.clipboard && window.isSecureContext) {
    navigator.clipboard.writeText(citationText).then(() => {
      showCopySuccess();
    }).catch((err) => {
      console.error('Failed to copy citation:', err);
      fallbackCopy(citationText);
    });
  } else {
    fallbackCopy(citationText);
  }
}
|
|
|
/**
 * Copies text through a temporary off-screen textarea and
 * `document.execCommand('copy')`, for contexts without the Clipboard API.
 *
 * `showCopySuccess` is called only when the copy command reports success.
 * Failures are logged, and the textarea is always removed again.
 *
 * @param {string} text - Text to place on the clipboard.
 */
function fallbackCopy(text) {
  const textArea = document.createElement('textarea');
  textArea.value = text;
  // Keep the element far off-screen so it never becomes visible.
  textArea.style.position = 'fixed';
  textArea.style.left = '-999999px';
  textArea.style.top = '-999999px';
  document.body.appendChild(textArea);

  try {
    textArea.focus();
    textArea.select();

    // execCommand signals an unsupported or disabled command by returning
    // false rather than throwing, so the result has to be checked.
    if (document.execCommand('copy')) {
      showCopySuccess();
    } else {
      console.error('Fallback copy failed: the copy command was rejected');
    }
  } catch (err) {
    console.error('Fallback copy failed:', err);
  } finally {
    document.body.removeChild(textArea);
  }
}
|
|
|
// Per-button record of the pending "Copied!" feedback: the markup to restore
// and the id of the timer that will restore it.
const copyFeedbackState = new WeakMap();

/**
 * Shows a temporary "Copied!" state on the `.copy-citation-btn` button and
 * restores the button's original markup and background after two seconds.
 *
 * Calling it again while the feedback is showing restarts the two-second
 * window and still restores the original markup, not the "Copied!" markup.
 * Does nothing when the button is not on the page.
 */
function showCopySuccess() {
  const button = document.querySelector('.copy-citation-btn');
  if (!button) {
    return;
  }

  const pending = copyFeedbackState.get(button);
  if (pending) {
    clearTimeout(pending.timerId);
  }
  // While feedback is showing, button.innerHTML is the "Copied!" markup, so
  // the markup to restore must come from the earlier record.
  const originalHtml = pending ? pending.originalHtml : button.innerHTML;

  button.innerHTML = '<i class="fas fa-check"></i> Copied!';
  button.style.backgroundColor = '#10b981';

  const timerId = setTimeout(() => {
    button.innerHTML = originalHtml;
    button.style.backgroundColor = '';
    copyFeedbackState.delete(button);
  }, 2000);

  copyFeedbackState.set(button, { originalHtml, timerId });
}
|
|
|
// Start the app once the DOM is parsed. If this script runs after
// DOMContentLoaded has already fired (for example when it is injected or
// loaded late), the listener would never be called, so start immediately.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => {
    new LeaderboardApp();
  });
} else {
  new LeaderboardApp();
}
|
|
|
// When the browser supports service workers, register /sw.js after the
// window's load event and log the outcome either way.
if ('serviceWorker' in navigator) {
  const registerServiceWorker = async () => {
    try {
      const registration = await navigator.serviceWorker.register('/sw.js');
      console.log('SW registered: ', registration);
    } catch (registrationError) {
      console.log('SW registration failed: ', registrationError);
    }
  };

  window.addEventListener('load', registerServiceWorker);
}