/**
 * Static leaderboard payload rendered by LeaderboardApp.
 *
 * `lastUpdated` is a date-only ISO string. Each entry in `models` carries a
 * `name`, three rate metrics that the UI formats as percentages
 * (`valid_tool_name_rate`, `schema_compliance`, `execution_success`) and
 * seven scores that the UI formats with three decimals.
 */
const LEADERBOARD_DATA = {
  "lastUpdated": "2025-09-05",
  "models": [
    {
      "name": "llama-3-1-8b-instruct",
      "valid_tool_name_rate": 96.1, "schema_compliance": 89.4, "execution_success": 90.9,
      "task_fulfillment": 0.261, "information_grounding": 0.295, "tool_appropriateness": 0.352,
      "parameter_accuracy": 0.310, "dependency_awareness": 0.221, "parallelism_efficiency": 0.141,
      "overall_score": 0.428
    },
    {
      "name": "llama-3-2-90b-vision-instruct",
      "valid_tool_name_rate": 99.6, "schema_compliance": 85.0, "execution_success": 90.9,
      "task_fulfillment": 0.293, "information_grounding": 0.444, "tool_appropriateness": 0.515,
      "parameter_accuracy": 0.427, "dependency_awareness": 0.267, "parallelism_efficiency": 0.173,
      "overall_score": 0.495
    },
    {
      "name": "nova-micro-v1",
      "valid_tool_name_rate": 96.0, "schema_compliance": 93.1, "execution_success": 87.8,
      "task_fulfillment": 0.339, "information_grounding": 0.419, "tool_appropriateness": 0.504,
      "parameter_accuracy": 0.428, "dependency_awareness": 0.315, "parallelism_efficiency": 0.212,
      "overall_score": 0.508
    },
    {
      "name": "llama-3-1-70b-instruct",
      "valid_tool_name_rate": 99.2, "schema_compliance": 90.5, "execution_success": 92.5,
      "task_fulfillment": 0.314, "information_grounding": 0.432, "tool_appropriateness": 0.523,
      "parameter_accuracy": 0.451, "dependency_awareness": 0.287, "parallelism_efficiency": 0.191,
      "overall_score": 0.510
    },
    {
      "name": "mistral-small-2503",
      "valid_tool_name_rate": 96.4, "schema_compliance": 95.6, "execution_success": 86.2,
      "task_fulfillment": 0.373, "information_grounding": 0.445, "tool_appropriateness": 0.537,
      "parameter_accuracy": 0.446, "dependency_awareness": 0.349, "parallelism_efficiency": 0.232,
      "overall_score": 0.530
    },
    {
      "name": "gpt-4o-mini",
      "valid_tool_name_rate": 97.5, "schema_compliance": 98.1, "execution_success": 93.9,
      "task_fulfillment": 0.374, "information_grounding": 0.500, "tool_appropriateness": 0.555,
      "parameter_accuracy": 0.544, "dependency_awareness": 0.352, "parallelism_efficiency": 0.201,
      "overall_score": 0.557
    },
    {
      "name": "llama-3-3-70b-instruct",
      "valid_tool_name_rate": 99.5, "schema_compliance": 93.8, "execution_success": 91.6,
      "task_fulfillment": 0.349, "information_grounding": 0.493, "tool_appropriateness": 0.583,
      "parameter_accuracy": 0.525, "dependency_awareness": 0.355, "parallelism_efficiency": 0.262,
      "overall_score": 0.558
    },
    {
      "name": "gemma-3-27b-it",
      "valid_tool_name_rate": 98.8, "schema_compliance": 97.6, "execution_success": 94.4,
      "task_fulfillment": 0.378, "information_grounding": 0.530, "tool_appropriateness": 0.608,
      "parameter_accuracy": 0.572, "dependency_awareness": 0.383, "parallelism_efficiency": 0.249,
      "overall_score": 0.582
    },
    {
      "name": "gpt-4o",
      "valid_tool_name_rate": 98.9, "schema_compliance": 98.3, "execution_success": 92.8,
      "task_fulfillment": 0.394, "information_grounding": 0.542, "tool_appropriateness": 0.627,
      "parameter_accuracy": 0.587, "dependency_awareness": 0.405, "parallelism_efficiency": 0.272,
      "overall_score": 0.595
    },
    {
      "name": "gemini-2.5-flash-lite",
      "valid_tool_name_rate": 99.4, "schema_compliance": 97.8, "execution_success": 94.3,
      "task_fulfillment": 0.412, "information_grounding": 0.577, "tool_appropriateness": 0.627,
      "parameter_accuracy": 0.597, "dependency_awareness": 0.404, "parallelism_efficiency": 0.226,
      "overall_score": 0.598
    },
    {
      "name": "qwen3-30b-a3b-instruct-2507",
      "valid_tool_name_rate": 99.0, "schema_compliance": 98.4, "execution_success": 92.3,
      "task_fulfillment": 0.481, "information_grounding": 0.530, "tool_appropriateness": 0.658,
      "parameter_accuracy": 0.638, "dependency_awareness": 0.473, "parallelism_efficiency": 0.303,
      "overall_score": 0.627
    },
    {
      "name": "kimi-k2",
      "valid_tool_name_rate": 98.8, "schema_compliance": 98.1, "execution_success": 94.5,
      "task_fulfillment": 0.502, "information_grounding": 0.577, "tool_appropriateness": 0.631,
      "parameter_accuracy": 0.623, "dependency_awareness": 0.448, "parallelism_efficiency": 0.307,
      "overall_score": 0.629
    },
    {
      "name": "gpt-oss-20b",
      "valid_tool_name_rate": 98.8, "schema_compliance": 99.1, "execution_success": 93.6,
      "task_fulfillment": 0.547, "information_grounding": 0.623, "tool_appropriateness": 0.661,
      "parameter_accuracy": 0.638, "dependency_awareness": 0.509, "parallelism_efficiency": 0.309,
      "overall_score": 0.654
    },
    {
      "name": "glm-4.5",
      "valid_tool_name_rate": 99.7, "schema_compliance": 99.7, "execution_success": 97.4,
      "task_fulfillment": 0.525, "information_grounding": 0.682, "tool_appropriateness": 0.680,
      "parameter_accuracy": 0.661, "dependency_awareness": 0.523, "parallelism_efficiency": 0.297,
      "overall_score": 0.668
    },
    {
      "name": "qwen3-235b-a22b-2507",
      "valid_tool_name_rate": 99.1, "schema_compliance": 99.3, "execution_success": 94.8,
      "task_fulfillment": 0.549, "information_grounding": 0.625, "tool_appropriateness": 0.688,
      "parameter_accuracy": 0.712, "dependency_awareness": 0.542, "parallelism_efficiency": 0.355,
      "overall_score": 0.678
    },
    {
      "name": "claude-sonnet-4",
      "valid_tool_name_rate": 100.0, "schema_compliance": 99.8, "execution_success": 98.8,
      "task_fulfillment": 0.554, "information_grounding": 0.676, "tool_appropriateness": 0.689,
      "parameter_accuracy": 0.671, "dependency_awareness": 0.541, "parallelism_efficiency": 0.328,
      "overall_score": 0.681
    },
    {
      "name": "gemini-2.5-pro",
      "valid_tool_name_rate": 99.4, "schema_compliance": 99.6, "execution_success": 96.9,
      "task_fulfillment": 0.562, "information_grounding": 0.725, "tool_appropriateness": 0.717,
      "parameter_accuracy": 0.670, "dependency_awareness": 0.541, "parallelism_efficiency": 0.329,
      "overall_score": 0.690
    },
    {
      "name": "gpt-oss-120b",
      "valid_tool_name_rate": 97.7, "schema_compliance": 98.8, "execution_success": 94.0,
      "task_fulfillment": 0.636, "information_grounding": 0.705, "tool_appropriateness": 0.691,
      "parameter_accuracy": 0.661, "dependency_awareness": 0.576, "parallelism_efficiency": 0.329,
      "overall_score": 0.692
    },
    {
      "name": "o3",
      "valid_tool_name_rate": 99.3, "schema_compliance": 99.9, "execution_success": 97.1,
      "task_fulfillment": 0.641, "information_grounding": 0.706, "tool_appropriateness": 0.724,
      "parameter_accuracy": 0.726, "dependency_awareness": 0.592, "parallelism_efficiency": 0.359,
      "overall_score": 0.715
    },
    {
      "name": "gpt-5",
      "valid_tool_name_rate": 100.0, "schema_compliance": 99.3, "execution_success": 99.1,
      "task_fulfillment": 0.677, "information_grounding": 0.828, "tool_appropriateness": 0.767,
      "parameter_accuracy": 0.749, "dependency_awareness": 0.649, "parallelism_efficiency": 0.339,
      "overall_score": 0.749
    }
  ]
};

/**
 * Controller for the leaderboard table: loads LEADERBOARD_DATA, wires the
 * search/sort controls, and renders rows into the `#tableBody` element.
 */
class LeaderboardApp {
  /** Sets the default sort (overall_score, descending) and starts init(). */
  constructor() {
    this.data = null;
    this.filteredData = null;
    this.currentSort = { column: 'overall_score', ascending: false };
    this.init();
  }

  /**
   * Loads the data, attaches listeners and performs the first render.
   * Any failure is logged and reported through showError().
   *
   * @returns {Promise<void>}
   */
  async init() {
    try {
      console.log('Starting LeaderboardApp initialization...');
      this.loadData();
      this.setupEventListeners();
      this.renderTable();
      this.updateLastUpdated();
      console.log('LeaderboardApp initialization complete');
    } catch (error) {
      console.error('Failed to initialize app:', error);
      this.showError('Failed to load leaderboard data');
    }
  }

  /**
   * Copies LEADERBOARD_DATA into `this.data` / `this.filteredData` and sorts
   * it, toggling the `active` class on `#loading` while doing so.
   *
   * @throws {Error} when the payload has no `models` array or sorting fails;
   *   the error is logged and rethrown so init() can show the error state
   *   instead of rendering an empty "no results" table.
   */
  loadData() {
    const loading = document.getElementById('loading');
    if (loading) {
      loading.classList.add('active');
    }

    try {
      console.log('Loading LEADERBOARD_DATA...');
      this.data = LEADERBOARD_DATA;
      if (!this.data || !Array.isArray(this.data.models)) {
        throw new TypeError('LEADERBOARD_DATA.models must be an array');
      }
      console.log('Data loaded, models count:', this.data.models.length);

      // Work on a copy so sorting never reorders the source array.
      this.filteredData = [...this.data.models];
      console.log('Filtered data initialized with', this.filteredData.length, 'models');

      this.sortData();
      console.log('Data sorted');
    } catch (error) {
      console.error('Error loading data:', error);
      throw error;
    } finally {
      if (loading) {
        loading.classList.remove('active');
      }
    }
  }

  /**
   * Attaches handlers to `#searchInput`, `#sortSelect`, `#sortOrder` and
   * every `.sortable` header. Controls missing from the page are skipped.
   */
  setupEventListeners() {
    const searchInput = document.getElementById('searchInput');
    if (searchInput) {
      searchInput.addEventListener('input', (e) => this.handleSearch(e.target.value));
    }

    const sortSelect = document.getElementById('sortSelect');
    if (sortSelect) {
      sortSelect.addEventListener('change', (e) => this.handleSortChange(e.target.value));
    }

    const sortOrder = document.getElementById('sortOrder');
    if (sortOrder) {
      sortOrder.addEventListener('click', () => this.toggleSortOrder());
    }

    document.querySelectorAll('.sortable').forEach((header) => {
      header.addEventListener('click', () => this.handleColumnSort(header.dataset.column));
    });
  }

  /**
   * Filters models by case-insensitive substring match on `name`, then
   * re-sorts and re-renders. An empty query restores the full list.
   *
   * @param {string} query - Raw text from the search box.
   */
  handleSearch(query) {
    const searchTerm = String(query == null ? '' : query).trim().toLowerCase();
    const models = this.data && Array.isArray(this.data.models) ? this.data.models : [];

    this.filteredData = searchTerm === ''
      ? [...models]
      : models.filter((model) => String(model.name).toLowerCase().includes(searchTerm));

    this.sortData();
    this.renderTable();
  }

  /**
   * Switches the sort column (direction unchanged) and refreshes the view.
   *
   * @param {string} column - Property name of the model objects to sort by.
   */
  handleSortChange(column) {
    this.currentSort.column = column;
    this.sortData();
    this.renderTable();
    this.updateSortIndicators();
  }

  /**
   * Handles a header click: flips the direction when the column is already
   * active, otherwise selects it in descending order.
   *
   * @param {string} column - Property name of the model objects to sort by.
   */
  handleColumnSort(column) {
    if (this.currentSort.column === column) {
      this.currentSort.ascending = !this.currentSort.ascending;
    } else {
      this.currentSort.column = column;
      this.currentSort.ascending = false;
    }

    const sortSelect = document.getElementById('sortSelect');
    if (sortSelect) {
      sortSelect.value = column;
    }

    this.sortData();
    this.renderTable();
    this.updateSortIndicators();
    this.updateSortOrderButton();
  }

  /** Flips the sort direction and refreshes the table and both indicators. */
  toggleSortOrder() {
    this.currentSort.ascending = !this.currentSort.ascending;
    this.sortData();
    this.renderTable();
    // The header arrow encodes the direction too, so it must follow the button.
    this.updateSortIndicators();
    this.updateSortOrderButton();
  }

  /**
   * Sorts `this.filteredData` in place by `this.currentSort`.
   * Strings compare case-insensitively; rows whose value for the column is
   * null/undefined are placed last in either direction. Does nothing when
   * there is no data array yet.
   */
  sortData() {
    if (!Array.isArray(this.filteredData)) {
      return;
    }

    const { column, ascending } = this.currentSort;
    const normalize = (value) => (typeof value === 'string' ? value.toLowerCase() : value);

    this.filteredData.sort((a, b) => {
      const left = a[column];
      const right = b[column];
      const leftMissing = left == null;
      const rightMissing = right == null;
      if (leftMissing || rightMissing) {
        return Number(leftMissing) - Number(rightMissing);
      }

      const aValue = normalize(left);
      const bValue = normalize(right);
      let comparison = 0;
      if (aValue > bValue) comparison = 1;
      if (aValue < bValue) comparison = -1;
      return ascending ? comparison : -comparison;
    });
  }

  /** Renders `this.filteredData` into `#tableBody`, or an empty-state message. */
  renderTable() {
    const tableBody = document.getElementById('tableBody');
    if (!tableBody) {
      console.error('tableBody element not found!');
      return;
    }

    if (!this.filteredData || this.filteredData.length === 0) {
      console.log('No filtered data to display');
      tableBody.innerHTML = ` No models found matching your search criteria `;
      return;
    }

    console.log(`Rendering ${this.filteredData.length} models`);
    tableBody.innerHTML = this.filteredData
      .map((model) => this.createTableRow(model))
      .join('');
  }

  /**
   * Builds the markup string for one model row.
   *
   * @param {object} model - One entry of `LEADERBOARD_DATA.models`.
   * @returns {string} Row content: name, overall score, then the nine metrics.
   */
  createTableRow(model) {
    const cells = [
      model.name,
      model.overall_score.toFixed(3),
      this.createMetricCell(model.valid_tool_name_rate, true),
      this.createMetricCell(model.schema_compliance, true),
      this.createMetricCell(model.execution_success, true),
      this.createMetricCell(model.task_fulfillment),
      this.createMetricCell(model.information_grounding),
      this.createMetricCell(model.tool_appropriateness),
      this.createMetricCell(model.parameter_accuracy),
      this.createMetricCell(model.dependency_awareness),
      this.createMetricCell(model.parallelism_efficiency),
    ];
    return ` ${cells.join(' ')} `;
  }

  /**
   * Formats one metric value for display.
   *
   * @param {number} value - Metric value.
   * @param {boolean} [isPercentage=false] - When true, format with one
   *   decimal and a `%` suffix; otherwise with three decimals.
   * @returns {string} The formatted value wrapped in newlines.
   */
  createMetricCell(value, isPercentage = false) {
    const displayValue = isPercentage ? `${value.toFixed(1)}%` : value.toFixed(3);
    // NOTE(review): the three values below are computed but never used in the
    // returned string as it stands — presumably intended for styling markup.
    // Confirm against the page template before removing them.
    const normalizedValue = isPercentage ? value / 100 : value;
    const scoreClass = this.getScoreClass(normalizedValue);
    const barWidth = isPercentage ? value : (value * 100);
    return `\n${displayValue}\n`;
  }

  /**
   * Maps a 1-based rank to a CSS class name.
   *
   * @param {number} rank
   * @returns {string} 'top-1', 'top-3', 'top-5' or '' for ranks beyond 5.
   */
  getRankClass(rank) {
    if (rank === 1) return 'top-1';
    if (rank <= 3) return 'top-3';
    if (rank <= 5) return 'top-5';
    return '';
  }

  /**
   * Maps a score to a CSS class name.
   *
   * @param {number} score
   * @returns {string} 'excellent' (>= 0.7), 'good' (>= 0.6),
   *   'average' (>= 0.5) or 'poor'.
   */
  getScoreClass(score) {
    if (score >= 0.7) return 'excellent';
    if (score >= 0.6) return 'good';
    if (score >= 0.5) return 'average';
    return 'poor';
  }

  /**
   * Resets every `.sortable` header to the neutral icon, then marks the
   * element whose `data-column` matches the current sort column as active
   * with an up/down icon. Headers without a `.sort-icon` child are tolerated.
   */
  updateSortIndicators() {
    document.querySelectorAll('.sortable').forEach((header) => {
      header.classList.remove('active');
      const icon = header.querySelector('.sort-icon');
      if (icon) {
        icon.className = 'fas fa-sort sort-icon';
      }
    });

    const activeHeader = document.querySelector(`[data-column="${this.currentSort.column}"]`);
    if (!activeHeader) {
      return;
    }

    activeHeader.classList.add('active');
    const activeIcon = activeHeader.querySelector('.sort-icon');
    if (activeIcon) {
      activeIcon.className = this.currentSort.ascending
        ? 'fas fa-sort-up sort-icon'
        : 'fas fa-sort-down sort-icon';
    }
  }

  /**
   * Updates the `#sortOrder` button icon to the current direction and its
   * title to the direction a click would switch to.
   */
  updateSortOrderButton() {
    const sortOrderButton = document.getElementById('sortOrder');
    if (!sortOrderButton) {
      return;
    }

    const icon = sortOrderButton.querySelector('i');
    if (icon) {
      icon.className = this.currentSort.ascending
        ? 'fas fa-sort-amount-up'
        : 'fas fa-sort-amount-down';
    }
    sortOrderButton.title = this.currentSort.ascending ? 'Sort descending' : 'Sort ascending';
  }

  /**
   * Writes the "last updated" date into `#lastUpdated` in en-US long form.
   * Falls back to today's date when the payload has no `lastUpdated`.
   */
  updateLastUpdated() {
    const lastUpdatedElement = document.getElementById('lastUpdated');
    if (!lastUpdatedElement) {
      return;
    }

    const format = { year: 'numeric', month: 'long', day: 'numeric' };
    const raw = this.data && this.data.lastUpdated;
    if (!raw) {
      lastUpdatedElement.textContent = new Date().toLocaleDateString('en-US', format);
      return;
    }

    const date = new Date(raw);
    if (Number.isNaN(date.getTime())) {
      // Unparseable value: show it verbatim rather than "Invalid Date".
      lastUpdatedElement.textContent = String(raw);
      return;
    }

    // A date-only ISO string is parsed as UTC midnight; formatting it in the
    // viewer's local zone would show the previous day west of UTC.
    const isDateOnly = typeof raw === 'string' && /^\d{4}-\d{2}-\d{2}$/.test(raw);
    const options = isDateOnly ? Object.assign({}, format, { timeZone: 'UTC' }) : format;
    lastUpdatedElement.textContent = date.toLocaleDateString('en-US', options);
  }

  /**
   * Replaces the table body content with an error message.
   *
   * @param {string} message - Text to display.
   */
  showError(message) {
    const tableBody = document.getElementById('tableBody');
    if (!tableBody) {
      console.error('tableBody element not found!');
      return;
    }
    tableBody.innerHTML = ` ${message} `;
  }
}

// Pending "Copied!" feedback: the label to restore and the timer restoring it.
const copyFeedbackState = { timerId: null, originalHtml: null };

/**
 * Copies the BibTeX citation to the clipboard, using the async Clipboard API
 * in secure contexts and the textarea fallback otherwise.
 */
function copyCitation() {
  // arXiv identifiers beginning with 2508 belong to August 2025, hence the year.
  const citationText = [
    '@article{wang2024mcpbench,',
    'title={MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers},',
    'author={Wang, Zhenting and Chang, Qi and Patel, Hemani and Biju, Shashank and Wu, Cheng-En and Liu, Quan and Ding, Aolin and Rezazadeh, Alireza and Shah, Ankit and Bao, Yujia and Siow, Eugene},',
    'journal={arXiv preprint arXiv:2508.20453},',
    'year={2025}',
    '}',
  ].join(' ');

  if (navigator.clipboard && window.isSecureContext) {
    navigator.clipboard.writeText(citationText).then(() => {
      showCopySuccess();
    }).catch((err) => {
      console.error('Failed to copy citation:', err);
      fallbackCopy(citationText);
    });
  } else {
    fallbackCopy(citationText);
  }
}

/**
 * Copies text through a temporary off-screen textarea and
 * `document.execCommand('copy')`. Success feedback is shown only when the
 * command reports success; the textarea is always removed.
 *
 * @param {string} text - Text to place on the clipboard.
 */
function fallbackCopy(text) {
  const textArea = document.createElement('textarea');
  textArea.value = text;
  textArea.style.position = 'fixed';
  textArea.style.left = '-999999px';
  textArea.style.top = '-999999px';
  document.body.appendChild(textArea);

  try {
    textArea.focus();
    textArea.select();
    // execCommand signals most failures by returning false, not by throwing.
    if (document.execCommand('copy')) {
      showCopySuccess();
    } else {
      console.error('Fallback copy failed:', new Error('execCommand("copy") returned false'));
    }
  } catch (err) {
    console.error('Fallback copy failed:', err);
  } finally {
    document.body.removeChild(textArea);
  }
}

/**
 * Shows "Copied!" on the `.copy-citation-btn` button for two seconds, then
 * restores its previous content. Repeated calls while the feedback is showing
 * restart the timer and still restore the content seen before the first call.
 */
function showCopySuccess() {
  const button = document.querySelector('.copy-citation-btn');
  if (!button) {
    return;
  }

  if (copyFeedbackState.timerId === null) {
    copyFeedbackState.originalHtml = button.innerHTML;
  } else {
    // Already showing feedback: keep the remembered label, restart the clock.
    clearTimeout(copyFeedbackState.timerId);
  }

  button.innerHTML = ' Copied!';
  button.style.backgroundColor = '#10b981';

  copyFeedbackState.timerId = setTimeout(() => {
    button.innerHTML = copyFeedbackState.originalHtml;
    button.style.backgroundColor = '';
    copyFeedbackState.timerId = null;
  }, 2000);
}

// Start the app once the DOM is ready. The guards let this file be loaded
// after DOMContentLoaded has already fired, or outside a browser entirely.
if (typeof document !== 'undefined') {
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', () => {
      new LeaderboardApp();
    });
  } else {
    new LeaderboardApp();
  }
}

// Register the service worker where the API is available.
if (typeof window !== 'undefined' && typeof navigator !== 'undefined' && 'serviceWorker' in navigator) {
  window.addEventListener('load', () => {
    navigator.serviceWorker.register('/sw.js')
      .then((registration) => {
        console.log('SW registered: ', registration);
      })
      .catch((registrationError) => {
        console.log('SW registration failed: ', registrationError);
      });
  });
}