File size: 11,681 Bytes
4966301 d7a53fe 4966301 a399453 4966301 d7a53fe 4966301 d7a53fe a399453 4966301 a399453 4966301 a399453 4966301 d7a53fe 4966301 a399453 54dfcdf a399453 54dfcdf a399453 54dfcdf a399453 4966301 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import base64
import os
import re
from pathlib import Path

import gradio as gr
def encode_image_to_base64(image_path):
    """Convert an image file to a base64 ``data:`` URI for embedding in HTML.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``. The MIME type
        is chosen from the file extension (case-insensitive); unrecognised
        extensions fall back to ``image/png``. An empty string is returned
        when the file cannot be read (missing, a directory, unreadable).
    """
    # Try the read directly instead of checking os.path.exists() first: the
    # check-then-open pattern races with file removal and still raises for
    # directories or unreadable files, whereas callers expect "" on failure.
    try:
        with open(image_path, "rb") as img_file:
            encoded = base64.b64encode(img_file.read()).decode("ascii")
    except OSError:
        return ""

    # Map the file extension to a MIME type, defaulting to PNG.
    ext = Path(image_path).suffix.lower()
    mime_type = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }.get(ext, 'image/png')
    return f"data:{mime_type};base64,{encoded}"
def generate_table_html():
    """Build the leaderboard ``<tr>`` rows from the embedded benchmark data.

    Returns:
        One ``<tr>`` element per model, joined with newlines and ordered by
        overall score from highest to lowest. Overall and judge-style scores
        are printed with three decimals; the three rate columns are printed
        with one decimal followed by ``%``.
    """
    fields = (
        "name", "overall_score", "valid_tool_name_rate", "schema_compliance",
        "execution_success", "task_fulfillment", "information_grounding",
        "tool_appropriateness", "parameter_accuracy", "dependency_awareness",
        "parallelism_efficiency",
    )
    # One record per model; values follow the order of `fields`.
    table = (
        ("gpt-5", 0.749, 100.0, 99.3, 99.1, 0.677, 0.828, 0.767, 0.749, 0.649, 0.339),
        ("o3", 0.715, 99.3, 99.9, 97.1, 0.641, 0.706, 0.724, 0.726, 0.592, 0.359),
        ("gpt-oss-120b", 0.692, 97.7, 98.8, 94.0, 0.636, 0.705, 0.691, 0.661, 0.576, 0.329),
        ("gemini-2.5-pro", 0.690, 99.4, 99.6, 96.9, 0.562, 0.725, 0.717, 0.670, 0.541, 0.329),
        ("claude-sonnet-4", 0.681, 100.0, 99.8, 98.8, 0.554, 0.676, 0.689, 0.671, 0.541, 0.328),
        ("qwen3-235b-a22b-2507", 0.678, 99.1, 99.3, 94.8, 0.549, 0.625, 0.688, 0.712, 0.542, 0.355),
        ("glm-4.5", 0.668, 99.7, 99.7, 97.4, 0.525, 0.682, 0.680, 0.661, 0.523, 0.297),
        ("gpt-oss-20b", 0.654, 98.8, 99.1, 93.6, 0.547, 0.623, 0.661, 0.638, 0.509, 0.309),
        ("kimi-k2", 0.629, 98.8, 98.1, 94.5, 0.502, 0.577, 0.631, 0.623, 0.448, 0.307),
        ("qwen3-30b-a3b-instruct-2507", 0.627, 99.2, 95.4, 94.4, 0.459, 0.536, 0.658, 0.646, 0.471, 0.318),
        ("gemini-2.5-flash-lite", 0.598, 98.7, 98.8, 91.1, 0.446, 0.569, 0.629, 0.564, 0.423, 0.262),
        ("gpt-4o", 0.595, 96.7, 87.6, 85.3, 0.477, 0.519, 0.588, 0.551, 0.423, 0.253),
        ("gemma-3-27b-it", 0.582, 98.4, 81.6, 85.5, 0.396, 0.495, 0.588, 0.530, 0.408, 0.251),
        ("llama-3-3-70b-instruct", 0.558, 99.5, 93.1, 91.5, 0.366, 0.476, 0.554, 0.486, 0.359, 0.244),
        ("gpt-4o-mini", 0.557, 95.5, 86.5, 84.0, 0.426, 0.453, 0.556, 0.499, 0.359, 0.230),
        ("mistral-small-2503", 0.530, 92.0, 95.6, 87.2, 0.344, 0.438, 0.528, 0.462, 0.345, 0.220),
        ("llama-3-1-70b-instruct", 0.510, 99.2, 90.5, 92.5, 0.314, 0.432, 0.523, 0.433, 0.303, 0.190),
        ("nova-micro-v1", 0.508, 96.0, 93.1, 87.8, 0.339, 0.419, 0.504, 0.428, 0.315, 0.212),
        ("llama-3-2-90b-vision-instruct", 0.495, 99.6, 85.0, 90.9, 0.293, 0.444, 0.515, 0.427, 0.267, 0.173),
        ("llama-3-1-8b-instruct", 0.428, 96.1, 89.4, 90.9, 0.261, 0.295, 0.352, 0.310, 0.221, 0.141),
    )
    entries = [dict(zip(fields, record)) for record in table]

    # Best overall score first.
    ranked = sorted(entries, key=lambda entry: entry['overall_score'], reverse=True)

    # Columns shown as percentages vs. columns shown as 3-decimal scores,
    # listed in the order they appear in each row.
    percent_keys = ("valid_tool_name_rate", "schema_compliance", "execution_success")
    score_keys = (
        "task_fulfillment", "information_grounding", "tool_appropriateness",
        "parameter_accuracy", "dependency_awareness", "parallelism_efficiency",
    )

    def render(entry):
        """Format a single model entry as one ``<tr>`` element."""
        cells = [
            f'<td class="model-col"><span class="model-name">{entry["name"]}</span></td>',
            f'<td class="score-col"><span class="score">{entry["overall_score"]:.3f}</span></td>',
        ]
        cells.extend(
            f'<td class="metric-col">{entry[key]:.1f}%</td>' for key in percent_keys
        )
        cells.extend(
            f'<td class="metric-col">{entry[key]:.3f}</td>' for key in score_keys
        )
        return '\n'.join(['<tr>', *cells, '</tr>'])

    return '\n'.join(render(entry) for entry in ranked)
def create_gradio_app():
    """
    Gradio app to serve the static HTML leaderboard with embedded images
    This is required for Hugging Face Spaces deployment

    Reads ``index.html`` and ``style.css`` from the current working
    directory, inlines the two PNG images as base64 data URIs, fills the
    ``<tbody id="tableBody">`` element with the rows produced by
    ``generate_table_html()``, replaces the stylesheet ``<link>`` with an
    inline ``<style>`` block, and wraps the result in a ``gr.HTML``
    component inside a ``gr.Blocks`` app.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).

    Raises:
        OSError: If ``index.html`` or ``style.css`` cannot be opened.
    """
    # Read the HTML content
    with open('index.html', 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Read the CSS content
    with open('style.css', 'r', encoding='utf-8') as f:
        css_content = f.read()

    # Replace image references with base64 embedded versions. When an image
    # cannot be encoded (the helper returns ""), keep the original reference
    # rather than rewriting the tag to an empty src attribute.
    for image_name in ('mcp-bench.png', 'ranking.png'):
        data_uri = encode_image_to_base64(image_name)
        if data_uri:
            html_content = html_content.replace(
                f'src="{image_name}"',
                f'src="{data_uri}"'
            )

    # Generate static table HTML
    table_html = generate_table_html()

    # Replace the contents of the placeholder tbody with pre-generated rows.
    # A regex is used instead of an exact string match so that the
    # substitution does not depend on the precise whitespace or comment text
    # inside the placeholder. The replacement is passed as a function so the
    # row HTML is inserted literally (no backslash/group expansion).
    tbody_pattern = re.compile(r'<tbody id="tableBody">.*?</tbody>', re.DOTALL)
    combined_html = tbody_pattern.sub(
        lambda _match: f'<tbody id="tableBody">{table_html}</tbody>',
        html_content,
    )

    # Inline the stylesheet so the page does not need a separate CSS file.
    combined_html = combined_html.replace(
        '<link rel="stylesheet" href="style.css">',
        f'<style>{css_content}</style>'
    )

    # The HTML already has the minimal JavaScript for citation copy and date update
    # Create the Gradio interface
    with gr.Blocks(
        title="MCP-Bench Leaderboard",
        theme=gr.themes.Soft(),
        css="""
.gradio-container { padding: 0 !important; }
#leaderboard-container {
width: 100% !important;
max-width: none !important;
margin: 0 !important;
padding: 0 !important;
}
#leaderboard-container * {
box-sizing: border-box;
}
/* Force all buttons to have same blue color and remove underlines */
#leaderboard-container .paper-link {
color: white !important;
background-color: #4285F4 !important;
text-decoration: none !important;
}
#leaderboard-container .paper-link:hover {
color: white !important;
background-color: #3367D6 !important;
text-decoration: none !important;
}
#leaderboard-container .paper-link:focus,
#leaderboard-container .paper-link:visited,
#leaderboard-container .paper-link:active {
color: white !important;
background-color: #4285F4 !important;
text-decoration: none !important;
}
/* Fix font issues for authors */
#leaderboard-container .paper-authors {
font-family: Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}
/* Remove any underlines from links globally */
#leaderboard-container a {
text-decoration: none !important;
}
"""
    ) as demo:
        gr.HTML(
            combined_html,
            elem_id="leaderboard-container"
        )
    return demo
if __name__ == "__main__":
    # Build the leaderboard app and launch it when run as a script.
    demo = create_gradio_app()
    demo.launch()