File size: 11,681 Bytes
4966301
 
d7a53fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966301
a399453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4966301
 
d7a53fe
4966301
 
 
 
 
 
 
 
 
 
 
d7a53fe
 
 
 
 
 
 
 
 
 
 
 
 
a399453
 
 
 
4966301
a399453
 
 
4966301
 
 
 
a399453
 
4966301
 
d7a53fe
4966301
a399453
 
 
 
 
 
 
 
 
 
 
54dfcdf
a399453
 
 
 
 
54dfcdf
 
 
 
 
a399453
 
 
 
54dfcdf
a399453
 
 
 
 
 
 
 
 
 
 
4966301
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import gradio as gr
import os
import base64
from pathlib import Path

def encode_image_to_base64(image_path):
    """Return the image at *image_path* as a ``data:`` URI for inline HTML.

    The MIME type is picked from the file extension (case-insensitive);
    extensions that are not in the lookup table fall back to ``image/png``.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A ``data:<mime>;base64,<payload>`` string, or ``""`` when the file
        does not exist.

    Raises:
        OSError: For failures other than a missing file (for example the
            path names a directory or the file cannot be read).
    """
    mime_by_suffix = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }

    # Open directly instead of testing os.path.exists() first: a file that
    # vanishes between the check and the open would otherwise raise rather
    # than produce the empty-string fallback callers rely on.
    try:
        with open(image_path, "rb") as img_file:
            raw = img_file.read()
    except (FileNotFoundError, NotADirectoryError):
        return ""

    mime_type = mime_by_suffix.get(Path(image_path).suffix.lower(), 'image/png')
    payload = base64.b64encode(raw).decode("ascii")
    return f"data:{mime_type};base64,{payload}"

def generate_table_html():
    """Build the ``<tr>`` markup for every leaderboard entry.

    Returns:
        One ``<tr>`` element per model, joined by newlines and ordered from
        the highest to the lowest overall score.
    """
    field_names = (
        "name",
        "overall_score",
        "valid_tool_name_rate",
        "schema_compliance",
        "execution_success",
        "task_fulfillment",
        "information_grounding",
        "tool_appropriateness",
        "parameter_accuracy",
        "dependency_awareness",
        "parallelism_efficiency",
    )

    # One tuple per model; values are listed in ``field_names`` order.
    raw_records = [
        ("gpt-5", 0.749, 100.0, 99.3, 99.1, 0.677, 0.828, 0.767, 0.749, 0.649, 0.339),
        ("o3", 0.715, 99.3, 99.9, 97.1, 0.641, 0.706, 0.724, 0.726, 0.592, 0.359),
        ("gpt-oss-120b", 0.692, 97.7, 98.8, 94.0, 0.636, 0.705, 0.691, 0.661, 0.576, 0.329),
        ("gemini-2.5-pro", 0.690, 99.4, 99.6, 96.9, 0.562, 0.725, 0.717, 0.670, 0.541, 0.329),
        ("claude-sonnet-4", 0.681, 100.0, 99.8, 98.8, 0.554, 0.676, 0.689, 0.671, 0.541, 0.328),
        ("qwen3-235b-a22b-2507", 0.678, 99.1, 99.3, 94.8, 0.549, 0.625, 0.688, 0.712, 0.542, 0.355),
        ("glm-4.5", 0.668, 99.7, 99.7, 97.4, 0.525, 0.682, 0.680, 0.661, 0.523, 0.297),
        ("gpt-oss-20b", 0.654, 98.8, 99.1, 93.6, 0.547, 0.623, 0.661, 0.638, 0.509, 0.309),
        ("kimi-k2", 0.629, 98.8, 98.1, 94.5, 0.502, 0.577, 0.631, 0.623, 0.448, 0.307),
        ("qwen3-30b-a3b-instruct-2507", 0.627, 99.2, 95.4, 94.4, 0.459, 0.536, 0.658, 0.646, 0.471, 0.318),
        ("gemini-2.5-flash-lite", 0.598, 98.7, 98.8, 91.1, 0.446, 0.569, 0.629, 0.564, 0.423, 0.262),
        ("gpt-4o", 0.595, 96.7, 87.6, 85.3, 0.477, 0.519, 0.588, 0.551, 0.423, 0.253),
        ("gemma-3-27b-it", 0.582, 98.4, 81.6, 85.5, 0.396, 0.495, 0.588, 0.530, 0.408, 0.251),
        ("llama-3-3-70b-instruct", 0.558, 99.5, 93.1, 91.5, 0.366, 0.476, 0.554, 0.486, 0.359, 0.244),
        ("gpt-4o-mini", 0.557, 95.5, 86.5, 84.0, 0.426, 0.453, 0.556, 0.499, 0.359, 0.230),
        ("mistral-small-2503", 0.530, 92.0, 95.6, 87.2, 0.344, 0.438, 0.528, 0.462, 0.345, 0.220),
        ("llama-3-1-70b-instruct", 0.510, 99.2, 90.5, 92.5, 0.314, 0.432, 0.523, 0.433, 0.303, 0.190),
        ("nova-micro-v1", 0.508, 96.0, 93.1, 87.8, 0.339, 0.419, 0.504, 0.428, 0.315, 0.212),
        ("llama-3-2-90b-vision-instruct", 0.495, 99.6, 85.0, 90.9, 0.293, 0.444, 0.515, 0.427, 0.267, 0.173),
        ("llama-3-1-8b-instruct", 0.428, 96.1, 89.4, 90.9, 0.261, 0.295, 0.352, 0.310, 0.221, 0.141),
    ]

    # Highest overall score first.
    leaderboard = sorted(
        (dict(zip(field_names, record)) for record in raw_records),
        key=lambda entry: entry["overall_score"],
        reverse=True,
    )

    # These three columns are rendered with one decimal and a "%" suffix;
    # the remaining metric columns are rendered with three decimals.
    percent_fields = field_names[2:5]
    fraction_fields = field_names[5:]

    # Whitespace inside each row mirrors the layout of the emitted markup.
    cell_indent = " " * 12
    closing_indent = " " * 8

    def render_row(entry):
        """Return the ``<tr>`` element for a single leaderboard entry."""
        cells = [
            f'<td class="model-col"><span class="model-name">{entry["name"]}</span></td>',
            f'<td class="score-col"><span class="score">{entry["overall_score"]:.3f}</span></td>',
        ]
        cells.extend(
            f'<td class="metric-col">{entry[field]:.1f}%</td>'
            for field in percent_fields
        )
        cells.extend(
            f'<td class="metric-col">{entry[field]:.3f}</td>'
            for field in fraction_fields
        )
        body = "".join(f"{cell_indent}{cell}\n" for cell in cells)
        return f"<tr>\n{body}{closing_indent}</tr>"

    return "\n".join(map(render_row, leaderboard))

def create_gradio_app():
    """
    Assemble the static leaderboard page and wrap it in a Gradio Blocks app.

    ``index.html`` and ``style.css`` are read from the working directory.
    The two image references are swapped for inline base64 data URIs, the
    placeholder table body is filled with pre-rendered rows, and the
    stylesheet ``<link>`` is replaced by an inline ``<style>`` element.
    (The original note states that this wrapper is required for Hugging
    Face Spaces deployment.)

    Returns:
        The ``gr.Blocks`` instance holding the assembled HTML.
    """
    # Load the page template and its stylesheet.
    page = Path('index.html').read_text(encoding='utf-8')
    stylesheet = Path('style.css').read_text(encoding='utf-8')

    # Inline versions of the two images referenced by the template.
    diagram_uri = encode_image_to_base64('mcp-bench.png')
    ranking_uri = encode_image_to_base64('ranking.png')

    # Pre-rendered table rows that stand in for the empty <tbody>.
    rows_markup = generate_table_html()

    # (needle, replacement) pairs, applied strictly in this order.
    substitutions = (
        ('src="mcp-bench.png"', f'src="{diagram_uri}"'),
        ('src="ranking.png"', f'src="{ranking_uri}"'),
        (
            '<tbody id="tableBody">\n                    <!-- Table rows will be generated by JavaScript -->\n                </tbody>',
            f'<tbody id="tableBody">{rows_markup}</tbody>',
        ),
        (
            '<link rel="stylesheet" href="style.css">',
            f'<style>{stylesheet}</style>',
        ),
    )
    for needle, replacement in substitutions:
        page = page.replace(needle, replacement)

    # No script is injected here; per the original note, index.html carries
    # its own minimal JavaScript (citation copy and date update).

    # Overrides passed to Gradio so the embedded page fills the container
    # and its links keep their intended look.
    container_css = """
        .gradio-container { padding: 0 !important; }
        #leaderboard-container { 
            width: 100% !important; 
            max-width: none !important;
            margin: 0 !important;
            padding: 0 !important;
        }
        #leaderboard-container * { 
            box-sizing: border-box; 
        }
        /* Force all buttons to have same blue color and remove underlines */
        #leaderboard-container .paper-link {
            color: white !important;
            background-color: #4285F4 !important;
            text-decoration: none !important;
        }
        #leaderboard-container .paper-link:hover {
            color: white !important;
            background-color: #3367D6 !important;
            text-decoration: none !important;
        }
        #leaderboard-container .paper-link:focus,
        #leaderboard-container .paper-link:visited,
        #leaderboard-container .paper-link:active {
            color: white !important;
            background-color: #4285F4 !important;
            text-decoration: none !important;
        }
        /* Fix font issues for authors */
        #leaderboard-container .paper-authors {
            font-family: Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
        }
        /* Remove any underlines from links globally */
        #leaderboard-container a {
            text-decoration: none !important;
        }
        """

    # Build the Gradio interface around the single HTML component.
    with gr.Blocks(
        title="MCP-Bench Leaderboard",
        theme=gr.themes.Soft(),
        css=container_css,
    ) as app:
        gr.HTML(page, elem_id="leaderboard-container")

    return app

# Script entry point: build the Blocks app and call its launch() method.
# Nothing here runs when the module is merely imported.
if __name__ == "__main__":
    demo = create_gradio_app()
    demo.launch()