import gradio as gr import os import base64 from pathlib import Path def encode_image_to_base64(image_path): """Convert image to base64 for embedding in HTML""" if os.path.exists(image_path): with open(image_path, "rb") as img_file: encoded = base64.b64encode(img_file.read()).decode() # Get file extension ext = Path(image_path).suffix.lower() mime_type = { '.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.gif': 'image/gif', '.webp': 'image/webp' }.get(ext, 'image/png') return f"data:{mime_type};base64,{encoded}" return "" def generate_table_html(): """Generate table HTML from data""" models = [ {"name": "gpt-5", "overall_score": 0.749, "valid_tool_name_rate": 100.0, "schema_compliance": 99.3, "execution_success": 99.1, "task_fulfillment": 0.677, "information_grounding": 0.828, "tool_appropriateness": 0.767, "parameter_accuracy": 0.749, "dependency_awareness": 0.649, "parallelism_efficiency": 0.339}, {"name": "o3", "overall_score": 0.715, "valid_tool_name_rate": 99.3, "schema_compliance": 99.9, "execution_success": 97.1, "task_fulfillment": 0.641, "information_grounding": 0.706, "tool_appropriateness": 0.724, "parameter_accuracy": 0.726, "dependency_awareness": 0.592, "parallelism_efficiency": 0.359}, {"name": "gpt-oss-120b", "overall_score": 0.692, "valid_tool_name_rate": 97.7, "schema_compliance": 98.8, "execution_success": 94.0, "task_fulfillment": 0.636, "information_grounding": 0.705, "tool_appropriateness": 0.691, "parameter_accuracy": 0.661, "dependency_awareness": 0.576, "parallelism_efficiency": 0.329}, {"name": "gemini-2.5-pro", "overall_score": 0.690, "valid_tool_name_rate": 99.4, "schema_compliance": 99.6, "execution_success": 96.9, "task_fulfillment": 0.562, "information_grounding": 0.725, "tool_appropriateness": 0.717, "parameter_accuracy": 0.670, "dependency_awareness": 0.541, "parallelism_efficiency": 0.329}, {"name": "claude-sonnet-4", "overall_score": 0.681, "valid_tool_name_rate": 100.0, "schema_compliance": 99.8, "execution_success": 98.8, "task_fulfillment": 0.554, "information_grounding": 0.676, "tool_appropriateness": 0.689, "parameter_accuracy": 0.671, "dependency_awareness": 0.541, "parallelism_efficiency": 0.328}, {"name": "qwen3-235b-a22b-2507", "overall_score": 0.678, "valid_tool_name_rate": 99.1, "schema_compliance": 99.3, "execution_success": 94.8, "task_fulfillment": 0.549, "information_grounding": 0.625, "tool_appropriateness": 0.688, "parameter_accuracy": 0.712, "dependency_awareness": 0.542, "parallelism_efficiency": 0.355}, {"name": "glm-4.5", "overall_score": 0.668, "valid_tool_name_rate": 99.7, "schema_compliance": 99.7, "execution_success": 97.4, "task_fulfillment": 0.525, "information_grounding": 0.682, "tool_appropriateness": 0.680, "parameter_accuracy": 0.661, "dependency_awareness": 0.523, "parallelism_efficiency": 0.297}, {"name": "gpt-oss-20b", "overall_score": 0.654, "valid_tool_name_rate": 98.8, "schema_compliance": 99.1, "execution_success": 93.6, "task_fulfillment": 0.547, "information_grounding": 0.623, "tool_appropriateness": 0.661, "parameter_accuracy": 0.638, "dependency_awareness": 0.509, "parallelism_efficiency": 0.309}, {"name": "kimi-k2", "overall_score": 0.629, "valid_tool_name_rate": 98.8, "schema_compliance": 98.1, "execution_success": 94.5, "task_fulfillment": 0.502, "information_grounding": 0.577, "tool_appropriateness": 0.631, "parameter_accuracy": 0.623, "dependency_awareness": 0.448, "parallelism_efficiency": 0.307}, {"name": "qwen3-30b-a3b-instruct-2507", "overall_score": 0.627, "valid_tool_name_rate": 99.2, "schema_compliance": 95.4, "execution_success": 94.4, "task_fulfillment": 0.459, "information_grounding": 0.536, "tool_appropriateness": 0.658, "parameter_accuracy": 0.646, "dependency_awareness": 0.471, "parallelism_efficiency": 0.318}, {"name": "gemini-2.5-flash-lite", "overall_score": 0.598, "valid_tool_name_rate": 98.7, "schema_compliance": 98.8, "execution_success": 91.1, "task_fulfillment": 0.446, "information_grounding": 0.569, "tool_appropriateness": 0.629, "parameter_accuracy": 0.564, "dependency_awareness": 0.423, "parallelism_efficiency": 0.262}, {"name": "gpt-4o", "overall_score": 0.595, "valid_tool_name_rate": 96.7, "schema_compliance": 87.6, "execution_success": 85.3, "task_fulfillment": 0.477, "information_grounding": 0.519, "tool_appropriateness": 0.588, "parameter_accuracy": 0.551, "dependency_awareness": 0.423, "parallelism_efficiency": 0.253}, {"name": "gemma-3-27b-it", "overall_score": 0.582, "valid_tool_name_rate": 98.4, "schema_compliance": 81.6, "execution_success": 85.5, "task_fulfillment": 0.396, "information_grounding": 0.495, "tool_appropriateness": 0.588, "parameter_accuracy": 0.530, "dependency_awareness": 0.408, "parallelism_efficiency": 0.251}, {"name": "llama-3-3-70b-instruct", "overall_score": 0.558, "valid_tool_name_rate": 99.5, "schema_compliance": 93.1, "execution_success": 91.5, "task_fulfillment": 0.366, "information_grounding": 0.476, "tool_appropriateness": 0.554, "parameter_accuracy": 0.486, "dependency_awareness": 0.359, "parallelism_efficiency": 0.244}, {"name": "gpt-4o-mini", "overall_score": 0.557, "valid_tool_name_rate": 95.5, "schema_compliance": 86.5, "execution_success": 84.0, "task_fulfillment": 0.426, "information_grounding": 0.453, "tool_appropriateness": 0.556, "parameter_accuracy": 0.499, "dependency_awareness": 0.359, "parallelism_efficiency": 0.230}, {"name": "mistral-small-2503", "overall_score": 0.530, "valid_tool_name_rate": 92.0, "schema_compliance": 95.6, "execution_success": 87.2, "task_fulfillment": 0.344, "information_grounding": 0.438, "tool_appropriateness": 0.528, "parameter_accuracy": 0.462, "dependency_awareness": 0.345, "parallelism_efficiency": 0.220}, {"name": "llama-3-1-70b-instruct", "overall_score": 0.510, "valid_tool_name_rate": 99.2, "schema_compliance": 90.5, "execution_success": 92.5, "task_fulfillment": 0.314, "information_grounding": 0.432, "tool_appropriateness": 0.523, "parameter_accuracy": 0.433, "dependency_awareness": 0.303, "parallelism_efficiency": 0.190}, {"name": "nova-micro-v1", "overall_score": 0.508, "valid_tool_name_rate": 96.0, "schema_compliance": 93.1, "execution_success": 87.8, "task_fulfillment": 0.339, "information_grounding": 0.419, "tool_appropriateness": 0.504, "parameter_accuracy": 0.428, "dependency_awareness": 0.315, "parallelism_efficiency": 0.212}, {"name": "llama-3-2-90b-vision-instruct", "overall_score": 0.495, "valid_tool_name_rate": 99.6, "schema_compliance": 85.0, "execution_success": 90.9, "task_fulfillment": 0.293, "information_grounding": 0.444, "tool_appropriateness": 0.515, "parameter_accuracy": 0.427, "dependency_awareness": 0.267, "parallelism_efficiency": 0.173}, {"name": "llama-3-1-8b-instruct", "overall_score": 0.428, "valid_tool_name_rate": 96.1, "schema_compliance": 89.4, "execution_success": 90.9, "task_fulfillment": 0.261, "information_grounding": 0.295, "tool_appropriateness": 0.352, "parameter_accuracy": 0.310, "dependency_awareness": 0.221, "parallelism_efficiency": 0.141} ] # Sort by overall score descending models.sort(key=lambda x: x['overall_score'], reverse=True) rows = [] for model in models: row = f''' {model['name']} {model['overall_score']:.3f} {model['valid_tool_name_rate']:.1f}% {model['schema_compliance']:.1f}% {model['execution_success']:.1f}% {model['task_fulfillment']:.3f} {model['information_grounding']:.3f} {model['tool_appropriateness']:.3f} {model['parameter_accuracy']:.3f} {model['dependency_awareness']:.3f} {model['parallelism_efficiency']:.3f} ''' rows.append(row) return '\n'.join(rows) def create_gradio_app(): """ Gradio app to serve the static HTML leaderboard with embedded images This is required for Hugging Face Spaces deployment """ # Read the HTML content with open('index.html', 'r', encoding='utf-8') as f: html_content = f.read() # Read the CSS content with open('style.css', 'r', encoding='utf-8') as f: css_content = f.read() # Convert images to base64 for embedding diagram_b64 = encode_image_to_base64('mcp-bench.png') ranking_b64 = encode_image_to_base64('ranking.png') # Replace image references with base64 embedded versions html_content = html_content.replace( 'src="mcp-bench.png"', f'src="{diagram_b64}"' ).replace( 'src="ranking.png"', f'src="{ranking_b64}"' ) # Generate static table HTML table_html = generate_table_html() # Replace the empty tbody with pre-generated content combined_html = html_content.replace( '\n \n ', f'{table_html}' ).replace( '', f'' ) # The HTML already has the minimal JavaScript for citation copy and date update # Create the Gradio interface with gr.Blocks( title="MCP-Bench Leaderboard", theme=gr.themes.Soft(), css=""" .gradio-container { padding: 0 !important; } #leaderboard-container { width: 100% !important; max-width: none !important; margin: 0 !important; padding: 0 !important; } #leaderboard-container * { box-sizing: border-box; } /* Force all buttons to have same blue color and remove underlines */ #leaderboard-container .paper-link { color: white !important; background-color: #4285F4 !important; text-decoration: none !important; } #leaderboard-container .paper-link:hover { color: white !important; background-color: #3367D6 !important; text-decoration: none !important; } #leaderboard-container .paper-link:focus, #leaderboard-container .paper-link:visited, #leaderboard-container .paper-link:active { color: white !important; background-color: #4285F4 !important; text-decoration: none !important; } /* Fix font issues for authors */ #leaderboard-container .paper-authors { font-family: Inter, -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important; } /* Remove any underlines from links globally */ #leaderboard-container a { text-decoration: none !important; } """ ) as demo: gr.HTML( combined_html, elem_id="leaderboard-container" ) return demo if __name__ == "__main__": demo = create_gradio_app() demo.launch()