| """ | |
| Data loading utilities for the leaderboard. | |
| Loads data from HuggingFace dataset and integrates provider logos. | |
| """ | |
| import json | |
| import os | |
| import pandas as pd | |
| from datasets import load_dataset | |

def load_provider_logos():
    """
    Load provider logos from data/provider_logos.json

    Returns:
        dict: Provider name -> logo URL mapping
    """
    logos_path = os.path.join(
        os.path.dirname(__file__), "..", "data", "provider_logos.json"
    )
    try:
        with open(logos_path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: Provider logos file not found at {logos_path}")
        return {}
    except json.JSONDecodeError as e:
        print(f"Warning: Could not parse provider logos JSON: {e}")
        return {}
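
# Illustrative shape of data/provider_logos.json -- the function above only
# assumes a flat name -> URL mapping; these provider names and URLs are
# hypothetical examples, not the real file contents:
#
#   {
#       "OpenAI": "https://example.com/logos/openai.png",
#       "Anthropic": "https://example.com/logos/anthropic.png"
#   }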

def format_params(param_billions):
    """
    Format parameter count for display.

    Args:
        param_billions: Parameter count in billions (float or None)

    Returns:
        str: Formatted parameter string (e.g., "72.7B", "Unknown")
    """
    if param_billions is None or pd.isna(param_billions):
        return "Unknown"
    # Show fewer decimal places as the count grows
    if param_billions >= 100:
        return f"{param_billions:.0f}B"
    elif param_billions >= 10:
        return f"{param_billions:.1f}B"
    else:
        return f"{param_billions:.2f}B"

def load_leaderboard_data():
    """
    Load leaderboard data from the HuggingFace dataset.

    Returns:
        pandas.DataFrame: Complete leaderboard data with:
            - All model metadata
            - All benchmark scores
            - Provider logos
            - Formatted parameters
    """
    print("Loading leaderboard data from HuggingFace dataset...")

    # Load dataset from HF
    try:
        ds = load_dataset("OpenEvals/leaderboard-data", split="train")
        df = ds.to_pandas()
        print(f"✓ Loaded {len(df)} models from dataset")
    except Exception as e:
        print(f"✗ Error loading dataset: {e}")
        raise

    # Load provider logos
    logos = load_provider_logos()
    print(f"✓ Loaded {len(logos)} provider logos")

    # Add logo URLs to dataframe (providers without a logo map to NaN)
    df["logo_url"] = df["provider"].map(logos)

    # Format parameters for display
    df["parameters_display"] = df["parameters_billions"].apply(format_params)

    # Sort by model name by default
    df = df.sort_values("model_name").reset_index(drop=True)

    print(f"✓ Data loaded successfully: {len(df)} models, {df.columns.size} columns")
    return df
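
# Minimal usage sketch -- assumes network access to the HuggingFace Hub and
# that the dataset exposes the columns referenced above ("provider",
# "parameters_billions", "model_name") plus the benchmark score columns:
#
#   df = load_leaderboard_data()
#   top = df.nlargest(5, "gsm8k_score")[["model_name", "gsm8k_score"]]
#   print(top)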

def get_benchmark_columns():
    """
    Get list of all benchmark score column names.

    Returns:
        list: Column names for benchmark scores
    """
    return [
        "gsm8k_score",
        "mmluPro_score",
        "gpqa_score",
        "hle_score",
        "olmOcr_score",
        "sweVerified_score",
        "swePro_score",
        "aime2026_score",
        "terminalBench_score",
        "evasionBench_score",
        "hmmt2026_score",
    ]
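
# Each entry above is "<key>_score" for a key in get_benchmark_info(), so the
# two functions must stay in sync. A quick consistency check:
#
#   assert set(get_benchmark_columns()) == {
#       f"{key}_score" for key in get_benchmark_info()
#   }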

def get_benchmark_info():
    """
    Get metadata about each benchmark.

    Returns:
        dict: Benchmark key -> metadata mapping
    """
    return {
        "gsm8k": {
            "name": "GSM8K",
            "full_name": "Grade School Math 8K",
            "category": "math",
            "color": "#7c3aed",
            "url": "https://huggingface.co/datasets/openai/gsm8k",
        },
| "mmluPro": { | |
| "name": "MMLU-Pro", | |
| "full_name": "Massive Multi-task Language Understanding Pro", | |
| "category": "knowledge", | |
| "color": "#2563eb", | |
| "url": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro", | |
| }, | |
| "gpqa": { | |
| "name": "GPQA◆", | |
| "full_name": "PhD-level Expert Questions", | |
| "category": "knowledge", | |
| "color": "#2563eb", | |
| "url": "https://huggingface.co/datasets/Idavidrein/gpqa", | |
| }, | |
| "hle": { | |
| "name": "HLE", | |
| "full_name": "Humanity's Last Exam", | |
| "category": "knowledge", | |
| "color": "#2563eb", | |
| "url": "https://lastexam.ai", | |
| }, | |
| "olmOcr": { | |
| "name": "olmOCR", | |
| "full_name": "OCR Evaluation Benchmark", | |
| "category": "vision", | |
| "color": "#db2777", | |
| "url": "https://huggingface.co/datasets/allenai/olmOCR-bench", | |
| }, | |
| "sweVerified": { | |
| "name": "SWE-V", | |
| "full_name": "SWE-bench Verified", | |
| "category": "coding", | |
| "color": "#059669", | |
| "url": "https://www.swebench.com", | |
| }, | |
| "swePro": { | |
| "name": "SWE-Pro", | |
| "full_name": "SWE-bench Pro", | |
| "category": "coding", | |
| "color": "#059669", | |
| "url": "https://scale.com/leaderboard/swe_bench_pro_public", | |
| }, | |
| "aime2026": { | |
| "name": "AIME 2026", | |
| "full_name": "American Invitational Mathematics Examination 2026", | |
| "category": "math", | |
| "color": "#7c3aed", | |
| "url": "https://matharena.ai/?comp=aime--aime_2026", | |
| }, | |
| "terminalBench": { | |
| "name": "TB 2.0", | |
| "full_name": "Terminal-Bench 2.0", | |
| "category": "agent", | |
| "color": "#0d9488", | |
| "url": "https://www.tbench.ai/leaderboard/terminal-bench/2.0", | |
| }, | |
| "evasionBench": { | |
| "name": "EvasionB", | |
| "full_name": "EvasionBench", | |
| "category": "language", | |
| "color": "#ea580c", | |
| "url": "https://huggingface.co/datasets/FutureMa/EvasionBench", | |
| }, | |
| "hmmt2026": { | |
| "name": "HMMT", | |
| "full_name": "Harvard-MIT Mathematics Tournament Feb 2026", | |
| "category": "math", | |
| "color": "#7c3aed", | |
| "url": "https://matharena.ai/?comp=hmmt--hmmt_feb_2026", | |
| }, | |
| } | |
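
if __name__ == "__main__":
    # Lightweight smoke test (needs network access to HuggingFace; exercises
    # only functions defined in this module, with no assumed extras).
    df = load_leaderboard_data()
    print(df[["model_name", "provider", "parameters_display"]].head())
    missing = [c for c in get_benchmark_columns() if c not in df.columns]
    print(f"Benchmark columns missing from dataset: {missing or 'none'}")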