benchmarks

Running

App Files Files Community

elevow committed 11 days ago

Commit

3824ea2

verified ·

1 Parent(s): 1d58c43

Update update_data.py

Browse files

Files changed (1) hide show

update_data.py +143 -7

update_data.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# ///  script
 # requires-python = ">=3.11"
 # dependencies = [
 #     "httpx",
@@ -7,18 +7,30 @@
 # ///
 """
 Regenerate data.json and upload to the elevow/benchmarks Space.
 Source template: duplicated from davanstrien/benchmark-race
 https://huggingface.co/spaces/elevow/benchmarks
 Run locally (from repo root or this folder):
     export HF_TOKEN=hf_...
     uv run scripts/elevow-benchmarks/update_data.py
 Or copy this file to your Space repo root on Hugging Face and run there.
 Schedule on HF Jobs (example — point to YOUR raw file):
     hf jobs scheduled uv run "0 8,20 * * *" \\
         --secrets HF_TOKEN \\
         https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py
 """
 from __future__ import annotations
 import json
 import os
 import re
@@ -27,16 +39,20 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 import httpx
 from huggingface_hub import HfApi
 # Upload target: your fork (was davanstrien/benchmark-race in upstream).
 SPACE_REPO = os.environ.get("BENCHMARK_SPACE_REPO", "elevow/benchmarks")
 ALIGNED_LOGO_URL = (
     "https://www.google.com/s2/favicons?sz=128&domain_url="
     "https%3A%2F%2Ftryaligned.ai"
 )
 ALIGNED_LOGOS_KEY = "AlignedAI"
 ALIGNED_COLOR = "#059669"
 # Full HF model_id strings from leaderboard APIs — add any row that should show Aligned branding.
 MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(
     {
@@ -44,6 +60,61 @@ MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(
         # "Qwen/Qwen2.5-Coder-32B-Instruct",
     }
 )
 BENCHMARK_CONFIGS = [
     {"dataset": "SWE-bench/SWE-bench_Verified", "key": "sweVerified", "name": "SWE-bench Verified", "gated": False},
     {"dataset": "ScaleAI/SWE-bench_Pro", "key": "swePro", "name": "SWE-bench Pro", "gated": False},
@@ -56,25 +127,58 @@ BENCHMARK_CONFIGS = [
     {"dataset": "harborframework/terminal-bench-2.0", "key": "terminalBench", "name": "Terminal-Bench 2.0", "gated": False},
     {"dataset": "FutureMa/EvasionBench", "key": "evasionBench", "name": "EvasionBench", "gated": False},
 ]
 PALETTE = [
     "#6366f1", "#0d9488", "#d97706", "#e11d48", "#7c3aed",
     "#16a34a", "#2563eb", "#ea580c", "#8b5cf6", "#0891b2",
     "#c026d3", "#65a30d", "#dc2626", "#0284c7", "#a21caf",
     "#059669", "#9333ea", "#ca8a04", "#be185d", "#0369a1",
 ]
 def inject_aligned_race_branding(
     benchmarks: dict[str, Any],
     logos: dict[str, str],
     color_map: dict[str, str],
-) -> None:
-    """Add Aligned logo URL, optional per-model race_logo_key, and bar color."""
     logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
     color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR
     for _key, bm in benchmarks.items():
         for m in bm.get("models") or []:
             mid = m.get("model_id") or ""
-            if mid in MODEL_IDS_USE_ALIGNED_LOGO:
                 m["race_logo_key"] = ALIGNED_LOGOS_KEY
 def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
     url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
     headers = {}
@@ -83,6 +187,7 @@ def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
     elif config["gated"]:
         print(f"  {config['name']}: skipped (gated, no token)")
         return []
     print(f"  {config['name']}: fetching scores...")
     try:
         resp = httpx.get(url, headers=headers, timeout=30)
@@ -95,6 +200,7 @@ def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
     except Exception as e:
         print(f"    error: {e}")
         return []
     seen: dict[str, float] = {}
     for entry in data:
         model_id = entry.get("modelId")
@@ -103,11 +209,15 @@ def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
             score = float(score)
             if model_id not in seen or score > seen[model_id]:
                 seen[model_id] = score
     print(f"    {len(seen)} models")
     return [{"model_id": mid, "score": s} for mid, s in seen.items()]
 def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, dict]:
     api = HfApi()
     results: dict[str, dict] = {}
     def _get_info(mid: str):
         try:
             info = api.model_info(mid, token=hf_token)
@@ -121,13 +231,17 @@ def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, d
             return mid, info.created_at.strftime("%Y-%m-%d"), params_b
         except Exception:
             return mid, None, None
     with ThreadPoolExecutor(max_workers=8) as pool:
         futures = {pool.submit(_get_info, mid): mid for mid in model_ids}
         for f in as_completed(futures):
             mid, date, params = f.result()
             if date:
                 results[mid] = {"date": date, "parameters_b": params}
     return results
 def fetch_logo(provider: str) -> str | None:
     try:
         resp = httpx.get(
@@ -139,6 +253,8 @@ def fetch_logo(provider: str) -> str | None:
     except Exception:
         pass
     return None
 def fetch_all_logos(providers: set[str]) -> dict[str, str]:
     logos: dict[str, str] = {}
     with ThreadPoolExecutor(max_workers=8) as pool:
@@ -149,22 +265,29 @@ def fetch_all_logos(providers: set[str]) -> dict[str, str]:
             if url:
                 logos[p] = url
     return logos
 def main() -> None:
     hf_token = os.environ.get("HF_TOKEN")
     print(f"Generating data.json → upload to {SPACE_REPO}\n")
     all_scores: dict[str, dict] = {}
     all_model_ids: set[str] = set()
     for config in BENCHMARK_CONFIGS:
         rows = fetch_leaderboard(config, hf_token)
         if rows:
             all_scores[config["key"]] = {"name": config["name"], "rows": rows}
             all_model_ids.update(r["model_id"] for r in rows)
     print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")
     print("Fetching model dates...")
     model_dates = fetch_model_dates(list(all_model_ids), hf_token)
     print(f"  got dates for {len(model_dates)}/{len(all_model_ids)} models")
     all_providers: set[str] = set()
     benchmarks: dict[str, Any] = {}
     for key, info in all_scores.items():
         models: list[dict] = []
         for row in info["rows"]:
@@ -183,29 +306,40 @@ def main() -> None:
             })
         if models:
             benchmarks[key] = {"name": info["name"], "models": models}
     print(f"\nFetching logos for {len(all_providers)} providers...")
     logos = fetch_all_logos(all_providers)
     print(f"  got {len(logos)} logos")
     color_map: dict[str, str] = {}
     for i, provider in enumerate(sorted(all_providers)):
         color_map[provider] = PALETTE[i % len(PALETTE)]
-    inject_aligned_race_branding(benchmarks, logos, color_map)
-    print(f"  injected {ALIGNED_LOGOS_KEY} logo + color; race_logo_key on {len(MODEL_IDS_USE_ALIGNED_LOGO)} id(s) configured")
     output = {
         "benchmarks": benchmarks,
         "logos": logos,
         "colors": color_map,
         "generated_at": datetime.now(timezone.utc).isoformat(),
     }
     data_json = json.dumps(output, indent=2)
     print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
     for key, bm in benchmarks.items():
         print(f"  {bm['name']}: {len(bm['models'])} models")
     print(f"\nUploading data.json to {SPACE_REPO}...")
     api = HfApi()
     with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
         f.write(data_json)
         tmp_path = f.name
     try:
         api.upload_file(
             path_or_fileobj=tmp_path,
@@ -217,5 +351,7 @@ def main() -> None:
         print("Done!")
     finally:
         Path(tmp_path).unlink(missing_ok=True)
 if __name__ == "__main__":
-    main()

+# /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #     "httpx",
 # ///
 """
 Regenerate data.json and upload to the elevow/benchmarks Space.
 Source template: duplicated from davanstrien/benchmark-race
 https://huggingface.co/spaces/elevow/benchmarks
+**Single file:** All Aligned race branding, axis relabeling, optional org-groq tagging, and
+offline ``patch_output_dict`` live here (no separate inject script).
+Populate ``MODEL_IDS_ALIGNED_AXIS_LABEL`` with full HF ``model_id`` strings (as leaderboards
+return them) to show **Aligned AI — {lane} · …** on race bar labels via rewritten ``short_name``.
 Run locally (from repo root or this folder):
     export HF_TOKEN=hf_...
     uv run scripts/elevow-benchmarks/update_data.py
 Or copy this file to your Space repo root on Hugging Face and run there.
 Schedule on HF Jobs (example — point to YOUR raw file):
     hf jobs scheduled uv run "0 8,20 * * *" \\
         --secrets HF_TOKEN \\
         https://huggingface.co/spaces/elevow/benchmarks/resolve/main/update_data.py
 """
 from __future__ import annotations
 import json
 import os
 import re
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 import httpx
 from huggingface_hub import HfApi
 # Upload target: your fork (was davanstrien/benchmark-race in upstream).
 SPACE_REPO = os.environ.get("BENCHMARK_SPACE_REPO", "elevow/benchmarks")
 ALIGNED_LOGO_URL = (
     "https://www.google.com/s2/favicons?sz=128&domain_url="
     "https%3A%2F%2Ftryaligned.ai"
 )
 ALIGNED_LOGOS_KEY = "AlignedAI"
 ALIGNED_COLOR = "#059669"
 # Full HF model_id strings from leaderboard APIs — add any row that should show Aligned branding.
 MODEL_IDS_USE_ALIGNED_LOGO: frozenset[str] = frozenset(
     {
         # "Qwen/Qwen2.5-Coder-32B-Instruct",
     }
 )
+# HF benchmark-race charts label bars with `short_name`. For models you treat as Groq-hosted
+# Aligned references, rewrite that field to "Aligned AI — {lane} · {checkpoint}" (same lanes as
+# client GMCQ charts). Stock Space UI ignores `race_logo_key` unless you fork index.html; it
+# always uses `short_name` for the bar text.
+MODEL_IDS_ALIGNED_AXIS_LABEL: frozenset[str] = frozenset(
+    {
+        # Same strings as leaderboards return, e.g.:
+        # "meta-llama/Llama-3.3-70B-Instruct",
+        # "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+    }
+)
+# If True, tag every row whose HF org is literally "groq" with race_logo_key (rare on leaderboards).
+USE_ALIGNED_FOR_ORG_GROQ = False
+# Copy-paste example if you add a synthetic Aligned row by hand (ensure logos/colors cover provider).
+SYNTHETIC_ALIGNED_ROW_EXAMPLE = r"""
+# After building `models` for one benchmark, you may append:
+# models.append({
+#     "model_id": "tryaligned/Aligned-AI",
+#     "short_name": "Aligned-AI",
+#     "provider": "tryaligned",
+#     "score": 0.0,
+#     "date": "2026-01-01",
+#     "race_logo_key": "AlignedAI",
+# })
+# Then ensure logos["AlignedAI"] is set and colors include "tryaligned".
+"""
def aligned_groq_lane_for_model_id(model_id: str) -> str:
    """Pick the Aligned lane name for a Hugging Face ``model_id``.

    Intended to match the client's ``alignedGroqLaneForRawModel`` heuristics.
    Matching is a case-insensitive substring search over the whole id, tried
    in priority order: "scout" -> "Vision", "coder" -> "Code", a Llama 3.1 8B
    checkpoint -> "Fast", anything else -> "Reasoning".
    """
    lowered = model_id.lower()
    # The first needle that matches wins, so tuple order encodes priority.
    for needle, lane in (("scout", "Vision"), ("coder", "Code")):
        if needle in lowered:
            return lane
    is_small_llama = "llama-3.1" in lowered and "8b" in lowered
    return "Fast" if is_small_llama else "Reasoning"
def aligned_axis_label_from_model_id(model_id: str) -> str:
    """Build the bar label that is written to a row's ``short_name``.

    The label has the form ``"Aligned AI — {lane} · {checkpoint}"``. The
    checkpoint is the part of ``model_id`` after the last ``/`` with dashes
    and underscores turned into spaces and whitespace collapsed. Checkpoints
    longer than 20 characters are cut to 18 characters plus an ellipsis, and
    the finished label is capped at 45 characters.

    Whitespace left at a cut point is stripped before the ellipsis is added,
    so a cut that lands on a word boundary gives "Llama 4 Scout 17B…" rather
    than "Llama 4 Scout 17B …". An empty checkpoint yields just
    ``"Aligned AI — {lane}"`` instead of a dangling separator.
    """
    slug = model_id.split("/")[-1].replace("-", " ").replace("_", " ")
    slug = re.sub(r"\s+", " ", slug).strip()
    if len(slug) > 20:
        slug = f"{slug[:18].rstrip()}…"
    lane = aligned_groq_lane_for_model_id(model_id)
    if not slug:
        # Nothing to show after the separator; avoid a trailing " · ".
        return f"Aligned AI — {lane}"
    label = f"Aligned AI — {lane} · {slug}"
    if len(label) > 45:
        # Safety net in case lane names grow; current lanes keep labels shorter.
        label = f"{label[:43].rstrip()}…"
    return label
 BENCHMARK_CONFIGS = [
     {"dataset": "SWE-bench/SWE-bench_Verified", "key": "sweVerified", "name": "SWE-bench Verified", "gated": False},
     {"dataset": "ScaleAI/SWE-bench_Pro", "key": "swePro", "name": "SWE-bench Pro", "gated": False},
     {"dataset": "harborframework/terminal-bench-2.0", "key": "terminalBench", "name": "Terminal-Bench 2.0", "gated": False},
     {"dataset": "FutureMa/EvasionBench", "key": "evasionBench", "name": "EvasionBench", "gated": False},
 ]
 PALETTE = [
     "#6366f1", "#0d9488", "#d97706", "#e11d48", "#7c3aed",
     "#16a34a", "#2563eb", "#ea580c", "#8b5cf6", "#0891b2",
     "#c026d3", "#65a30d", "#dc2626", "#0284c7", "#a21caf",
     "#059669", "#9333ea", "#ca8a04", "#be185d", "#0369a1",
 ]
 def inject_aligned_race_branding(
     benchmarks: dict[str, Any],
     logos: dict[str, str],
     color_map: dict[str, str],
 ) -> tuple[int, int]:
     """Add the Aligned logo, bar color, per-row logo tags, and axis labels.

     All three arguments are mutated in place.

     Args:
         benchmarks: Mapping of benchmark key to a dict whose ``"models"``
             entry is a list of row dicts; a missing or empty list is skipped.
         logos: Logo URL mapping; gains the ``ALIGNED_LOGOS_KEY`` entry.
         color_map: Color mapping; gains the ``ALIGNED_LOGOS_KEY`` entry.

     Returns:
         ``(logo_tag_count, axis_relabel_count)`` for logging.

     Relabeling is idempotent: when a row's ``short_name`` already equals the
     computed Aligned label (data that was branded earlier, e.g. re-patching
     an uploaded data.json), ``chart_full_name`` is left alone instead of
     being rebuilt from the already rewritten label.
     """
     logos[ALIGNED_LOGOS_KEY] = ALIGNED_LOGO_URL
     color_map[ALIGNED_LOGOS_KEY] = ALIGNED_COLOR
     logo_n = 0
     axis_n = 0
     for bm in benchmarks.values():
         for m in bm.get("models") or []:
             mid = m.get("model_id") or ""
             # str.split returns [mid] when there is no "/", so no guard is needed.
             provider = mid.split("/")[0]
             use_logo = mid in MODEL_IDS_USE_ALIGNED_LOGO
             use_axis = mid in MODEL_IDS_ALIGNED_AXIS_LABEL
             use_groq_org = USE_ALIGNED_FOR_ORG_GROQ and provider.lower() == "groq"
             if use_logo or use_axis or use_groq_org:
                 m["race_logo_key"] = ALIGNED_LOGOS_KEY
                 logo_n += 1
             if use_axis:
                 new_label = aligned_axis_label_from_model_id(mid)
                 current_sn = m.get("short_name")
                 if current_sn != new_label:
                     # First time this row is relabeled: keep the published
                     # name before short_name is overwritten.
                     orig_sn = current_sn or mid.split("/")[-1]
                     m["chart_full_name"] = f"Published HF model: {orig_sn.replace('-', ' ')}"
                 m["short_name"] = new_label
                 axis_n += 1
     return logo_n, axis_n
def patch_output_dict(output: dict[str, Any]) -> dict[str, Any]:
    """Return a branded deep copy of a loaded data.json dict.

    ``output`` itself is not modified: it is cloned through a JSON round trip
    (so it must be JSON-serializable) and the Aligned branding is applied to
    the clone, which is returned.

    A missing or null ``"benchmarks"`` entry is treated as empty. ``"logos"``
    and ``"colors"`` are created when missing, and replaced with an empty dict
    when present but not a dict (for example ``null`` in the file), so the
    injection step always receives mappings it can write to.
    """
    out = json.loads(json.dumps(output))
    benchmarks = out.get("benchmarks") or {}
    # setdefault would hand back a present-but-null value unchanged, which
    # cannot take item assignment; normalize both sections first.
    for section in ("logos", "colors"):
        if not isinstance(out.get(section), dict):
            out[section] = {}
    inject_aligned_race_branding(benchmarks, out["logos"], out["colors"])
    return out
 def fetch_leaderboard(config: dict, hf_token: str | None) -> list[dict]:
     url = f"https://huggingface.co/api/datasets/{config['dataset']}/leaderboard"
     headers = {}
     elif config["gated"]:
         print(f"  {config['name']}: skipped (gated, no token)")
         return []
     print(f"  {config['name']}: fetching scores...")
     try:
         resp = httpx.get(url, headers=headers, timeout=30)
     except Exception as e:
         print(f"    error: {e}")
         return []
     seen: dict[str, float] = {}
     for entry in data:
         model_id = entry.get("modelId")
             score = float(score)
             if model_id not in seen or score > seen[model_id]:
                 seen[model_id] = score
     print(f"    {len(seen)} models")
     return [{"model_id": mid, "score": s} for mid, s in seen.items()]
 def fetch_model_dates(model_ids: list[str], hf_token: str | None) -> dict[str, dict]:
     api = HfApi()
     results: dict[str, dict] = {}
     def _get_info(mid: str):
         try:
             info = api.model_info(mid, token=hf_token)
             return mid, info.created_at.strftime("%Y-%m-%d"), params_b
         except Exception:
             return mid, None, None
     with ThreadPoolExecutor(max_workers=8) as pool:
         futures = {pool.submit(_get_info, mid): mid for mid in model_ids}
         for f in as_completed(futures):
             mid, date, params = f.result()
             if date:
                 results[mid] = {"date": date, "parameters_b": params}
     return results
 def fetch_logo(provider: str) -> str | None:
     try:
         resp = httpx.get(
     except Exception:
         pass
     return None
 def fetch_all_logos(providers: set[str]) -> dict[str, str]:
     logos: dict[str, str] = {}
     with ThreadPoolExecutor(max_workers=8) as pool:
             if url:
                 logos[p] = url
     return logos
 def main() -> None:
     hf_token = os.environ.get("HF_TOKEN")
     print(f"Generating data.json → upload to {SPACE_REPO}\n")
     all_scores: dict[str, dict] = {}
     all_model_ids: set[str] = set()
     for config in BENCHMARK_CONFIGS:
         rows = fetch_leaderboard(config, hf_token)
         if rows:
             all_scores[config["key"]] = {"name": config["name"], "rows": rows}
             all_model_ids.update(r["model_id"] for r in rows)
     print(f"\n{len(all_model_ids)} unique models across {len(all_scores)} benchmarks")
     print("Fetching model dates...")
     model_dates = fetch_model_dates(list(all_model_ids), hf_token)
     print(f"  got dates for {len(model_dates)}/{len(all_model_ids)} models")
     all_providers: set[str] = set()
     benchmarks: dict[str, Any] = {}
     for key, info in all_scores.items():
         models: list[dict] = []
         for row in info["rows"]:
             })
         if models:
             benchmarks[key] = {"name": info["name"], "models": models}
     print(f"\nFetching logos for {len(all_providers)} providers...")
     logos = fetch_all_logos(all_providers)
     print(f"  got {len(logos)} logos")
     color_map: dict[str, str] = {}
     for i, provider in enumerate(sorted(all_providers)):
         color_map[provider] = PALETTE[i % len(PALETTE)]
+    tagged, relabeled = inject_aligned_race_branding(benchmarks, logos, color_map)
+    print(
+        f"  injected {ALIGNED_LOGOS_KEY} logo + color; "
+        f"race_logo_key on {tagged} row(s); "
+        f"Aligned axis short_name on {relabeled} row(s)"
+    )
     output = {
         "benchmarks": benchmarks,
         "logos": logos,
         "colors": color_map,
         "generated_at": datetime.now(timezone.utc).isoformat(),
     }
     data_json = json.dumps(output, indent=2)
     print(f"\nGenerated {len(data_json) / 1024:.1f} KB")
     for key, bm in benchmarks.items():
         print(f"  {bm['name']}: {len(bm['models'])} models")
     print(f"\nUploading data.json to {SPACE_REPO}...")
     api = HfApi()
     with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False, encoding="utf-8") as f:
         f.write(data_json)
         tmp_path = f.name
     try:
         api.upload_file(
             path_or_fileobj=tmp_path,
         print("Done!")
     finally:
         Path(tmp_path).unlink(missing_ok=True)
 if __name__ == "__main__":
+    main()