Spaces:
Sleeping
Sleeping
| """ViTeX-Bench public leaderboard. | |
| Three tabs: | |
| * Leaderboard β full thirteen-metric vector for every approved method, | |
| rendered as styled HTML with TextScore highlighted as the | |
| explicit sort key. | |
| * Submit β upload eval.json from a successful `bash scripts/run_benchmark.sh` | |
| run; saved as a pending entry awaiting maintainer review. | |
| * Admin β passphrase-gated panel listing pending submissions with | |
| approve / reject actions. | |
| State lives in `submissions.jsonl` inside this Space repo. Each line is one | |
| JSON object; `status` is `pending` / `approved` / `rejected`. Pre-populated | |
| with the paper baselines (status=approved). Owner-only writes via HF_TOKEN | |
| (set as a Space secret). | |
| """ | |
| import html as _html | |
| import json | |
| import math | |
| import os | |
| import time | |
| from typing import Dict, List, Optional, Tuple | |
| import gradio as gr | |
| import pandas as pd | |
| from huggingface_hub import HfApi, hf_hub_download | |
| from huggingface_hub.utils import HfHubHTTPError | |
# Space repo that hosts both this app and the submissions store.
REPO_ID = "ViTeX-Bench/ViTeX-Bench-Leaderboard"
# JSONL state file: one submission object per line.
SUBMISSIONS_FILE = "submissions.jsonl"
# Space secrets (optional). HF_TOKEN enables persistent writes back to the
# repo; ADMIN_PASSPHRASE gates the admin panel and instant-publish path.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
ADMIN_PASSPHRASE = os.environ.get("ADMIN_PASSPHRASE", "")
# Authenticated client when HF_TOKEN is set; anonymous read-only otherwise.
api = HfApi(token=HF_TOKEN) if HF_TOKEN else HfApi()
# The thirteen reported metrics, in leaderboard column order: text
# correctness, temporal/visual quality (full-frame and text-crop scope),
# and edit locality.
METRIC_KEYS = [
    "SeqAcc", "CharAcc", "TTS",
    "Flicker_full", "Flicker_crop", "Warp_full", "Warp_crop",
    "MUSIQ_full", "MUSIQ_crop",
    "PSNR_loc", "SSIM_loc", "LPIPS_loc", "DreamSim_loc",
]
# The three primitives whose geometric mean forms TextScore.
TEXT_KEYS = ["SeqAcc", "CharAcc", "TTS"]
| def _text_score(seq, char, tts): | |
| if seq is None or char is None or tts is None: | |
| return None | |
| if seq <= 0.0 or char <= 0.0 or tts <= 0.0: | |
| return 0.0 | |
| return math.exp((math.log(seq) + math.log(char) + math.log(tts)) / 3.0) | |
# ---------- I/O ----------
def fetch_submissions() -> List[Dict]:
    """Read submissions.jsonl from the local checkout or the Space repo.

    The local working-tree copy wins; the hub copy is only consulted when
    the local file is absent or unreadable. Returns an empty list on any
    error so the leaderboard can still render instead of breaking the
    whole Gradio app.
    """
    def _load_jsonl(path: str) -> List[Dict]:
        # One JSON object per non-blank line. Explicit UTF-8: the file is
        # written with ensure_ascii=False, so the platform default encoding
        # must not be trusted to decode it.
        with open(path, encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]

    try:
        if os.path.exists(SUBMISSIONS_FILE):
            return _load_jsonl(SUBMISSIONS_FILE)
    except Exception as e:
        # Fall through to the hub copy on a corrupt/unreadable local file.
        print(f"[fetch_submissions] local read failed: {e}", flush=True)
    try:
        path = hf_hub_download(REPO_ID, SUBMISSIONS_FILE, repo_type="space",
                               token=HF_TOKEN or None)
        return _load_jsonl(path)
    except HfHubHTTPError as e:
        print(f"[fetch_submissions] hub download HTTP error: {e}", flush=True)
        return []
    except Exception as e:
        print(f"[fetch_submissions] hub download failed: {e}", flush=True)
        return []
def write_submissions(items: List[Dict]) -> None:
    """Persist submissions.jsonl locally and, when HF_TOKEN is configured,
    commit it to the Space repo so it survives restarts.

    Raises whatever the local write or hub upload raises; callers wrap
    this in try/except and surface the error to the user.
    """
    content = "\n".join(json.dumps(x, ensure_ascii=False) for x in items) + "\n"
    # Explicit UTF-8: ensure_ascii=False keeps non-ASCII method/org names
    # verbatim, and a locale-dependent default encoding (e.g. an ASCII C
    # locale in a container) would raise UnicodeEncodeError on them.
    with open(SUBMISSIONS_FILE, "w", encoding="utf-8") as f:
        f.write(content)
    if HF_TOKEN:
        api.upload_file(
            path_or_fileobj=content.encode(),
            path_in_repo=SUBMISSIONS_FILE,
            repo_id=REPO_ID,
            repo_type="space",
            commit_message=f"Update submissions ({len(items)} entries)",
        )
# ---------- HTML leaderboard rendering ----------
# NOTE(review): many 'β' / 'π' / 'Β·' characters in the string literals of
# this file look like mojibake of the original emoji / arrow glyphs from a
# lossy re-encoding -- confirm against the deployed Space before editing
# them; they are preserved byte-for-byte here.
LEADERBOARD_CSS = """
<style>
/* Palette: Stripe-inspired (navy #0a2540 / slate text #425466 / pale #f6f9fc).
TextScore column = deep navy primary; the three text-correctness primitives
that feed it carry a soft tint; remaining detail metrics are quiet slate. */
/* Borderless wrap. The outer .vbench-wrap is now a transparent
pass-through with no border, radius, or shadow; the inner
.vbench-scroll handles the actual scrolling. */
.vbench-wrap { width: 100%; background: transparent; }
.vbench-scroll {
  width: 100%;
  overflow-x: auto;
  overflow-y: auto;
  max-height: 640px;
}
.vbench-tbl {
  font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont,
    "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
  border-collapse: separate; border-spacing: 0;
  width: max-content; min-width: 100%;
  font-size: 13px; line-height: 1.4; color: #425466;
}
.vbench-tbl thead th {
  background: #f6f9fc; color: #425466;
  font-weight: 500; font-size: 11px; letter-spacing: 0.04em;
  text-transform: uppercase;
  padding: 11px 12px;
  border-bottom: 1px solid #e3e8ee;
  position: sticky; top: 0; z-index: 5; white-space: nowrap;
}
.vbench-tbl thead th.sortable { cursor: pointer; user-select: none; }
.vbench-tbl thead th.sortable:hover { background: #eef2f8; color: #0a2540; }
.vbench-tbl thead th.sorted-active {
  background: #eef2f8; color: #0a2540;
  box-shadow: inset 0 -2px 0 #0a2540;
}
.vbench-tbl thead th.score.sorted-active {
  background: #1e3a5f; box-shadow: inset 0 -2px 0 #f6f9fc;
}
.vbench-tbl td.cell-good { background: #ebf6ee !important; }
.vbench-tbl td.cell-bad { background: #fbedef !important; }
.vbench-tbl tbody tr:hover td.cell-good { background: #ddeee2 !important; }
.vbench-tbl tbody tr:hover td.cell-bad { background: #f4dde1 !important; }
.vbench-tbl td {
  padding: 9px 12px; border-bottom: 1px solid #f1f4f8;
  text-align: right; white-space: nowrap;
  font-variant-numeric: tabular-nums;
}
.vbench-tbl tbody tr:last-child td { border-bottom: none; }
.vbench-tbl tbody tr:hover td { background: #fafbfc; }
.vbench-tbl td.left { text-align: left; }
.vbench-tbl td.rank {
  color: #a3acb9; font-weight: 500; font-size: 12px;
  text-align: center; width: 38px;
}
.vbench-tbl td.method {
  color: #0a2540; font-weight: 600; font-size: 13px;
  min-width: 170px;
}
.vbench-tbl td.org {
  color: #697386; font-size: 12px; max-width: 200px;
  overflow: hidden; text-overflow: ellipsis;
}
/* TextScore column: deep navy, dominant β this is the sort key */
.vbench-tbl th.score {
  background: #0a2540; color: #f6f9fc;
  font-weight: 600; font-size: 11px;
  border-bottom: 1px solid #0a2540;
}
.vbench-tbl td.score {
  background: #0a2540; color: #fff;
  font-weight: 700; font-size: 14px;
  letter-spacing: -0.01em;
}
.vbench-tbl tbody tr:hover td.score { background: #1e3a5f; }
/* Text-correctness primitives: the three feeding TextScore β soft tint */
.vbench-tbl th.text {
  background: #eef2f8; color: #0a2540;
  font-weight: 600;
}
.vbench-tbl td.text {
  background: #f6f9fc; color: #0a2540;
  font-weight: 600; font-size: 13px;
}
.vbench-tbl tbody tr:hover td.text { background: #eef2f8; }
/* Detail metrics: subtle */
.vbench-tbl td.metric { color: #425466; }
.vbench-tbl .num0 { color: #c1c9d2; }
/* Family pastel badges */
.vbench-tbl .badge {
  display: inline-block; padding: 2px 8px;
  font-size: 10.5px; font-weight: 500;
  border-radius: 999px; letter-spacing: 0.02em;
}
.vbench-tbl .badge.A { background: #cff5d1; color: #0a3514; }
.vbench-tbl .badge.B { background: #fff5cc; color: #7e3503; }
.vbench-tbl .badge.C { background: #ffd1d8; color: #7d122c; }
.vbench-tbl .badge.D { background: #dadcff; color: #322a87; }
.vbench-tbl .badge.Ref { background: #e6e0ff; color: #3b1f7a; }
.vbench-tbl .badge.Other { background: #e3e8ee; color: #0a2540; }
.vbench-tbl .badge.Admin { background: #0a2540; color: #f6f9fc; }
.vbench-tbl .badge.User { background: #635bff; color: #fff; }
.vbench-tbl .links a {
  text-decoration: none; margin-right: 6px; font-size: 13px;
  opacity: 0.65; transition: opacity 0.15s;
}
.vbench-tbl .links a:hover { opacity: 1; }
.vbench-legend {
  font-size: 11.5px; color: #697386;
  padding: 14px 4px 0; line-height: 1.65;
}
.vbench-legend code {
  background: #f6f9fc; color: #425466;
  padding: 1px 6px; border-radius: 4px; font-size: 11px;
  border: 1px solid #e3e8ee;
}
.vbench-legend b { color: #0a2540; font-weight: 600; }
.vbench-foot, .vbench-foot p {
  font-size: 12px; color: #697386; margin-top: 6px;
}
.vbench-foot code {
  background: #f6f9fc; color: #425466;
  padding: 1px 5px; border-radius: 4px; font-size: 11px;
  border: 1px solid #e3e8ee;
}
</style>
"""
# Global JS injected via Blocks(js=...). It hooks every .vbench-tbl table
# in the DOM (immediately + via MutationObserver as Gradio re-renders the
# HTML component) and attaches click-to-sort + above/below-mean shading.
# Inline <script> tags inside gr.HTML are NOT executed by Gradio's React
# renderer, which is why we route through Blocks(js=...).
# NOTE(review): dirFor() below tests the same 'β' character for both the
# 'up' and 'down' branches -- presumably these were two distinct arrow
# glyphs before the encoding was mangled; verify against the live Space.
SORT_JS = r"""
() => {
  function cellNum(cell) {
    if (!cell) return null;
    const txt = (cell.textContent || '').replace(/[β \s]/g, '');
    if (txt === '' || txt === 'β') return null;
    const n = Number(txt);
    return isNaN(n) ? null : n;
  }
  function cellTxt(cell) {
    return (cell && cell.textContent || '').replace(/\s+/g, ' ').trim();
  }
  function dirFor(th) {
    const t = th.textContent || '';
    if (t.indexOf('β') >= 0) return 'up';
    if (t.indexOf('β') >= 0) return 'down';
    return null;
  }
  function applyMeanShading(table) {
    const tbody = table.querySelector('tbody');
    const headers = Array.from(table.querySelectorAll('thead th'));
    const rows = Array.from(tbody.querySelectorAll('tr'));
    rows.forEach(r => Array.from(r.cells).forEach(c => {
      c.classList.remove('cell-good', 'cell-bad');
    }));
    headers.forEach((th, col) => {
      const dir = dirFor(th);
      if (!dir) return;
      const values = [];
      rows.forEach(r => {
        const cell = r.children[col];
        if (!cell || cell.classList.contains('score')) return;
        const n = cellNum(cell);
        if (n !== null) values.push(n);
      });
      if (values.length < 2) return;
      const mean = values.reduce((a, b) => a + b, 0) / values.length;
      rows.forEach(r => {
        const cell = r.children[col];
        if (!cell || cell.classList.contains('score')) return;
        const n = cellNum(cell);
        if (n === null) return;
        const better = (dir === 'up') ? n > mean : n < mean;
        cell.classList.add(better ? 'cell-good' : 'cell-bad');
      });
    });
  }
  function attachSortAndShade(table) {
    if (table.dataset.vbenchAttached === '1') return;
    table.dataset.vbenchAttached = '1';
    const tbody = table.querySelector('tbody');
    const headers = Array.from(table.querySelectorAll('thead th'));
    function sortBy(idx, dir) {
      const rows = Array.from(tbody.querySelectorAll('tr'));
      rows.sort((ra, rb) => {
        const va = cellNum(ra.children[idx]);
        const vb = cellNum(rb.children[idx]);
        if (va !== null && vb !== null) return (va - vb) * dir;
        if (va !== null) return -1;
        if (vb !== null) return 1;
        return cellTxt(ra.children[idx]).localeCompare(cellTxt(rb.children[idx])) * dir;
      });
      rows.forEach((row, k) => {
        if (row.children[0] && row.children[0].classList.contains('rank')) {
          row.children[0].textContent = String(k + 1);
        }
        tbody.appendChild(row);
      });
      headers.forEach(h => h.classList.remove('sorted-active'));
      headers[idx].classList.add('sorted-active');
    }
    let lastIdx = -1, lastDir = 0;
    headers.forEach((th, i) => {
      if (i === 0) return;
      th.classList.add('sortable');
      th.addEventListener('click', () => {
        const txt = th.textContent || '';
        const dir = (i === lastIdx) ? -lastDir : (txt.indexOf('β') >= 0 ? +1 : -1);
        lastIdx = i; lastDir = dir;
        sortBy(i, dir);
      });
    });
    applyMeanShading(table);
  }
  function rescan() {
    document.querySelectorAll('table.vbench-tbl').forEach(t => {
      // Re-shade on every rescan so newly injected rows pick up colours
      // even when the table-level click handlers were already attached.
      attachSortAndShade(t);
      applyMeanShading(t);
    });
  }
  rescan();
  // Gradio re-renders the gr.HTML component on every refresh; watch the
  // body for subtree mutations and re-scan whenever a vbench-tbl appears
  // (or is replaced) anywhere in the page.
  const obs = new MutationObserver(() => rescan());
  obs.observe(document.body, { childList: true, subtree: true });
}
"""
| def _fmt(v, fmt=".4f", dim_zero=True) -> str: | |
| if v is None: | |
| return '<span style="color:#9ca3af">β</span>' | |
| try: | |
| f = float(v) | |
| except Exception: | |
| return _html.escape(str(v)) | |
| s = format(f, fmt) | |
| if dim_zero and f == 0.0: | |
| return f'<span class="num0">{s}</span>' | |
| return s | |
| def _family_badge(fam: str) -> str: | |
| raw = (fam or "β").strip() | |
| if raw.startswith("A"): | |
| cls, label = "A", "A" | |
| elif raw.startswith("B"): | |
| cls, label = "B", "B" | |
| elif raw.startswith("C"): | |
| cls, label = "C", "C" | |
| elif raw.startswith("D"): | |
| cls, label = "D", "D" | |
| elif raw.lower().startswith("ref"): | |
| cls, label = "Ref", "Ref" | |
| else: | |
| cls, label = "Other", raw[:5] or "Other" | |
| return f'<span class="badge {cls}" title="{_html.escape(raw)}">{_html.escape(label)}</span>' | |
| def _links(r: Dict) -> str: | |
| out = [] | |
| if r.get("paper_url"): | |
| out.append(f'<a href="{_html.escape(r["paper_url"])}" target="_blank" title="Paper">π</a>') | |
| if r.get("code_url"): | |
| out.append(f'<a href="{_html.escape(r["code_url"])}" target="_blank" title="Code">π</a>') | |
| return "".join(out) | |
def _record_text_score(r: Dict) -> Optional[float]:
    """Prefer the stored TextScore; fall back to recomputing it from the
    three primitives so older (or malformed) submissions render correctly.

    Returns None only when neither the stored score nor the primitives
    yield a usable value.
    """
    stored = r.get("TextScore")
    if stored is not None:
        try:
            return float(stored)
        except (TypeError, ValueError):
            # Previously a malformed stored value returned None outright,
            # hiding a score that the primitives could still supply; fall
            # through to the recomputation instead.
            pass
    return _text_score(r.get("SeqAcc"), r.get("CharAcc"), r.get("TTS"))
def render_leaderboard_html() -> str:
    """Render the public leaderboard, degrading to an inline error message
    instead of letting an exception take down the whole Gradio app."""
    try:
        return _render_leaderboard_html_inner()
    except Exception as e:
        print(f"[render_leaderboard_html] failed: {e}", flush=True)
        detail = _html.escape(str(e))
        return (LEADERBOARD_CSS
                + f"<p style='padding:16px;color:#b91c1c'>Could not render the "
                + f"leaderboard: {detail}</p>")
def _render_leaderboard_html_inner() -> str:
    """Build the styled leaderboard table for every approved submission.

    Rows are ranked by TextScore descending (rows with no computable score
    sink to the bottom), tie-broken by CharAcc then TTS descending. The
    returned string bundles the stylesheet, the table, and the legend.
    """
    rows = [r for r in fetch_submissions() if r.get("status") == "approved"]

    def _rank_key(r: Dict):
        # Compute the score once per row; the previous key expression
        # invoked _record_text_score twice per row for no benefit.
        ts = _record_text_score(r)
        return (
            -(ts if ts is not None else -1.0),
            -(r.get("CharAcc") or 0.0),
            -(r.get("TTS") or 0.0),
        )

    rows.sort(key=_rank_key)
    if not rows:
        return LEADERBOARD_CSS + "<p style='padding:16px;color:#6b7280'>No approved submissions yet.</p>"
    lines = [LEADERBOARD_CSS,
             '<div class="vbench-wrap"><div class="vbench-scroll">'
             '<table class="vbench-tbl">']
    lines.append(
        "<thead><tr>"
        '<th class="rank">#</th>'
        '<th class="left">Method</th>'
        '<th class="left">Authors / Org</th>'
        '<th class="left">Src</th>'
        '<th class="score">TextScoreβ</th>'
        '<th class="text">SeqAccβ</th><th class="text">CharAccβ</th><th class="text">TTSβ</th>'
        "<th>Flk_fβ</th><th>Flk_cβ</th><th>Wp_fβ</th><th>Wp_cβ</th>"
        "<th>MUSIQ_fβ</th><th>MUSIQ_cβ</th>"
        "<th>PSNRβ</th><th>SSIMβ</th><th>LPIPSβ</th><th>DSimβ</th>"
        '<th class="left">Links</th>'
        '<th class="left">Fam</th>'
        "</tr></thead><tbody>"
    )
    for i, r in enumerate(rows, 1):
        method = _html.escape(r.get("method", "β"))
        fam = _family_badge(r.get("family", "β"))
        org = _html.escape(r.get("organization", "β") or "β")
        note = r.get("note", "")
        # A stored note renders as a hoverable marker next to the method name.
        method_with_note = (
            f'{method} <span title="{_html.escape(note)}" style="color:#635bff;font-weight:600;cursor:help">β </span>'
            if note else method
        )
        ts = _record_text_score(r)
        submitter = (r.get("submitter") or "user").strip().lower()
        src_label = "Admin" if submitter == "admin" else "User"
        src_class = "Admin" if submitter == "admin" else "User"
        src_badge = f'<span class="badge {src_class}">{src_label}</span>'
        lines.append("<tr>")
        lines.append(f'<td class="rank">{i}</td>')
        lines.append(f'<td class="left method">{method_with_note}</td>')
        lines.append(f'<td class="left org" title="{org}">{org}</td>')
        lines.append(f'<td class="left">{src_badge}</td>')
        lines.append(f'<td class="score">{_fmt(ts, ".4f")}</td>')
        # (metric key, format spec, dim-zero flag, css class) per cell.
        for k, fmt, dim, cls in [
            ("SeqAcc", ".3f", True, "text"),
            ("CharAcc", ".3f", True, "text"),
            ("TTS", ".3f", True, "text"),
            ("Flicker_full", ".2f", False, "metric"),
            ("Flicker_crop", ".2f", False, "metric"),
            ("Warp_full", ".2f", False, "metric"),
            ("Warp_crop", ".2f", False, "metric"),
            ("MUSIQ_full", ".2f", False, "metric"),
            ("MUSIQ_crop", ".2f", False, "metric"),
            ("PSNR_loc", ".2f", False, "metric"),
            ("SSIM_loc", ".3f", False, "metric"),
            ("LPIPS_loc", ".3f", False, "metric"),
            ("DreamSim_loc", ".4f", False, "metric"),
        ]:
            lines.append(f'<td class="{cls}">{_fmt(r.get(k), fmt, dim)}</td>')
        lines.append(f'<td class="left links">{_links(r)}</td>')
        lines.append(f'<td class="left family">{fam}</td>')
        lines.append("</tr>")
    lines.append("</tbody></table></div></div>")
    lines.append(
        '<div class="vbench-legend">'
        "Ranked by <b>TextScore</b>, the geometric mean of the three text-correctness primitives. "
        "Click any column header to re-sort by that metric. "
        "β higher-better, β lower-better. "
        "<code>Flk</code> = Flicker, <code>Wp</code> = Warp, <code>DSim</code> = DreamSim; "
        "subscripts <code>f</code> / <code>c</code> = full-frame / text-crop scope. "
        "A β next to a method name marks a published caveat (hover for details)."
        "</div>"
    )
    return "\n".join(lines)
# ---------- Submit / admin handlers ----------
def parse_eval_json(file_obj) -> Tuple[Optional[Dict], Optional[str]]:
    """Extract the 13 metric means + TextScore from an eval.json produced
    by `benchmark/evaluate.py`.

    Returns (metrics, None) on success, or (None, error_message) on any
    validation failure so the caller can surface a readable message
    instead of a traceback.
    """
    if file_obj is None:
        return None, "No file uploaded."
    try:
        # Gradio may hand us a tempfile-like object or a bare path string.
        path = file_obj.name if hasattr(file_obj, "name") else file_obj
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        return None, f"Could not parse JSON: {e}"
    agg = data.get("aggregate")
    if not isinstance(agg, dict):
        return None, "eval.json is missing the top-level `aggregate` field."
    metrics = {}
    for k in METRIC_KEYS:
        v = agg.get(k)
        # Accept either {"mean": x} objects or bare numbers.
        m = v.get("mean") if isinstance(v, dict) else v
        if m is None:
            return None, f"aggregate.{k}.mean is missing or null."
        try:
            metrics[k] = float(m)
        except (TypeError, ValueError):
            # Previously a non-numeric value raised and crashed the handler.
            return None, f"aggregate.{k}.mean is not numeric: {m!r}"
    score_obj = agg.get("TextScore") or {}
    s = score_obj.get("mean") if isinstance(score_obj, dict) else score_obj
    if s is None:
        # Older eval.json files predate the stored TextScore; recompute it.
        s = _text_score(*[metrics[k] for k in TEXT_KEYS])
    try:
        metrics["TextScore"] = float(s) if s is not None else 0.0
    except (TypeError, ValueError):
        return None, f"aggregate.TextScore.mean is not numeric: {s!r}"
    metrics["n_clips"] = (
        len(data["per_clip"]) if isinstance(data.get("per_clip"), dict) else None
    )
    metrics["submitter"] = "user"
    return metrics, None
def submit_handler(method, organization, family, paper_url, code_url,
                   contact_email, file_obj, admin_passphrase):
    """Handle a Submit-tab upload; returns (status_markdown, leaderboard_html).

    User submissions enter the review queue as status='pending'; a correct
    admin passphrase publishes immediately as status='approved' with the
    Admin source badge. Every return pairs the status message with a fresh
    leaderboard render so the Leaderboard tab stays in sync.
    """
    # Required metadata first, then the metrics file itself.
    if not method or not method.strip():
        return "β Method name is required.", render_leaderboard_html()
    if not organization or not organization.strip():
        return "β Organization / author(s) is required.", render_leaderboard_html()
    metrics, err = parse_eval_json(file_obj)
    if err:
        return f"β {err}", render_leaderboard_html()
    # If a non-empty admin passphrase is provided, validate it: matching
    # the secret promotes the submission to submitter='admin' + status=
    # 'approved' (no review queue). Any mismatch is rejected outright,
    # rather than silently downgrading to a user submission, so a typo
    # cannot accidentally enter the public table as Admin.
    is_admin = False
    pass_attempt = (admin_passphrase or "").strip()
    if pass_attempt:
        if not (ADMIN_PASSPHRASE or "").strip():
            return ("β ADMIN_PASSPHRASE is not configured on this Space. "
                    "Leave the admin passphrase field blank to submit as a user.",
                    render_leaderboard_html())
        if pass_attempt != (ADMIN_PASSPHRASE or "").strip():
            return ("β Admin passphrase does not match. "
                    "Leave the field blank to submit as a user.",
                    render_leaderboard_html())
        is_admin = True
    now = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())
    entry = {
        "method": method.strip(),
        "organization": organization.strip(),
        "family": (family or "").strip() or "Other",
        "paper_url": (paper_url or "").strip(),
        "code_url": (code_url or "").strip(),
        "contact_email": (contact_email or "").strip(),
        "submitted_at": now,
        "status": "approved" if is_admin else "pending",
        **metrics,
    }
    if is_admin:
        entry["submitter"] = "admin"
        entry["approved_at"] = now
    # parse_eval_json defaults submitter='user' when not overridden above.
    items = fetch_submissions()
    items.append(entry)
    try:
        write_submissions(items)
    except Exception as e:
        return f"β Could not save submission: {e}", render_leaderboard_html()
    if is_admin:
        return (
            f"β Admin submission published. {entry['method']} added with "
            f"TextScore = {entry['TextScore']:.4f} (n_clips = {entry.get('n_clips')}).",
            render_leaderboard_html(),
        )
    return (
        f"β Submitted for review. TextScore = {entry['TextScore']:.4f} "
        f"(n_clips = {entry.get('n_clips')}). The maintainers will review it shortly.",
        render_leaderboard_html(),
    )
def _check_passphrase(passphrase: str):
    """Return (ok: bool, message: str) for an admin passphrase attempt.

    Distinguishes an unset Space secret from a mistyped value so the
    admin knows which one to fix. Both sides are stripped of surrounding
    whitespace because copy-pasting from a password manager often picks
    up a stray newline.
    """
    secret = (ADMIN_PASSPHRASE or "").strip()
    if not secret:
        return False, ("β ADMIN_PASSPHRASE is not configured on this Space. "
                       "Set it in **Space β Settings β Variables and secrets** "
                       "and restart the Space.")
    if (passphrase or "").strip() != secret:
        return False, "β Wrong passphrase."
    return True, ""
def submission_diagnostics():
    """Status summary readable WITHOUT a passphrase: counts per status and
    whether the Space secrets are configured. Lets the maintainer confirm
    their submission landed before unlocking the admin actions."""
    items = fetch_submissions()
    # Single pass over the entries instead of three separate scans.
    counts = {"pending": 0, "approved": 0, "rejected": 0}
    for entry in items:
        status = entry.get("status")
        if status in counts:
            counts[status] += 1
    pass_set = bool((ADMIN_PASSPHRASE or "").strip())
    token_set = bool((HF_TOKEN or "").strip())
    pass_state = 'β set' if pass_set else 'β not set'
    token_state = ('β set' if token_set
                   else 'β not set (submissions stay in-memory only and disappear on Space restart)')
    return (
        f"**Submissions** β {len(items)} total Β· "
        f"{counts['pending']} pending Β· {counts['approved']} approved Β· "
        f"{counts['rejected']} rejected.\n\n"
        f"**Space secrets** β `ADMIN_PASSPHRASE`: {pass_state} Β· "
        f"`HF_TOKEN`: {token_state}."
    )
def admin_view(passphrase: str):
    """Show every entry (pending / approved / rejected) with its absolute
    index so the admin operates on stable row numbers regardless of
    status. Returns (dataframe, status_markdown)."""
    ok, msg = _check_passphrase(passphrase)
    if not ok:
        return pd.DataFrame(), msg
    items = fetch_submissions()
    if not items:
        return pd.DataFrame(), "π No submissions yet."
    table = [
        {
            "#": i,
            "Method": r.get("method", "β"),
            "Status": r.get("status", "β"),
            "Source": (r.get("submitter") or "β"),
            "TextScore": _record_text_score(r),
            "Family": r.get("family", "β"),
            "Organization": r.get("organization", "β"),
            "Email": r.get("contact_email", ""),
            "Submitted": r.get("submitted_at", ""),
        }
        for i, r in enumerate(items)
    ]
    n_pending = sum(r.get("status") == "pending" for r in items)
    summary = (
        f"π {len(items)} entry(ies) β {n_pending} pending. "
        "Use the absolute index in the # column for every action below."
    )
    return pd.DataFrame(table), summary
def admin_act(passphrase: str, idx: int, action: str):
    """Approve or reject the entry at absolute index `idx`.

    Sets `status` to `action` and stamps `{action}_at`; returns
    (status_markdown, leaderboard_html)."""
    ok, msg = _check_passphrase(passphrase)
    if not ok:
        return msg, render_leaderboard_html()
    items = fetch_submissions()
    in_range = idx is not None and 0 <= int(idx) < len(items)
    if not in_range:
        return f"β Index {idx} out of range ({len(items)} entries).", render_leaderboard_html()
    target = int(idx)
    entry = items[target]
    entry["status"] = action
    entry[f"{action}_at"] = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime())
    try:
        write_submissions(items)
    except Exception as e:
        return f"β Could not save: {e}", render_leaderboard_html()
    return f"β Entry #{target} ({entry.get('method', 'β')}) β {action}.", render_leaderboard_html()
def _admin_action_response(passphrase: str, msg: str):
    """Return the (msg, leaderboard_html, df, diag_md) tuple every admin
    action emits β keeps the dataframe and the public diagnostics block
    in sync with the leaderboard after every approve / reject / delete."""
    table, _ignored = admin_view(passphrase)
    return msg, render_leaderboard_html(), table, submission_diagnostics()
def admin_approve(passphrase: str, idx: int):
    """Approve entry `idx`, then refresh every admin-tab widget."""
    message, _discarded_html = admin_act(passphrase, idx, "approved")
    return _admin_action_response(passphrase, message)
def admin_reject(passphrase: str, idx: int):
    """Reject entry `idx`, then refresh every admin-tab widget."""
    message, _discarded_html = admin_act(passphrase, idx, "rejected")
    return _admin_action_response(passphrase, message)
def admin_delete(passphrase: str, idx: int):
    """Permanently remove the entry at absolute index `idx` and refresh
    every admin-tab widget."""
    ok, msg = _check_passphrase(passphrase)
    if not ok:
        return _admin_action_response(passphrase, msg)
    items = fetch_submissions()
    in_range = idx is not None and 0 <= int(idx) < len(items)
    if not in_range:
        return _admin_action_response(
            passphrase, f"β Index {idx} out of range ({len(items)} entries).")
    pos = int(idx)
    removed = items.pop(pos)
    try:
        write_submissions(items)
    except Exception as e:
        return _admin_action_response(passphrase, f"β Could not save: {e}")
    return _admin_action_response(
        passphrase, f"ποΈ Deleted #{pos} ({removed.get('method', 'β')}).")
# ---------- UI ----------
# Markdown masthead shown above the tabs (project / dataset / code links).
HEADER_MD = (
    "# π ViTeX-Bench Leaderboard\n\n"
    "π [Project page](https://vitex-bench.github.io/) Β· "
    "π [Dataset](https://huggingface.co/datasets/ViTeX-Bench/ViTeX-Dataset) Β· "
    "π§ͺ [Benchmark code](https://huggingface.co/ViTeX-Bench/ViTeX-Bench) Β· "
    "π€ [Model & Inference code](https://huggingface.co/ViTeX-Bench/ViTeX-Edit-14B) Β· "
    "π **Leaderboard**\n\n"
    "Public ranking for **video scene text editing** under the 13-metric ViTeX-Bench "
    "protocol. Methods are ranked by **TextScore** = β(SeqAcc Β· CharAcc Β· TTS), "
    "the geometric mean of the three text-correctness primitives; the full thirteen-"
    "metric vector is shown alongside it."
)
# Soft Gradio theme: Inter for body text, JetBrains Mono for code spans.
THEME = gr.themes.Soft(
    font=[gr.themes.GoogleFont("Inter"),
          "ui-sans-serif", "system-ui", "-apple-system",
          "Segoe UI", "Roboto", "Helvetica Neue", "Arial", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"),
               "ui-monospace", "SFMono-Regular", "Menlo", "Consolas", "monospace"],
)
# Top-level Gradio app. SORT_JS is injected via Blocks(js=...) because
# inline <script> tags inside gr.HTML are not executed by Gradio's
# renderer (see the comment above SORT_JS).
with gr.Blocks(title="ViTeX-Bench Leaderboard",
               theme=THEME,
               js=SORT_JS) as demo:
    gr.Markdown(HEADER_MD)
    # Tab 1: read-only leaderboard with a manual refresh button.
    with gr.Tab("π Leaderboard"):
        leaderboard_html = gr.HTML(value=render_leaderboard_html(),
                                   container=False)
        refresh_btn = gr.Button("π Refresh", variant="primary", size="sm")
        refresh_btn.click(fn=render_leaderboard_html, outputs=leaderboard_html)
    # Tab 2: submission form (eval.json upload + method metadata).
    with gr.Tab("π Submit"):
        gr.Markdown(
            "Upload the `eval.json` produced by "
            "`bash scripts/run_benchmark.sh <method>` in the "
            "[Benchmark code repo](https://huggingface.co/ViTeX-Bench/ViTeX-Bench). "
            "All 13 metrics and TextScore are read directly from the JSON; "
            "fill the method metadata below."
        )
        with gr.Row():
            method_in = gr.Textbox(label="Method name *",
                                   placeholder="e.g., MyVideoTextEditor v2")
            family_in = gr.Dropdown(
                label="Family",
                choices=["A β per-frame image editor",
                         "B β first-frame + I2V propagation",
                         "C β mask-conditioned video inpainting",
                         "D β instruction-guided V2V",
                         "Reference",
                         "Other"],
                value="Other",
            )
        org_in = gr.Textbox(label="Organization / author(s) *",
                            placeholder="e.g., Acme Research / J. Doe et al.")
        with gr.Row():
            paper_in = gr.Textbox(label="Paper URL (optional)",
                                  placeholder="https://arxiv.org/abs/...")
            code_in = gr.Textbox(label="Code URL (optional)",
                                 placeholder="https://github.com/...")
        email_in = gr.Textbox(label="Contact email (optional, kept private)",
                              placeholder="you@example.com")
        file_in = gr.File(label="eval.json", file_types=[".json"])
        # Optional instant-publish path; validated inside submit_handler.
        admin_pass_submit = gr.Textbox(
            label="Admin passphrase (optional)", type="password",
            placeholder="Leave blank to submit as a user (entry will be reviewed before appearing).",
        )
        submit_btn = gr.Button("Submit", variant="primary")
        result_out = gr.Markdown()
        gr.Markdown(
            "If the **admin passphrase** field is filled with the correct value, "
            "the entry is published immediately with the **Admin** source badge "
            "and skips the review queue. Otherwise the submission is queued for "
            "the maintainers to approve.",
            elem_classes=["vbench-foot"],
        )
        submit_btn.click(
            submit_handler,
            inputs=[method_in, org_in, family_in, paper_in, code_in,
                    email_in, file_in, admin_pass_submit],
            outputs=[result_out, leaderboard_html],
        )
    # Tab 3: passphrase-gated admin actions over the submission queue.
    with gr.Tab("π Admin"):
        # Diagnostics are visible without a passphrase (counts + secrets).
        diag_md = gr.Markdown(value=submission_diagnostics())
        with gr.Row():
            admin_pass = gr.Textbox(label="Passphrase", type="password",
                                    scale=3, container=True)
            view_btn = gr.Button("π View entries", variant="primary",
                                 scale=1, min_width=160)
        all_df = gr.Dataframe(interactive=False, wrap=True,
                              headers=["#", "Method", "Status", "Source",
                                       "TextScore", "Family", "Organization",
                                       "Email", "Submitted"])
        with gr.Row():
            # Actions address entries by the absolute index in the # column.
            idx_in = gr.Number(label="Row index (#)", value=0, precision=0,
                               scale=1, min_width=120)
            approve_btn = gr.Button("β Approve", variant="primary", scale=1)
            reject_btn = gr.Button("β Reject", scale=1)
            delete_btn = gr.Button("ποΈ Delete", variant="stop", scale=1)
        action_msg = gr.Markdown()
        gr.Markdown(
            "Configure `ADMIN_PASSPHRASE` (auth) and `HF_TOKEN` "
            "(persistence) in **Space β Settings β Variables and secrets**.",
            elem_classes=["vbench-foot"],
        )
        view_btn.click(admin_view,
                       inputs=admin_pass, outputs=[all_df, action_msg])
        # Every action refreshes the message, leaderboard, table, and
        # diagnostics together (see _admin_action_response).
        approve_btn.click(
            admin_approve,
            inputs=[admin_pass, idx_in],
            outputs=[action_msg, leaderboard_html, all_df, diag_md],
        )
        reject_btn.click(
            admin_reject,
            inputs=[admin_pass, idx_in],
            outputs=[action_msg, leaderboard_html, all_df, diag_md],
        )
        delete_btn.click(
            admin_delete,
            inputs=[admin_pass, idx_in],
            outputs=[action_msg, leaderboard_html, all_df, diag_md],
        )
    # Tab 4: static protocol description.
    with gr.Tab("βΉοΈ About"):
        gr.Markdown(
            "**ViTeX-Bench** evaluates video scene text editing on a frozen "
            "157-clip test split sourced from Panda-70M and InternVid, "
            "across three orthogonal axes:\n\n"
            "- **Text correctness** β SeqAcc, CharAcc, TTS via PP-OCRv5 "
            "with per-clip language routing and substring edit distance.\n"
            "- **Visual quality** β Flicker, Warp (RAFT-flow), MUSIQ; "
            "each at full-frame and text-crop scope.\n"
            "- **Edit locality** β PSNR, SSIM, LPIPS on a locality-only "
            "prediction, plus DreamSim for VAE-noise-robust similarity.\n\n"
            "Ranking key **TextScore** = β(SeqAcc Β· CharAcc Β· TTS). "
            "Each primitive is natively in [0, 1]; SeqAcc = 0 collapses "
            "TextScore to zero β the intended semantics for methods that "
            "never produce the requested target string. The full thirteen-"
            "metric vector remains the unit of report.\n\n"
            "Full protocol, normalization rules, and per-axis weights live "
            "in [`docs/PROTOCOL.md`](https://huggingface.co/ViTeX-Bench/"
            "ViTeX-Bench/blob/main/docs/PROTOCOL.md) on the Benchmark code "
            "repo.\n\n"
            "Anonymous release under double-blind review at NeurIPS 2026 "
            "Datasets and Benchmarks Track. Author list and DOI updated "
            "after deanonymization."
        )
# Enable the request queue so long-running handlers don't block each other.
demo.queue()
if __name__ == "__main__":
    demo.launch()