| """BioDesignBench Leaderboard β Gradio App for HuggingFace Spaces |
| |
| Evaluating LLM Agents on Protein Design via MCP Tools |
| Romero Lab, Duke University |
| |
| Tabs: |
| 1. Overall Leaderboard |
| 2. Taxonomy Breakdown |
| 3. Component Analysis |
| 4. Benchmark vs User |
| 5. Submit (new submission form) |
| 6. Status & Admin (password-protected pipeline control) |
| 7. About |
| """ |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| import gradio as gr |
| import plotly.graph_objects as go |
|
|
# The fallback value is for local development only; set BDB_ADMIN_PASSWORD
# in the Space secrets for any public deployment.
ADMIN_PASSWORD = os.environ.get("BDB_ADMIN_PASSWORD", "biodesignbench2026")
|
|
|
|
| |
| |
| |
|
|
| PAPER_URL = "https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1" |
| GITHUB_URL = "https://github.com/RomeroLab/BioDesignBench" |
| HF_URL = "https://huggingface.co/spaces/RomeroLab-Duke/BioDesignBench-Leaderboard" |
| PYPI_URL = "https://pypi.org/project/protein-design-mcp/" |
|
|
|
|
| |
| |
| |
|
|
| APPROACHES = ["de_novo", "redesign"] |
| APPROACH_LABELS = { |
| "de_novo": "De Novo Design", |
| "redesign": "Redesign", |
| } |
| SUBJECTS = ["antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"] |
| SUBJECT_LABELS = { |
| "antibody": "Antibody", |
| "binder": "Binder", |
| "enzyme": "Enzyme", |
| "scaffold": "Scaffold", |
| "fluorescent_protein": "Fluorescent Prot.", |
| } |
| |
| VALID_CELLS = { |
| "de_novo": {"antibody", "binder", "enzyme", "scaffold", "fluorescent_protein"}, |
| "redesign": {"antibody", "enzyme", "scaffold", "fluorescent_protein"}, |
| } |
| N_TASKS_PER_CELL = { |
| ("de_novo", "antibody"): 4, |
| ("de_novo", "binder"): 19, |
| ("de_novo", "enzyme"): 2, |
| ("de_novo", "scaffold"): 21, |
| ("de_novo", "fluorescent_protein"): 1, |
| ("redesign", "antibody"): 5, |
| ("redesign", "enzyme"): 10, |
| ("redesign", "scaffold"): 4, |
| ("redesign", "fluorescent_protein"): 10, |
| } |
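# Cell counts sum to the 76-task benchmark: 47 de novo + 29 redesign.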
| COMPONENTS = [ |
| "approach", |
| "orchestration", |
| "quality", |
| "feasibility", |
| "novelty", |
| "diversity", |
| ] |
| COMP_MAX = { |
| "approach": 20, |
| "orchestration": 15, |
| "quality": 35, |
| "feasibility": 15, |
| "novelty": 5, |
| "diversity": 10, |
| } |
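# Component maxima sum to the 100-point hybrid rubric total.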
| TYPE_STYLE = { |
| "llm": {"icon": "", "bg": "#ffffff", "tag": ""}, |
| "hardcoded": {"icon": "\U0001f527", "bg": "#f0f0f0", "tag": "baseline"}, |
| "human_expert": { |
| "icon": "\U0001f468\u200d\U0001f52c", |
| "bg": "#ebf4ff", |
| "tag": "baseline", |
| }, |
| "human_oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"}, |
    # "oracle" is kept as an alias of "human_oracle" for older entries.
    "oracle": {"icon": "\U0001f4c4", "bg": "#fefcbf", "tag": "baseline"},
| } |
|
|
|
|
| |
| |
| |
|
|
|
|
def load_data() -> dict:
    """Load the static leaderboard payload bundled with this app."""
    path = Path(__file__).parent / "leaderboard_data.json"
    with open(path) as f:
        return json.load(f)
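
# Illustrative shape of leaderboard_data.json, inferred from the accessors
# in this module (a sketch, not a validated schema; values are invented):
#
# {
#   "last_updated": "2026-05-01",
#   "headline_findings": ["...", "..."],
#   "entries": [{
#     "agent_id": "example_guided",      # unique key used by the dropdowns
#     "agent_name": "Example Agent",
#     "organization": "Example Org",
#     "submission_type": "llm",          # or hardcoded / human_expert / human_oracle
#     "mode": "guided",                  # "unguided" | "guided" | null for baselines
#     "mcp_custom": false,
#     "overall_score": 50.0,
#     "component_scores": {"approach": 10.0, "orchestration": 8.0,
#                          "quality": 20.0, "feasibility": 9.0,
#                          "novelty": 2.0, "diversity": 5.0},
#     "taxonomy_scores": {"de_novo": {"binder": 48.0}, "redesign": {}},
#     "tasks_completed": 70, "tasks_total": 76, "tasks_with_zero": 3
#   }],
#   "interventions": {"description": "...", "n_tasks": 18, "rows": [...]}
# }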
|
|
|
|
| |
| |
| |
|
|
| CUSTOM_CSS = """ |
| .gradio-container { max-width: 1200px !important; } |
| .gr-padded { padding: 0 !important; } |
| |
| /* Force light appearance for all inline-styled HTML content */ |
| .dark .gradio-container { |
| --body-background-fill: #f7fafc !important; |
| --block-background-fill: #ffffff !important; |
| --body-text-color: #1a202c !important; |
| --block-label-text-color: #1a202c !important; |
| --input-background-fill: #ffffff !important; |
| --border-color-primary: #e2e8f0 !important; |
| --color-accent-soft: rgba(49,130,206,0.15) !important; |
| --neutral-50: #f7fafc !important; |
| --neutral-100: #edf2f7 !important; |
| --neutral-200: #e2e8f0 !important; |
| --neutral-700: #4a5568 !important; |
| --neutral-800: #2d3748 !important; |
| color: #1a202c !important; |
| background: #f7fafc !important; |
| } |
| .dark .tabs { background: #ffffff !important; } |
| .dark .tab-nav button { color: #2d3748 !important; } |
| .dark .tab-nav button.selected { |
| color: #0f172a !important; |
| border-color: #3182ce !important; |
| } |
| .dark .block { background: #ffffff !important; } |
| .dark label, .dark .label-wrap { color: #2d3748 !important; } |
| .dark input, .dark textarea, .dark select { |
| background: #ffffff !important; |
| color: #1a202c !important; |
| border-color: #e2e8f0 !important; |
| } |
| .dark .accordion { background: #ffffff !important; } |
| .dark .accordion > .label-wrap { color: #2d3748 !important; } |
| """ |
|
|
| |
# Strip Gradio's `dark` class on load (and for the first five seconds via a
# MutationObserver) so the inline light-styled HTML above stays readable for
# visitors whose system theme is dark.
FORCE_LIGHT_JS = """
| () => { |
| document.querySelector('body').classList.remove('dark'); |
| const obs = new MutationObserver(() => { |
| document.querySelector('body').classList.remove('dark'); |
| }); |
| obs.observe(document.body, {attributes: true, attributeFilter: ['class']}); |
| setTimeout(() => obs.disconnect(), 5000); |
| } |
| """ |
|
|
|
|
| |
| |
| |
|
|
|
|
| def _base_layout(**overrides) -> dict: |
| """Shared Plotly layout defaults, with per-chart overrides.""" |
| base = dict( |
| plot_bgcolor="white", |
| paper_bgcolor="white", |
| font=dict( |
| family="system-ui, -apple-system, sans-serif", size=12, color="#2d3748" |
| ), |
| margin=dict(l=40, r=20, t=50, b=40), |
| ) |
| base.update(overrides) |
| return base |
|
|
|
|
| |
| |
| |
|
|
|
|
| def build_header(last_updated: str, n_entries: int) -> str: |
| btn = ( |
| "display:inline-block;padding:0.45rem 1.1rem;border-radius:8px;" |
| "text-decoration:none;font-size:0.82rem;font-weight:600;" |
| "transition:opacity 0.15s" |
| ) |
| return f""" |
| <div style="background:#ffffff;border:1px solid #e2e8f0; |
| padding:2.2rem 2rem 1.8rem;text-align:center; |
| border-radius:16px;margin-bottom:0.8rem; |
| box-shadow:0 1px 4px rgba(0,0,0,0.04)"> |
| <p style="margin:0 0 0.3rem;font-size:0.75rem;font-weight:700; |
| letter-spacing:0.12em;text-transform:uppercase; |
| color:#3182ce">Romero Lab · Duke University</p> |
| <h1 style="font-size:2rem;margin:0;font-weight:800;color:#0f172a; |
| letter-spacing:-0.02em"> |
| \U0001f9ec BioDesignBench</h1> |
| <p style="color:#0f172a;margin:0.6rem 0 0.2rem;font-size:1.1rem; |
| font-weight:600;line-height:1.4"> |
| Can LLM agents orchestrate stochastic protein-design pipelines?</p> |
| <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem; |
| font-weight:400;font-style:italic;max-width:680px; |
| margin-left:auto;margin-right:auto;line-height:1.5"> |
| Top-tier agents now surpass a deterministic hardcoded pipeline — |
| but invoke evaluation tools at only <strong>14% of expert intensity</strong>. |
| Guidance closes the coverage gap, not the evaluation-depth gap.</p> |
| <div style="margin-top:1rem;display:flex;justify-content:center; |
| gap:0.6rem;flex-wrap:wrap"> |
| <a href="{PAPER_URL}" target="_blank" |
| style="{btn};background:#0f172a;color:#ffffff"> |
| \U0001f4c4 Paper</a> |
| <a href="{GITHUB_URL}" target="_blank" |
| style="{btn};background:#f1f5f9;color:#334155"> |
| \U0001f4bb GitHub</a> |
| <a href="{HF_URL}" target="_blank" |
| style="{btn};background:#f1f5f9;color:#334155"> |
| \U0001f917 HuggingFace</a> |
| <a href="{PYPI_URL}" target="_blank" |
| style="{btn};background:#f1f5f9;color:#334155"> |
| \U0001f4e6 PyPI</a> |
| </div> |
| <div style="margin-top:1rem;display:flex;justify-content:center; |
| gap:1.5rem;flex-wrap:wrap"> |
| <span style="font-size:0.78rem;color:#94a3b8"> |
| 76 tasks · 5 molecular families</span> |
| <span style="font-size:0.78rem;color:#94a3b8"> |
| 17 MCP tools</span> |
| <span style="font-size:0.78rem;color:#94a3b8"> |
| {n_entries} conditions</span> |
| <span style="font-size:0.78rem;color:#94a3b8"> |
| Updated {last_updated}</span> |
| </div> |
| </div>""" |
|
|
|
|
| |
|
|
|
|
| def _score_color(s: float) -> str: |
| if s >= 50: |
| return "#38a169" |
| if s >= 25: |
| return "#d69e2e" |
| return "#e53e3e" |
|
|
|
|
| def _bar_bg(s: float) -> str: |
| if s >= 50: |
| return "rgba(56,161,105,0.15)" |
| if s >= 25: |
| return "rgba(214,158,46,0.15)" |
| return "rgba(229,62,62,0.12)" |
|
|
|
|
| def _heat_color(val, max_val=95) -> str: |
| if val is None: |
| return "#f7fafc" |
| r = val / max_val |
| if r >= 0.7: |
| return f"rgba(56,161,105,{min(0.2 + r * 0.4, 0.8):.2f})" |
| if r >= 0.4: |
| return f"rgba(214,158,46,{min(0.2 + r * 0.4, 0.8):.2f})" |
| return f"rgba(229,62,62,{min(0.15 + r * 0.3, 0.6):.2f})" |
|
|
|
|
| |
|
|
|
|
| def build_leaderboard_table( |
| entries: list, mode_f: str, mcp_f: str, type_f: str |
| ) -> str: |
| """Generate the mixed-ranking HTML table with inline styles.""" |
| |
| filtered = [] |
| for e in entries: |
| st = e["submission_type"] |
| if mode_f != "All" and st == "llm": |
| if (e.get("mode") or "").lower() != mode_f.lower(): |
| continue |
| if mcp_f == "Reference" and e.get("mcp_custom"): |
| continue |
| if mcp_f == "Custom" and not e.get("mcp_custom"): |
| continue |
| if type_f == "LLM Only" and st != "llm": |
| continue |
| if type_f == "Baselines Only" and st == "llm": |
| continue |
| filtered.append(e) |
|
|
| filtered.sort(key=lambda x: x["overall_score"], reverse=True) |
|
|
| |
| TD = ( |
| "padding:0.65rem 1rem;border-bottom:1px solid #e2e8f0;" |
| "font-size:0.9rem" |
| ) |
| TH = ( |
| "background:#0f172a;color:white;padding:0.75rem 1rem;" |
| "text-align:left;font-size:0.75rem;text-transform:uppercase;" |
| "letter-spacing:0.05em;font-weight:600" |
| ) |
|
|
| rows = [] |
| llm_rank = 0 |
| for e in filtered: |
| st = e["submission_type"] |
| sty = TYPE_STYLE.get(st, TYPE_STYLE["llm"]) |
| is_bl = st != "llm" |
| sc = e["overall_score"] |
|
|
| |
| if is_bl: |
| rank = ( |
| f'<td style="{TD};text-align:center;font-size:1.1rem;' |
| f'width:50px">{sty["icon"]}</td>' |
| ) |
| else: |
| llm_rank += 1 |
| rcolor = {1: "#d69e2e", 2: "#a0aec0", 3: "#c17832"}.get( |
| llm_rank, "#0f172a" |
| ) |
| rsize = ( |
| "1.1rem" |
| if llm_rank == 1 |
| else ("1.05rem" if llm_rank <= 3 else "0.9rem") |
| ) |
| rank = ( |
| f'<td style="{TD};text-align:center;font-weight:700;' |
| f"color:{rcolor};font-size:{rsize};width:50px\">" |
| f"{llm_rank}</td>" |
| ) |
|
|
| |
| tag_html = "" |
| if sty["tag"]: |
| tag_html = ( |
| ' <span style="font-size:0.7rem;background:#e2e8f0;' |
| "padding:0.1rem 0.4rem;border-radius:3px;color:#4a5568;" |
| f'margin-left:0.3rem;vertical-align:middle">' |
| f'{sty["tag"]}</span>' |
| ) |
| icon_pfx = f'{sty["icon"]} ' if sty["icon"] else "" |
| fw = "600" if is_bl else "500" |
| name = ( |
| f'<td style="{TD};font-weight:{fw}">' |
| f'{icon_pfx}{e["agent_name"]}{tag_html}</td>' |
| ) |
|
|
| |
| org = f'<td style="{TD}">{e["organization"]}</td>' |
|
|
| |
| if is_bl: |
| mode = f'<td style="{TD};color:#718096">\u2014</td>' |
| elif e.get("mode") == "unguided": |
| mode = ( |
| f'<td style="{TD}"><span style="background:#fed7d7;' |
| "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;" |
| 'font-size:0.75rem;font-weight:600">unguided</span></td>' |
| ) |
| else: |
| mode = ( |
| f'<td style="{TD}"><span style="background:#c6f6d5;' |
| "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;" |
| 'font-size:0.75rem;font-weight:600">guided</span></td>' |
| ) |
|
|
| |
| if is_bl: |
| mcp = f'<td style="{TD};color:#718096">\u2014</td>' |
| elif e.get("mcp_custom"): |
| mcp = ( |
| f'<td style="{TD}"><span style="background:#fef3c7;' |
| "color:#92400e;padding:0.15rem 0.55rem;border-radius:4px;" |
| 'font-size:0.72rem;font-weight:700">custom</span></td>' |
| ) |
| else: |
| mcp = ( |
| f'<td style="{TD}"><span style="background:#dbeafe;' |
| "color:#1e40af;padding:0.15rem 0.55rem;border-radius:4px;" |
| 'font-size:0.72rem;font-weight:700">reference</span></td>' |
| ) |
|
|
| |
| scol = _score_color(sc) |
| bbg = _bar_bg(sc) |
| score_cell = ( |
| f'<td style="{TD};font-weight:700;font-size:1rem;color:{scol};' |
| f'position:relative;font-variant-numeric:tabular-nums">' |
| f'<div style="position:absolute;left:0;top:0;bottom:0;' |
| f"width:{sc}%;background:{bbg};" |
| f'border-radius:3px"></div>' |
| f'<span style="position:relative">{sc:.1f}</span></td>' |
| ) |
|
|
| |
| tc = e.get("tasks_completed", 0) |
| tt = e.get("tasks_total", 76) |
| tasks = f'<td style="{TD}">{tc}/{tt}</td>' |
| zeros = f'<td style="{TD}">{e.get("tasks_with_zero", 0)}</td>' |
|
|
| rows.append( |
| f'<tr style="background:{sty["bg"]}">' |
| f"{rank}{name}{org}{mode}{mcp}{score_cell}{tasks}{zeros}</tr>" |
| ) |
|
|
| return f""" |
| <table style="width:100%;border-collapse:collapse;background:white; |
| border-radius:10px;overflow:hidden; |
| box-shadow:0 1px 3px rgba(0,0,0,0.08)"> |
| <thead><tr> |
| <th style="{TH};width:50px">#</th> |
| <th style="{TH}">Agent</th> |
| <th style="{TH}">Organization</th> |
| <th style="{TH}">Mode</th> |
| <th style="{TH}">MCP</th> |
| <th style="{TH}">Score</th> |
| <th style="{TH}">Tasks</th> |
| <th style="{TH}">Zero-Score</th> |
| </tr></thead> |
| <tbody>{''.join(rows)}</tbody> |
| </table>""" |
|
|
|
|
| |
|
|
|
|
def build_heatmap(entry: dict) -> str:
    """HTML heatmap for one agent across the 2 × 5 design matrix
    (DesignApproach × MolecularSubject): 9 cells are valid, since
    redesign × binder is empty.
    """
| ts = entry.get("taxonomy_scores", {}) |
| TH = ( |
| "background:#0f172a;color:white;padding:0.6rem 0.8rem;" |
| "text-align:center;font-size:0.75rem;font-weight:600" |
| ) |
| TD = ( |
| "text-align:center;padding:0.5rem;font-size:0.85rem;" |
| "font-weight:600;border-bottom:1px solid #e2e8f0" |
| ) |
|
|
| rows = [] |
| for ap in APPROACHES: |
| cells = [ |
| f'<td style="{TD};text-align:left;font-weight:700;' |
| f'background:#f8fafc;color:#0f172a">{APPROACH_LABELS[ap]}</td>' |
| ] |
| vals = [] |
| for sj in SUBJECTS: |
| if sj in VALID_CELLS[ap]: |
| val = ts.get(ap, {}).get(sj) |
| bg = _heat_color(val) |
| n = N_TASKS_PER_CELL.get((ap, sj), 0) |
| text = ( |
| f'{val:.0f}<br><span style="font-size:0.65rem;' |
| f'font-weight:400;color:#64748b">n={n}</span>' |
| if val is not None |
| else "\u2014" |
| ) |
| cells.append(f'<td style="{TD};background:{bg}">{text}</td>') |
| if val is not None: |
| vals.append(val) |
| else: |
| cells.append( |
| f'<td style="{TD};color:#cbd5e0;font-weight:400">' |
| "n/a</td>" |
| ) |
| avg = sum(vals) / len(vals) if vals else 0 |
| avg_bg = _heat_color(avg) |
| cells.append( |
| f'<td style="{TD};font-weight:700;background:{avg_bg}">' |
| f"{avg:.1f}</td>" |
| ) |
| rows.append(f'<tr>{"".join(cells)}</tr>') |
|
|
| sj_headers = "".join( |
| f'<th style="{TH}">{SUBJECT_LABELS[sj]}</th>' |
| for sj in SUBJECTS |
| ) |
|
|
| return f""" |
| <table style="width:100%;border-collapse:collapse;background:white; |
| border-radius:10px;overflow:hidden; |
| box-shadow:0 1px 3px rgba(0,0,0,0.08)"> |
| <thead><tr> |
| <th style="{TH};text-align:left">Approach \u2193 / Subject \u2192</th> |
| {sj_headers} |
| <th style="{TH}">Mean</th> |
| </tr></thead> |
| <tbody>{''.join(rows)}</tbody> |
| </table>""" |
|
|
|
|
| |
|
|
|
|
| def build_mode_cards(entries: list) -> str: |
| """Per-LLM cards showing unguided vs guided delta.""" |
| by_name: dict[str, dict] = {} |
| for e in entries: |
| if e["submission_type"] != "llm": |
| continue |
| by_name.setdefault(e["agent_name"], {})[e["mode"]] = e |
|
|
| ordered = sorted( |
| by_name.items(), |
| key=lambda x: x[1].get("guided", {}).get("overall_score", 0), |
| reverse=True, |
| ) |
|
|
| cards = [] |
| for name, modes in ordered: |
| bench = modes.get("unguided") |
| user = modes.get("guided") |
| if not bench or not user: |
| continue |
| delta = user["overall_score"] - bench["overall_score"] |
| pct = (delta / bench["overall_score"] * 100) if bench["overall_score"] else 0 |
|
|
| lines = [ |
| '<div style="display:flex;justify-content:space-between;' |
| 'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">' |
| "<span>Unguided</span>" |
| f'<span style="font-weight:700;color:#e53e3e">' |
| f'{bench["overall_score"]:.1f}</span></div>', |
| '<div style="display:flex;justify-content:space-between;' |
| 'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">' |
| "<span>Guided</span>" |
| f'<span style="font-weight:700;color:#d69e2e">' |
| f'{user["overall_score"]:.1f}</span></div>', |
| '<div style="display:flex;justify-content:space-between;' |
| 'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">' |
| "<span>Delta</span>" |
| f'<span style="font-weight:700;color:#38a169">' |
| f"+{delta:.1f} (+{pct:.0f}%)</span></div>", |
| ] |
| for c in COMPONENTS: |
| d = user["component_scores"][c] - bench["component_scores"][c] |
| color = "#38a169" if d >= 0 else "#e53e3e" |
| sign = "+" if d >= 0 else "" |
| lines.append( |
| '<div style="display:flex;justify-content:space-between;' |
| 'padding:0.3rem 0;border-bottom:1px solid #e2e8f0;' |
| 'font-size:0.85rem">' |
| f'<span style="color:#718096">{c}</span>' |
| f'<span style="font-weight:700;color:{color}">' |
| f"{sign}{d:.1f}</span></div>" |
| ) |
|
|
| cards.append( |
| '<div style="background:white;border-radius:10px;padding:1.2rem;' |
| 'box-shadow:0 1px 3px rgba(0,0,0,0.08)">' |
| f'<h4 style="font-size:0.95rem;color:#0f172a;' |
| f'margin:0 0 0.8rem">{name}</h4>' |
| f'{"".join(lines)}</div>' |
| ) |
|
|
| return ( |
| '<div style="display:grid;grid-template-columns:' |
| 'repeat(auto-fit,minmax(250px,1fr));gap:1rem;margin-top:1rem">' |
| f'{"".join(cards)}</div>' |
| ) |
|
|
|
|
| |
|
|
|
|
| def build_headline_findings(findings: list) -> str: |
| """Top-of-page banner that surfaces the paper's three core claims.""" |
| if not findings: |
| return "" |
| cards = [] |
| accents = ["#3182ce", "#d69e2e", "#805ad5", "#38a169", "#e53e3e"] |
| for i, text in enumerate(findings): |
| c = accents[i % len(accents)] |
| cards.append( |
| f'<div style="background:#ffffff;border:1px solid #e2e8f0;' |
| f"border-left:4px solid {c};border-radius:10px;" |
| f'padding:0.85rem 1rem;flex:1 1 220px;min-width:220px;' |
| f'box-shadow:0 1px 3px rgba(0,0,0,0.04)">' |
| f'<div style="font-size:0.7rem;font-weight:700;' |
| f'color:{c};letter-spacing:0.08em;text-transform:uppercase;' |
| f'margin-bottom:0.35rem">Finding {i+1}</div>' |
| f'<div style="font-size:0.82rem;color:#1a202c;' |
| f'line-height:1.45">{text}</div></div>' |
| ) |
| return ( |
| '<div style="display:flex;flex-wrap:wrap;gap:0.7rem;' |
| 'margin:0.4rem 0 1rem">' |
| f"{''.join(cards)}</div>" |
| ) |
|
|
|
|
| |
|
|
|
|
def build_intervention_section(interventions: dict) -> str:
    """Show forced-depth and low-variety intervention results.

    The forced-depth condition mandates ≥3 evaluation metric categories
    per design candidate; the low-variety control spends comparable
    compute on a narrow range of evaluation metrics. Together they
    isolate evaluation depth, rather than raw compute, as the causal
    driver of the depth gap reported in the paper.
    """
| if not interventions or not interventions.get("rows"): |
| return '<p style="color:#718096">No intervention data available.</p>' |
|
|
| rows = interventions["rows"] |
|
|
| cond_meta = { |
| "baseline": ("#64748b", "Baseline"), |
| "forced_depth": ("#38a169", "Forced Depth"), |
| "low_variety_control": ("#d69e2e", "Low-Variety Control"), |
| } |
|
|
| TH = ( |
| "background:#0f172a;color:white;padding:0.65rem 0.9rem;" |
| "text-align:left;font-size:0.72rem;text-transform:uppercase;" |
| "letter-spacing:0.05em;font-weight:600" |
| ) |
| TD = ("padding:0.6rem 0.9rem;border-bottom:1px solid #e2e8f0;" |
| "font-size:0.86rem") |
|
|
| body = [] |
| for r in rows: |
| color, cond_label = cond_meta.get(r["condition"], ("#64748b", r["condition"])) |
| delta = r.get("delta_vs_baseline") |
| if delta is None or r["condition"] == "baseline": |
| delta_html = '<span style="color:#cbd5e0">\u2014</span>' |
| else: |
| sign = "+" if delta >= 0 else "" |
| dcol = "#38a169" if delta > 0 else ("#e53e3e" if delta < 0 else "#64748b") |
| delta_html = ( |
| f'<span style="color:{dcol};font-weight:700">' |
| f"{sign}{delta:.1f}</span>" |
| ) |
| body.append( |
| f'<tr><td style="{TD};font-weight:600;color:#0f172a">' |
| f'{r["label"]}</td>' |
| f'<td style="{TD}"><span style="background:{color}22;' |
| f"color:{color};padding:0.15rem 0.55rem;border-radius:4px;" |
| f'font-size:0.72rem;font-weight:700">{cond_label}</span></td>' |
| f'<td style="{TD};font-weight:700;font-variant-numeric:' |
| f'tabular-nums">{r["score"]:.1f}</td>' |
| f'<td style="{TD};font-variant-numeric:tabular-nums">{delta_html}</td>' |
| f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">' |
| f'{r["approach"]:.1f} / {r["orchestration"]:.1f}</td>' |
| f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">' |
| f'{r["quality"]:.1f}</td>' |
| f'<td style="{TD};color:#475569;font-variant-numeric:tabular-nums">' |
| f'{r["diversity"]:.1f}</td></tr>' |
| ) |
|
|
| n = interventions.get("n_tasks", 18) |
|
|
| return f""" |
| <div style="max-width:980px;margin:0 auto"> |
| |
| <div style="background:#ffffff;border:1px solid #e2e8f0; |
| border-radius:12px;padding:1.4rem 1.6rem; |
| margin-bottom:1rem"> |
| <h2 style="color:#0f172a;margin:0 0 0.5rem;font-size:1.2rem; |
| font-weight:700">Causal interventions on the depth gap</h2> |
| <p style="color:#475569;line-height:1.55;margin:0"> |
| {interventions.get('description', '')} |
| Reruns are scored on a representative <strong>{n}-task</strong> |
| subset that spans all 9 occupied taxonomy cells. |
| </p> |
| </div> |
| |
| <div style="background:#fefce8;border-left:4px solid #ca8a04; |
| border-radius:8px;padding:0.95rem 1.1rem; |
| margin-bottom:1.1rem"> |
| <strong style="color:#713f12">Headline:</strong> |
| <span style="color:#52340d"> |
| Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and |
| <strong>GPT-5 by +15.9</strong> points without any change to |
| the underlying model or tools, while the low-variety control |
| <em>hurts</em> DeepSeek V3 (−2.3). The dissociation is |
      cleanest on DeepSeek V3, the strongest agent, where it
      provides direct causal evidence that
| <strong>evaluation variety — not raw compute — |
| drives the gain</strong>. GPT-5's response is more uniform |
| across both interventions; we report the raw deltas without |
| smoothing. |
| </span> |
| </div> |
| |
| <table style="width:100%;border-collapse:collapse;background:white; |
| border-radius:10px;overflow:hidden; |
| box-shadow:0 1px 3px rgba(0,0,0,0.08)"> |
| <thead><tr> |
| <th style="{TH}">Run</th> |
| <th style="{TH}">Condition</th> |
| <th style="{TH}">Score</th> |
| <th style="{TH}">Δ vs baseline</th> |
| <th style="{TH}">Approach / Orch.</th> |
| <th style="{TH}">Quality</th> |
| <th style="{TH}">Diversity</th> |
| </tr></thead> |
| <tbody>{''.join(body)}</tbody> |
| </table> |
| |
| <p style="color:#64748b;font-size:0.78rem;margin-top:0.8rem; |
| line-height:1.5"> |
| Scoring uses the same 100-point hybrid rubric as the main |
| leaderboard but is restricted to {n} representative tasks; |
| absolute values therefore differ from the full-benchmark mean. |
| The <em>delta vs baseline</em> compares each agent against |
| its own untreated baseline run, isolating the intervention effect. |
| </p> |
| </div> |
| """ |
|
|
|
|
| |
|
|
|
|
| def build_about() -> str: |
| h2 = ( |
| 'style="color:#0f172a;margin:0 0 0.8rem;font-size:1.25rem;' |
| 'font-weight:700"' |
| ) |
| h3 = ( |
| 'style="color:#334155;margin:1.2rem 0 0.5rem;font-size:1rem;' |
| 'font-weight:600"' |
| ) |
| p = 'style="margin-bottom:0.8rem;color:#475569;line-height:1.6"' |
| card = ( |
| 'style="background:#ffffff;border:1px solid #e2e8f0;' |
| 'border-radius:12px;padding:2rem;margin-bottom:1.2rem"' |
| ) |
| stat_box = ( |
| 'style="background:#f8fafc;border:1px solid #e2e8f0;' |
| 'border-radius:10px;padding:1rem;text-align:center"' |
| ) |
| return f""" |
| <div style="max-width:900px;margin:0 auto"> |
| |
| <div {card}> |
| <h2 {h2}>What is BioDesignBench?</h2> |
| <p {p}> |
| BioDesignBench is a benchmark for evaluating LLM agents as |
| orchestrators of multi-step <em>stochastic</em> protein-design |
| pipelines. Unlike chemistry- or code-agent benchmarks, where |
| tool chains are largely deterministic, protein design demands |
| repeated sampling from generative tools (RFdiffusion, |
| ProteinMPNN) and iterative cross-validation through several |
| biophysical metrics. We test the full agentic loop — |
| <strong>plan → call → evaluate → iterate</strong> |
| — over 76 expert-curated tasks drawn from 2024–2026 |
| literature, exposed through 17 MCP-integrated tools. |
| </p> |
| <div style="display:grid;grid-template-columns: |
| repeat(auto-fit,minmax(140px,1fr));gap:0.8rem; |
| margin:1rem 0"> |
| <div {stat_box}> |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> |
| 76</div> |
| <div style="font-size:0.78rem;color:#64748b">design tasks</div> |
| </div> |
| <div {stat_box}> |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> |
| 9</div> |
| <div style="font-size:0.78rem;color:#64748b"> |
| taxonomy cells<br>(2 approaches \u00d7 5 subjects)</div> |
| </div> |
| <div {stat_box}> |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> |
| 17</div> |
| <div style="font-size:0.78rem;color:#64748b">MCP tools</div> |
| </div> |
| <div {stat_box}> |
| <div style="font-size:1.8rem;font-weight:800;color:#0f172a"> |
| 100</div> |
| <div style="font-size:0.78rem;color:#64748b">point rubric</div> |
| </div> |
| </div> |
| </div> |
| |
| <div {card}> |
| <h2 {h2}>Three principal findings</h2> |
| <h3 {h3}>1. Top-tier agents now beat the hardcoded pipeline</h3> |
| <p {p}> |
| DeepSeek V3 and GPT-5 surpass the deterministic hardcoded |
| pipeline (54.5) under both modes. Autonomous protein-design |
| orchestration is no longer infeasible — but a substantial |
| gap to the human expert (61.7) and oracle (75.2) remains. |
| </p> |
| <h3 {h3}>2. Coverage–depth dissociation</h3> |
| <p {p}> |
| Workflow guidance closes the <em>coverage</em> gap, bringing |
| agent tool selection closer to the human expert, but leaves |
| <em>evaluation depth</em> unchanged. Better tool documentation |
| can teach agents <em>which</em> tools to call, but not how |
| thoroughly to use them on each generated candidate. |
| </p> |
| <h3 {h3}>3. Evaluation variety, not tool knowledge, is the bottleneck</h3> |
| <p {p}> |
| Across 836 task–condition observations, the number of |
| distinct evaluation metric categories per candidate correlates |
| with total score at |
| <strong>ρ = 0.68</strong> |
| (<em>p</em> < 10<sup>-115</sup>). LLM agents generate |
| backbone candidates at expert-level rates but invoke scoring |
| tools at only <strong>~14% of expert intensity</strong>. |
| Forced-depth interventions confirm this is causal — see |
| the <em>Depth Gap</em> tab. |
| </p> |
| </div> |
| |
| <div {card}> |
| <h2 {h2}>How to submit</h2> |
| <p {p}> |
| Unlike most agent benchmarks, <strong>you do not host an HTTP |
| endpoint</strong>. The 76 task descriptions never leave Romero |
| Lab infrastructure. Instead you provide:</p> |
| <ol style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem; |
| line-height:1.7"> |
| <li>an <strong>LLM provider + API key</strong> |
| (Anthropic / OpenAI / Google / DeepSeek). |
| We run the BioDesignBench agent loop against your chosen |
| model inside the leaderboard backend. Your key is |
| <em>scrubbed</em> from our records immediately after the |
| dispatch phase completes.</li> |
| <li>optionally, a <strong>custom MCP URL</strong> if you want |
| to evaluate your own tool implementations. Otherwise, the |
| agent calls our reference |
| <a href="https://github.com/jasonkim8652/protein-design-mcp" |
| style="color:#2563eb;font-weight:600">protein-design-mcp</a> |
| endpoint (in progress).</li> |
| </ol> |
| |
| <h3 {h3}>Data flow</h3> |
| <p {p}> |
| Each task prompt is sent to your chosen LLM provider via |
| their standard API (Anthropic, OpenAI, Google, DeepSeek) — |
| that single channel is the only path by which task data leaves |
| Romero Lab. The MCP server (reference or custom) only ever |
| sees operational tool arguments (sequences, PDB paths, hotspot |
| residues); it never sees the raw task prompt or evaluation |
| criteria. Every task prompt also carries a unique 16-character |
| canary token as an HTML comment, for retrospective leakage |
| detection.</p> |
| |
| <h3 {h3}>Bring your own tools (Custom MCP)</h3> |
| <p {p}> |
| If you want to benchmark a new tool implementation (a faster |
| structure predictor, a different diffusion backbone, your own |
| stability model) against the same 76 tasks and rubric, stand |
| up an HTTPS endpoint that satisfies the MCP contract and paste |
| the URL into the submission form's |
| <em>Advanced: Custom MCP</em> section:</p> |
| <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem; |
| line-height:1.7"> |
| <li><strong>Contract + hosting options</strong>: |
| <a href="https://github.com/RomeroLab/BioDesignBench/blob/main/biodesignbench-leaderboard/README.md#bringing-your-own-mcp-tools" |
| style="color:#2563eb;font-weight:600">leaderboard README</a></li> |
| <li><strong>Minimal FastAPI stub (~150 lines)</strong>: |
| <a href="https://github.com/RomeroLab/BioDesignBench/blob/main/biodesignbench-leaderboard/example_mcp_server.py" |
| style="color:#2563eb;font-weight:600"><code>example_mcp_server.py</code></a></li> |
| <li><strong>Reference implementation to fork</strong>: |
| <a href="https://github.com/jasonkim8652/protein-design-mcp" |
| style="color:#2563eb;font-weight:600">jasonkim8652/protein-design-mcp</a></li> |
| </ul> |
| |
| <h3 {h3}>Limits</h3> |
| <ul style="color:#475569;padding-left:1.5rem;margin-bottom:0.8rem; |
| line-height:1.7"> |
| <li>Maximum 1 submission per calendar month per organization</li> |
| <li>73 hidden tasks are used for ranking; 3 public example |
| tasks are available for development</li> |
| <li>LLM-judge API costs are paid by Romero Lab; your own |
| agent LLM calls are billed to your provider</li> |
| </ul> |
| </div> |
| |
| <div {card}> |
| <h2 {h2}>Scoring rubric (100 points, hybrid)</h2> |
| <p {p}> |
| Scores combine <strong>72 algorithmic points</strong> from |
| deterministic biophysical metrics with |
| <strong>28 LLM-judge points</strong> assessed by a 3-judge |
| panel (PoLL) with self-exclusion to mitigate self-preference |
| bias. Each component is capped at its rubric maximum to |
| prevent double counting. |
| </p> |
| <p {p}> |
| <strong>Approach (20 pts)</strong> — strategic |
| appropriateness of tool selection across 10 functional |
| categories (backbone generation, inverse folding, structure |
| prediction, etc.).</p> |
| <p {p}> |
| <strong>Orchestration (15 pts)</strong> — pipeline |
| ordering, intermediate validation, and adaptive iteration.</p> |
| <p {p}> |
| <strong>Quality (35 pts)</strong> — 100% algorithmic. |
| Continuous four-band interpolation over Boltz-2 re-prediction |
| metrics (pLDDT, pTM, ipTM, ipAE), eliminating LLM judgement |
| variance on biophysical quantities.</p> |
| <p {p}> |
| <strong>Feasibility (15 pts)</strong> — valid amino |
| acids, length constraints, composition, and biophysical |
| plausibility.</p> |
| <p {p}> |
| <strong>Novelty (5 pts)</strong> — sequence identity to |
| reference (lower identity = more novel).</p> |
| <p {p}> |
| <strong>Diversity (10 pts)</strong> — number and |
| pairwise diversity of generated designs.</p> |
| </div> |
| |
| <div {card}> |
| <h2 {h2}>Five-layer contamination defense</h2> |
| <p {p}>Every evaluated LLM may have read protein-design |
| literature during pretraining, so we use a layered defense:</p> |
| <ul style="color:#475569;padding-left:1.5rem; |
| margin-bottom:0.8rem;line-height:1.7"> |
| <li>All 76 tasks derived from publications dated 2024–2026, |
| post-dating model training cutoffs.</li> |
| <li>Task prompts paraphrased and restructured — no |
| verbatim passages from source literature.</li> |
| <li>Targets specified by biological function and structural |
| constraints, not by name or PDB identifier.</li> |
| <li>12 decoy tasks with deliberately fabricated targets to |
| detect memorisation-based responses.</li> |
| <li>n-gram overlap analysis between agent outputs and source |
| publications — no verbatim regurgitation above the |
| 8-gram threshold across any condition.</li> |
| </ul> |
| </div> |
| |
| <div {card}> |
| <h2 {h2}>Citation</h2> |
| <pre style="background:#0f172a;color:#e2e8f0;padding:1.2rem; |
| border-radius:10px;font-size:0.8rem; |
| line-height:1.6">@article{{biodesignbench2026, |
| title={{Evaluating LLM-Driven Protein Design: |
| Agents Lack Iterative Evaluation Depth}}, |
| author={{Kim, Jeonghyeon and Romero, Philip}}, |
| journal={{bioRxiv}}, |
| year={{2026}}, |
| doi={{10.64898/2026.05.06.723381}}, |
| url={{https://www.biorxiv.org/content/10.64898/2026.05.06.723381v1}} |
| }}</pre> |
| </div> |
| |
| </div>""" |
|
|
|
|
| |
| |
| |
|
|
|
|
| def chart_taxonomy_bar(entry: dict) -> go.Figure: |
| """Grouped bar chart of mean score per molecular subject, |
| split by design approach (de novo vs redesign). |
| """ |
| ts = entry.get("taxonomy_scores", {}) |
| x_labels = [SUBJECT_LABELS[s] for s in SUBJECTS] |
|
|
| def _series(ap): |
| out = [] |
| for sj in SUBJECTS: |
| if sj in VALID_CELLS[ap]: |
| out.append(ts.get(ap, {}).get(sj)) |
| else: |
| out.append(None) |
| return out |
|
|
| dn = _series("de_novo") |
| rd = _series("redesign") |
|
|
| fig = go.Figure() |
| fig.add_trace(go.Bar( |
| x=x_labels, y=dn, name="De Novo", |
| marker_color="rgba(49,130,206,0.78)", |
| text=[f"{v:.0f}" if v is not None else "" for v in dn], |
| textposition="outside", |
| )) |
| fig.add_trace(go.Bar( |
| x=x_labels, y=rd, name="Redesign", |
| marker_color="rgba(214,158,46,0.78)", |
| text=[f"{v:.0f}" if v is not None else "" for v in rd], |
| textposition="outside", |
| )) |
| mode = entry.get("mode") or "\u2014" |
| fig.update_layout( |
| **_base_layout( |
| barmode="group", |
| title=dict( |
| text=f"{entry['agent_name']} ({mode}) \u2014 Mean Score by Cell", |
| font_size=14, |
| ), |
| yaxis=dict(range=[0, 100], title="Hybrid score (out of 100)"), |
| xaxis=dict(title=""), |
| legend=dict(orientation="h", yanchor="bottom", y=-0.2, |
| xanchor="center", x=0.5), |
| height=340, |
| ) |
| ) |
| return fig |
|
|
|
|
| def chart_radar(e1: dict, e2: dict) -> go.Figure: |
| """Radar chart comparing two agents' component scores (% of max).""" |
| labels = [c.capitalize() for c in COMPONENTS] |
|
|
| def norm(e): |
| return [e["component_scores"][c] / COMP_MAX[c] * 100 for c in COMPONENTS] |
|
|
| v1, v2 = norm(e1), norm(e2) |
| m1 = e1.get("mode") or "\u2014" |
| m2 = e2.get("mode") or "\u2014" |
|
|
| fig = go.Figure() |
| fig.add_trace( |
| go.Scatterpolar( |
| r=v1 + [v1[0]], |
| theta=labels + [labels[0]], |
| fill="toself", |
| name=f'{e1["agent_name"]} ({m1})', |
| line=dict(color="rgba(49,130,206,0.8)"), |
| fillcolor="rgba(49,130,206,0.15)", |
| ) |
| ) |
| fig.add_trace( |
| go.Scatterpolar( |
| r=v2 + [v2[0]], |
| theta=labels + [labels[0]], |
| fill="toself", |
| name=f'{e2["agent_name"]} ({m2})', |
| line=dict(color="rgba(229,62,62,0.8)"), |
| fillcolor="rgba(229,62,62,0.15)", |
| ) |
| ) |
| fig.update_layout( |
| **_base_layout( |
| polar=dict( |
| radialaxis=dict(visible=True, range=[0, 100], ticksuffix="%") |
| ), |
| showlegend=True, |
| legend=dict( |
| orientation="h", yanchor="bottom", y=-0.25, |
| xanchor="center", x=0.5, |
| ), |
| title=dict(text="Component Radar (% of max)", font_size=14), |
| height=420, |
| ) |
| ) |
| return fig |
|
|
|
|
| def chart_component_bar(e1: dict, e2: dict) -> go.Figure: |
| """Horizontal bar chart of raw component scores for two agents.""" |
| labels = [f"{c.capitalize()} (/{COMP_MAX[c]})" for c in COMPONENTS] |
| m1 = e1.get("mode") or "\u2014" |
| m2 = e2.get("mode") or "\u2014" |
|
|
| fig = go.Figure() |
| fig.add_trace( |
| go.Bar( |
| y=labels, |
| x=[e1["component_scores"][c] for c in COMPONENTS], |
| name=f'{e1["agent_name"]} ({m1})', |
| orientation="h", |
| marker_color="rgba(49,130,206,0.7)", |
| ) |
| ) |
| fig.add_trace( |
| go.Bar( |
| y=labels, |
| x=[e2["component_scores"][c] for c in COMPONENTS], |
| name=f'{e2["agent_name"]} ({m2})', |
| orientation="h", |
| marker_color="rgba(229,62,62,0.7)", |
| ) |
| ) |
| fig.update_layout( |
| **_base_layout( |
| barmode="group", |
| xaxis=dict(title="Score"), |
| title=dict(text="Component Breakdown", font_size=14), |
| legend=dict( |
| orientation="h", yanchor="bottom", y=-0.3, |
| xanchor="center", x=0.5, |
| ), |
| height=420, |
| ) |
| ) |
| return fig |
|
|
|
|
| def chart_mode_comparison(entries: list) -> go.Figure: |
| """Grouped bar chart: unguided vs guided mode for each LLM.""" |
| by_name: dict[str, dict[str, float]] = {} |
| for e in entries: |
| if e["submission_type"] != "llm": |
| continue |
| by_name.setdefault(e["agent_name"], {})[e["mode"]] = e["overall_score"] |
|
|
| ordered = sorted( |
| by_name.items(), |
| key=lambda x: x[1].get("guided", 0), |
| reverse=True, |
| ) |
| names = [n for n, _ in ordered] |
| bench = [m.get("unguided", 0) for _, m in ordered] |
| user = [m.get("guided", 0) for _, m in ordered] |
|
|
| fig = go.Figure() |
| fig.add_trace( |
| go.Bar( |
| x=names, y=bench, name="Unguided", |
| marker_color="rgba(229,62,62,0.6)", |
| ) |
| ) |
| fig.add_trace( |
| go.Bar( |
| x=names, y=user, name="Guided", |
| marker_color="rgba(56,161,105,0.6)", |
| ) |
| ) |
| fig.update_layout( |
| **_base_layout( |
| barmode="group", |
| yaxis=dict(range=[0, 80], title="Overall hybrid score"), |
| xaxis=dict(title=""), |
| title=dict( |
| text=("Unguided vs Guided modes \u2014 " |
| "guidance lifts coverage but rarely shifts overall score"), |
| font_size=13, |
| ), |
| legend=dict( |
| orientation="h", yanchor="bottom", y=-0.18, |
| xanchor="center", x=0.5, |
| ), |
| height=380, |
| ) |
| ) |
| return fig |
|
|
|
|
| |
| |
| |
|
|
|
|
| def create_app() -> gr.Blocks: |
| data = load_data() |
| entries = data["entries"] |
| by_id = {e["agent_id"]: e for e in entries} |
|
|
| |
| agent_choices = [] |
| for e in entries: |
| sty = TYPE_STYLE.get(e["submission_type"], TYPE_STYLE["llm"]) |
| icon = sty["icon"] |
| mode = e.get("mode") or "\u2014" |
| label = f"{icon} {e['agent_name']} ({mode})".strip() |
| agent_choices.append((label, e["agent_id"])) |
|
|
| |
| def _choice_val(idx: int) -> str: |
| return agent_choices[min(idx, len(agent_choices) - 1)][1] |
|
|
| with gr.Blocks( |
| theme=gr.themes.Soft(primary_hue="blue"), |
| css=CUSTOM_CSS, |
| js=FORCE_LIGHT_JS, |
| ) as app: |
|
|
| gr.HTML(build_header(data["last_updated"], len(entries))) |
| gr.HTML(build_headline_findings(data.get("headline_findings", []))) |
|
|
| with gr.Tabs(): |
|
|
| |
| with gr.Tab("\U0001f4ca Overall"): |
| with gr.Row(): |
| f_mode = gr.Dropdown( |
| ["All", "Unguided", "Guided"], |
| value="All", label="Mode", scale=1, |
| ) |
| f_mcp = gr.Dropdown( |
| ["All", "Reference", "Custom"], |
| value="All", label="MCP Tools", scale=1, |
| ) |
| f_type = gr.Dropdown( |
| ["All Entries", "LLM Only", "Baselines Only"], |
| value="All Entries", label="Show", scale=1, |
| ) |
|
|
| tbl = gr.HTML( |
| build_leaderboard_table( |
| entries, "All", "All", "All Entries" |
| ) |
| ) |
|
|
| def _update_table(m, mc, t): |
| return build_leaderboard_table(entries, m, mc, t) |
|
|
| for dd in [f_mode, f_mcp, f_type]: |
| dd.change( |
| _update_table, [f_mode, f_mcp, f_type], tbl |
| ) |
|
|
| |
| with gr.Tab("\U0001f9ec Taxonomy"): |
| tax_dd = gr.Dropdown( |
| agent_choices, |
| value=_choice_val(0), |
| label="Select Agent", |
| ) |
| hm_html = gr.HTML(build_heatmap(entries[0])) |
| tax_plot = gr.Plot(chart_taxonomy_bar(entries[0])) |
|
|
| def _update_taxonomy(aid): |
| e = by_id.get(aid, entries[0]) |
| return build_heatmap(e), chart_taxonomy_bar(e) |
|
|
| tax_dd.change( |
| _update_taxonomy, [tax_dd], [hm_html, tax_plot] |
| ) |
|
|
| |
| with gr.Tab("\U0001f3af Components"): |
| with gr.Row(): |
| c1 = gr.Dropdown( |
| agent_choices, value=_choice_val(0), |
| label="Agent 1", scale=1, |
| ) |
| c2 = gr.Dropdown( |
| agent_choices, value=_choice_val(4), |
| label="Agent 2", scale=1, |
| ) |
| with gr.Row(): |
| radar = gr.Plot( |
| chart_radar( |
| entries[0], |
| entries[min(4, len(entries) - 1)], |
| ) |
| ) |
| comp_bar = gr.Plot( |
| chart_component_bar( |
| entries[0], |
| entries[min(4, len(entries) - 1)], |
| ) |
| ) |
|
|
                def _update_comp(a1, a2):
                    # Fall back to the same entries used for the initial render.
                    e1 = by_id.get(a1, entries[0])
                    e2 = by_id.get(a2, entries[min(4, len(entries) - 1)])
                    return chart_radar(e1, e2), chart_component_bar(e1, e2)
|
|
| for dd in [c1, c2]: |
| dd.change(_update_comp, [c1, c2], [radar, comp_bar]) |
|
|
| |
| with gr.Tab("\u26a1 Guidance Effect"): |
| gr.HTML( |
| '<div style="background:#eff6ff;border-left:4px solid ' |
| '#3182ce;border-radius:8px;padding:0.85rem 1.1rem;' |
| 'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;' |
| 'line-height:1.55">' |
| '<strong>Mode semantics:</strong> ' |
| '<em>Unguided mode</em> exposes atomic tools without ' |
| 'pipeline hints; <em>guided mode</em> packages ' |
| 'them into composite workflows with explicit pipeline ' |
| 'structure. Guidance lifts the lowest-tier ' |
| 'agents but does not consistently help capable ones, ' |
| 'and never closes the evaluation-depth gap (see ' |
| '<em>Depth Gap</em> tab).</div>' |
| ) |
| gr.Plot(chart_mode_comparison(entries)) |
| gr.HTML(build_mode_cards(entries)) |
|
|
| |
| with gr.Tab("\U0001f50d Depth Gap"): |
| gr.HTML(build_intervention_section( |
| data.get("interventions", {}) |
| )) |
|
|
| |
| with gr.Tab("\U0001f4e4 Submit"): |
| gr.HTML(""" |
| <div style="max-width:820px;margin:0 auto;padding:1rem"> |
| <h2 style="color:#0f172a;margin:0 0 0.5rem; |
| font-weight:700;font-size:1.25rem"> |
| Submit your agent</h2> |
| <p style="color:#475569;margin-bottom:1rem;line-height:1.6"> |
| BioDesignBench evaluates models inside Romero Lab |
| infrastructure to keep the 76 task specifications |
| contamination-clean. You provide an LLM API key and |
| a model name, and we run the BioDesignBench agent |
| loop against your model with the reference 17-tool |
| MCP server. Task content never leaves Romero Lab |
| except through your chosen LLM provider's API call. |
| </p> |
| |
| <div style="background:#dcfce7;border-left:4px solid #15803d; |
| padding:0.95rem 1.1rem;border-radius:8px; |
| margin-bottom:1rem;font-size:0.86rem; |
| color:#14532d;line-height:1.55"> |
| <strong>How your credentials are handled:</strong> |
| <ul style="margin:0.5rem 0 0 1.1rem;padding:0"> |
| <li>Your API key is stored on the submission row |
| only between submission and dispatch, then |
| <strong>scrubbed automatically</strong> regardless |
| of whether the run succeeded.</li> |
| <li>Each task carries a unique 16-character canary |
| token (invisible HTML comment) so we can |
| retrospectively detect leakage in published |
| models.</li> |
| <li>The MCP server (reference or custom) sees |
| only operational tool arguments, never the raw |
| task description or evaluation criteria.</li> |
| </ul> |
| </div> |
| |
| <div style="background:#eff6ff;border-left:4px solid #3182ce; |
| padding:0.95rem 1.1rem;border-radius:8px; |
| margin-bottom:1rem;font-size:0.86rem; |
| color:#1e3a8a;line-height:1.55"> |
| <strong>Reference vs Custom MCP</strong> |
| <ul style="margin:0.5rem 0 0 1.1rem;padding:0"> |
| <li><strong>Reference</strong> (default): your |
| agent uses our hosted |
| <a href="https://github.com/jasonkim8652/protein-design-mcp" |
| style="color:#1d4ed8;font-weight:600">protein-design-mcp</a> |
| endpoint. Eligible for the reference ranking.</li> |
| <li><strong>Custom</strong>: provide your own |
| public MCP URL implementing the same 17-tool |
| schema. Useful for benchmarking new tool |
| implementations against an identical model |
| under identical task prompts. Tagged with a |
| <code>custom</code> badge.</li> |
| </ul> |
| </div> |
| |
| <div style="background:#fefce8;border-left:3px solid #ca8a04; |
| padding:0.8rem 1rem;border-radius:6px; |
| margin-bottom:1rem;font-size:0.85rem;color:#713f12"> |
| <strong>Rate limit:</strong> 1 submission per calendar |
                    month per organization. Your LLM API costs are billed to
                    your account; GPU costs for the reference MCP server are
                    paid by Romero Lab. Please be considerate.
| </div> |
| </div>""") |
|
|
| with gr.Column(scale=1): |
| sub_agent = gr.Textbox( |
| label="Agent Name", |
| placeholder="e.g., GPT-5 with reference MCP", |
| ) |
| sub_org = gr.Textbox( |
| label="Organization", |
| placeholder="e.g., OpenAI", |
| ) |
| with gr.Row(): |
| sub_provider = gr.Dropdown( |
| choices=[ |
| ("Anthropic Claude", "anthropic"), |
| ("OpenAI GPT", "openai"), |
| ("Google Gemini", "google"), |
| ("DeepSeek", "deepseek"), |
| ], |
| value="anthropic", |
| label="LLM Provider", |
| ) |
| sub_model = gr.Textbox( |
| label="Model name", |
| placeholder="e.g., claude-sonnet-4-20250514", |
| ) |
| sub_api_key = gr.Textbox( |
| label="API key (transient -- scrubbed after dispatch)", |
| placeholder="sk-...", |
| type="password", |
| ) |
| sub_desc = gr.Textbox( |
| label="Description (optional)", |
| placeholder="Brief description of your agent...", |
| lines=2, |
| ) |
| with gr.Accordion("Advanced: Custom MCP", open=False): |
| sub_custom_mcp_url = gr.Textbox( |
| label="Custom MCP URL (optional)", |
| placeholder="https://your-mcp.example.com/predict", |
| ) |
| sub_custom_mcp_token = gr.Textbox( |
| label="Custom MCP bearer token (optional)", |
| placeholder="(empty if your MCP needs no auth)", |
| type="password", |
| ) |
| sub_btn = gr.Button( |
| "Submit for Review", |
| variant="primary", |
| ) |
| sub_result = gr.HTML() |
|
|
| def _handle_submit( |
| name, org, provider, model, api_key, desc, |
| custom_mcp_url, custom_mcp_token, |
| ): |
| if not name or not org or not model or not api_key: |
                        return ('<div style="color:#e53e3e;padding:0.5rem">'
                                "Agent name, organization, model name, and "
                                "API key are required.</div>")
| try: |
| from eval_queue import submit |
| result = submit( |
| agent_name=name, |
| organization=org, |
| provider=provider, |
| model_name=model, |
| api_key=api_key, |
| description=desc, |
| custom_mcp_url=custom_mcp_url or "", |
| custom_mcp_token=custom_mcp_token or "", |
| ) |
| if "error" in result: |
| return (f'<div style="color:#e53e3e;padding:0.5rem">' |
| f'{result["error"]}</div>') |
| mcp_mode = "custom" if custom_mcp_url else "reference" |
| return ( |
| f'<div style="background:#c6f6d5;padding:1rem;' |
| f'border-radius:8px;margin-top:0.5rem">' |
| f'<strong>Submitted!</strong> ' |
| f'ID: <code>{result["submission_id"]}</code><br>' |
| f'Status: {result["status"]}<br>' |
| f'Provider: <strong>{provider}</strong> ' |
| f'/ Model: <strong>{model}</strong><br>' |
| f'MCP mode: <strong>{mcp_mode}</strong><br>' |
| f'Canary: <code>{result.get("canary_token","")}</code><br>' |
| f'{result.get("message", "")}</div>' |
| ) |
| except Exception as e: |
| return (f'<div style="color:#e53e3e;padding:0.5rem">' |
| f"Error: {str(e)[:200]}</div>") |
|
|
| sub_btn.click( |
| _handle_submit, |
| [sub_agent, sub_org, sub_provider, sub_model, |
| sub_api_key, sub_desc, sub_custom_mcp_url, |
| sub_custom_mcp_token], |
| sub_result, |
| ) |
|
|
| |
| with gr.Tab("\U0001f6e0 Status"): |
| gr.HTML(""" |
| <div style="max-width:800px;margin:0 auto;padding:1rem"> |
| <h2 style="color:#0f172a;margin:0 0 0.5rem; |
| font-weight:700;font-size:1.25rem"> |
| Submission status</h2> |
| <p style="color:#475569;margin-bottom:0.5rem;line-height:1.6"> |
| Check your submission status or manage the pipeline |
| (admin only).</p> |
| </div>""") |
|
|
| |
| with gr.Accordion("Check Submission Status", open=True): |
| status_id = gr.Textbox( |
| label="Submission ID", |
| placeholder="Enter your submission ID...", |
| ) |
| status_btn = gr.Button("Check Status") |
| status_out = gr.HTML() |
|
|
| def _check_status(sid): |
| if not sid: |
| return '<div style="color:#718096">Enter an ID above.</div>' |
| try: |
| from eval_queue import get_submission |
| sub = get_submission(sid.strip()) |
| if sub is None: |
| return ('<div style="color:#e53e3e">' |
| "Submission not found.</div>") |
| status_color = { |
| "pending": "#d69e2e", "approved": "#38a169", |
| "dispatching": "#3182ce", "boltz": "#805ad5", |
| "scoring": "#805ad5", "complete": "#38a169", |
| "failed": "#e53e3e", "rejected": "#e53e3e", |
| }.get(sub["status"], "#718096") |
| score_html = "" |
| if sub.get("overall_score") is not None: |
| score_html = ( |
| f'<div style="font-size:1.2rem;' |
| f'font-weight:700;color:#0f172a;' |
| f'margin-top:0.5rem">' |
| f'Score: {sub["overall_score"]:.1f}/100' |
| f'</div>' |
| ) |
| return ( |
| f'<div style="background:white;padding:1rem;' |
| f'border-radius:8px;border:1px solid #e2e8f0">' |
| f'<strong>{sub["agent_name"]}</strong> ' |
| f'({sub["organization"]})<br>' |
| f'Status: <span style="color:{status_color};' |
| f'font-weight:700">{sub["status"]}</span><br>' |
| f'Tasks: {sub.get("tasks_dispatched", 0)}' |
| f'/{sub.get("tasks_total", 76)}<br>' |
| f'Created: {sub.get("created_at", "")[:10]}' |
| f'{score_html}</div>' |
| ) |
| except Exception as e: |
| return f'<div style="color:#e53e3e">{e}</div>' |
|
|
| status_btn.click(_check_status, [status_id], status_out) |
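
                # Admin pipeline, as wired to the phase buttons below:
                #   pending -> approved (manual review)
                #   Phase A: dispatch tasks to the submitter's LLM, then
                #            scrub the API key (status: dispatching)
                #   Phase B: Boltz-2 structure re-prediction on GPU
                #            (status: boltz)
                #   Phase C: LLM judge panel (status: scoring)
                #   Phase D: aggregate scores, finalize, publish
                # Terminal states: complete | failed | rejected.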
|
|
| |
| with gr.Accordion("Admin Panel", open=False): |
| admin_pw = gr.Textbox( |
| label="Admin Password", type="password", |
| ) |
| admin_auth_btn = gr.Button("Authenticate") |
| admin_panel = gr.Column(visible=False) |
| admin_msg = gr.HTML() |
|
|
| with admin_panel: |
| gr.HTML('<h3 style="color:#0f172a">' |
| 'Pending Submissions</h3>') |
| pending_html = gr.HTML() |
| refresh_btn = gr.Button("Refresh List") |
|
|
| with gr.Row(): |
| approve_id = gr.Textbox( |
| label="Submission ID to Approve/Reject", |
| scale=2, |
| ) |
| approve_btn = gr.Button( |
| "Approve", variant="primary", scale=1, |
| ) |
| reject_btn = gr.Button( |
| "Reject", variant="stop", scale=1, |
| ) |
| approve_msg = gr.HTML() |
|
|
| gr.HTML('<h3 style="color:#0f172a;margin-top:1rem">' |
| 'Pipeline Control</h3>') |
| with gr.Row(): |
| dispatch_id = gr.Textbox( |
| label="Submission ID", scale=2, |
| ) |
| dispatch_btn = gr.Button( |
| "Phase A: Dispatch Tasks", scale=1, |
| ) |
| with gr.Row(): |
| boltz_id = gr.Textbox( |
| label="Submission ID", scale=2, |
| ) |
| boltz_btn = gr.Button( |
| "Phase B: Run Boltz (GPU)", scale=1, |
| ) |
| with gr.Row(): |
| judge_id = gr.Textbox( |
| label="Submission ID", scale=2, |
| ) |
| judge_btn = gr.Button( |
| "Phase C: Run LLM Judge", scale=1, |
| ) |
| with gr.Row(): |
| final_id = gr.Textbox( |
| label="Submission ID", scale=2, |
| ) |
| final_btn = gr.Button( |
| "Phase D: Finalize & Publish", scale=1, |
| ) |
| pipeline_out = gr.HTML() |
|
|
| def _admin_auth(pw): |
| if pw == ADMIN_PASSWORD: |
| return ( |
| gr.Column(visible=True), |
| '<div style="color:#38a169">' |
| 'Authenticated.</div>', |
| ) |
| return ( |
| gr.Column(visible=False), |
| '<div style="color:#e53e3e">' |
| 'Wrong password.</div>', |
| ) |
|
|
| admin_auth_btn.click( |
| _admin_auth, [admin_pw], |
| [admin_panel, admin_msg], |
| ) |
|
|
| def _refresh_pending(): |
| try: |
| from eval_queue import get_pending_submissions |
| pending = get_pending_submissions() |
| if not pending: |
| return "<p>No pending submissions.</p>" |
| rows = [] |
| for s in pending: |
| mcp = "custom" if s.get("custom_mcp_url") else "reference" |
| key_state = "set" if s.get("api_key") else "scrubbed" |
| rows.append( |
| f'<tr><td><code>{s["submission_id"]}</code></td>' |
| f'<td>{s["agent_name"]}</td>' |
| f'<td>{s["organization"]}</td>' |
| f'<td>{s.get("provider","?")}/{s.get("model_name","?")}</td>' |
| f'<td>{mcp}</td>' |
| f'<td>{key_state}</td>' |
| f'<td>{s.get("created_at","")[:10]}</td></tr>' |
| ) |
| return ( |
| '<table style="width:100%;font-size:0.85rem;' |
| 'border-collapse:collapse">' |
| "<tr><th>ID</th><th>Agent</th><th>Org</th>" |
| "<th>Provider/Model</th><th>MCP</th>" |
| "<th>Key</th><th>Date</th></tr>" |
| + "".join(rows) + "</table>" |
| ) |
| except Exception as e: |
| return f"<p>Error: {e}</p>" |
|
|
| refresh_btn.click( |
| _refresh_pending, [], pending_html, |
| ) |
|
|
| def _approve_sub(sid): |
| try: |
| from eval_queue import update_status |
| ok = update_status(sid.strip(), "approved") |
| if ok: |
| return ( |
| f'<div style="color:#38a169">' |
| f'Approved: {sid}</div>' |
| ) |
| return ( |
| f'<div style="color:#e53e3e">' |
| f'Failed to approve {sid}</div>' |
| ) |
| except Exception as e: |
| return f'<div style="color:#e53e3e">{e}</div>' |
|
|
| def _reject_sub(sid): |
| try: |
| from eval_queue import update_status |
| ok = update_status(sid.strip(), "rejected") |
| if ok: |
| return ( |
| f'<div style="color:#d69e2e">' |
| f'Rejected: {sid}</div>' |
| ) |
| return ( |
| f'<div style="color:#e53e3e">' |
| f'Failed to reject {sid}</div>' |
| ) |
| except Exception as e: |
| return f'<div style="color:#e53e3e">{e}</div>' |
|
|
| approve_btn.click( |
| _approve_sub, [approve_id], approve_msg, |
| ) |
| reject_btn.click( |
| _reject_sub, [approve_id], approve_msg, |
| ) |
|
|
| def _run_dispatch(sid): |
| try: |
| from eval_queue import get_submission |
| from eval_dispatcher import dispatch_all_tasks |
|
|
| sub = get_submission(sid.strip()) |
| if sub is None: |
| return ('<div style="color:#e53e3e">' |
| 'Not found</div>') |
| if sub["status"] not in ("approved", "dispatching"): |
| return ( |
| f'<div style="color:#e53e3e">' |
| f'Cannot dispatch: status=' |
| f'{sub["status"]}</div>' |
| ) |
| if not sub.get("api_key"): |
| return ( |
| '<div style="color:#e53e3e">' |
| 'API key already scrubbed -- this ' |
| 'submission has already been dispatched. ' |
| 'Resubmit if you need to re-run.</div>' |
| ) |
| results = dispatch_all_tasks(sid.strip()) |
| ok = sum(1 for r in results if r.get("success")) |
| return ( |
| f'<div style="color:#38a169">' |
| f'Dispatched: {ok}/{len(results)} tasks ' |
| f'succeeded. API key scrubbed.</div>' |
| ) |
| except Exception as e: |
| import traceback |
| return ( |
| f'<div style="color:#e53e3e">' |
| f'<strong>Dispatch error:</strong> {e}<br>' |
| f'<pre style="font-size:0.7rem">' |
| f'{traceback.format_exc()[:600]}</pre></div>' |
| ) |
|
|
| def _run_boltz(sid): |
| try: |
| from eval_queue import get_submission |
| from eval_boltz import run_boltz_posteval |
|
|
| sub = get_submission(sid.strip()) |
| if sub is None: |
| return ( |
| '<div style="color:#e53e3e">' |
| 'Not found</div>' |
| ) |
| per_task = json.loads( |
| sub.get("per_task_results", "{}") |
| ) |
| if not per_task: |
| return ( |
| '<div style="color:#e53e3e">' |
| "No task results to process.</div>" |
| ) |
| run_boltz_posteval(per_task) |
| from eval_queue import save_task_result |
| for tid, tres in per_task.items(): |
| save_task_result(sid.strip(), tid, tres) |
| return ( |
| '<div style="color:#38a169">' |
| "Boltz post-assessment complete.</div>" |
| ) |
| except Exception as e: |
| return f'<div style="color:#e53e3e">{e}</div>' |
|
|
| def _run_judge(sid): |
| try: |
| import eval_judge as ej |
| from eval_queue import ( |
| get_submission, save_task_result, update_status, |
| ) |
|
|
| sub = get_submission(sid.strip()) |
| if sub is None: |
| return ('<div style="color:#e53e3e">' |
| 'Not found</div>') |
| per_task = json.loads( |
| sub.get("per_task_results", "{}") |
| ) |
| if not per_task: |
| return ('<div style="color:#e53e3e">' |
| "No task results to process.</div>") |
|
|
| update_status(sid.strip(), "scoring") |
| ej.run_judge_panel( |
| per_task, |
| agent_id=sub.get("agent_name", "unknown"), |
| dry_run=False, |
| ) |
| for tid, tres in per_task.items(): |
| save_task_result(sid.strip(), tid, tres) |
|
|
| n_done = sum( |
| 1 for r in per_task.values() |
| if r.get("hybrid_total") is not None |
| ) |
| return ( |
| f'<div style="color:#38a169">' |
| f"LLM judge complete on {n_done} tasks." |
| "</div>" |
| ) |
| except Exception as e: |
| import traceback |
| return ( |
| f'<div style="color:#e53e3e">' |
| f'<strong>Judge error:</strong> {e}<br>' |
| f'<pre style="font-size:0.7rem">' |
| f'{traceback.format_exc()[:600]}</pre></div>' |
| ) |
|
|
| def _run_finalize(sid): |
| try: |
| from eval_queue import ( |
| finalize_submission, |
| get_submission, |
| ) |
| from eval_scorer import aggregate_scores |
|
|
| sub = get_submission(sid.strip()) |
| if sub is None: |
| return ( |
| '<div style="color:#e53e3e">' |
| 'Not found</div>' |
| ) |
| per_task = json.loads( |
| sub.get("per_task_results", "{}") |
| ) |
| agg = aggregate_scores(per_task) |
| finalize_submission( |
| sid.strip(), |
| overall_score=agg["overall_score"], |
| component_scores=agg["component_scores"], |
| taxonomy_scores=agg["taxonomy_scores"], |
| ) |
| mode_label = agg.get("scoring_mode", "algo") |
| return ( |
| f'<div style="color:#38a169">' |
| f'Finalized! Score: ' |
| f'{agg["overall_score"]:.1f} ' |
| f'(scoring={mode_label})</div>' |
| ) |
| except Exception as e: |
| return f'<div style="color:#e53e3e">{e}</div>' |
|
|
| dispatch_btn.click( |
| _run_dispatch, [dispatch_id], pipeline_out, |
| ) |
| boltz_btn.click( |
| _run_boltz, [boltz_id], pipeline_out, |
| ) |
| judge_btn.click( |
| _run_judge, [judge_id], pipeline_out, |
| ) |
| final_btn.click( |
| _run_finalize, [final_id], pipeline_out, |
| ) |
|
|
| |
| with gr.Tab("\u2139\ufe0f About"): |
| gr.HTML(build_about()) |
|
|
| return app |
|
|
|
|
| |
| |
| |
|
|
| if __name__ == "__main__": |
| create_app().launch() |
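    # Local development: `python app.py` serves on http://127.0.0.1:7860 by
    # default; HuggingFace Spaces invokes this same entry point automatically.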
|
|