Jasonkim8652 committed
Commit dcf17b1 · 1 Parent(s): 73e1720

align jargon with paper; set organization to RomeroLab


- low-diversity -> low-variety (paper Section 2.5 terminology;
avoids collision with rubric "Diversity" component)
- Benchmark/User mode -> Unguided/Guided (paper-standard naming)
- remove "Rescue Index" (not defined in paper); replace with
"coverage gap / evaluation-depth gap" phrasing
- "utilisation depth" -> "evaluation depth"
- "surface competence" -> "evaluation-depth gap"
- "hand-engineered" -> "deterministic" hardcoded pipeline
- i_pAE -> ipAE; "plan -> sample -> evaluate" -> "plan -> call -> evaluate -> iterate"
- baseline scores aligned to paper text (54.5 / 61.7 / 75.2)
- correlation values aligned to paper (rho = 0.68, p < 10^-115)
- set organization=RomeroLab for all 11 entries (model name
already disambiguates the agent; field now reflects who ran it)
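
A minimal sketch of the data-file side of these renames, for anyone mirroring the migration against a local copy of leaderboard_data.json. The top-level key holding the agent entries is not visible in the diff below, so "entries" is an assumption; agent IDs such as deepseek-v3-low-diversity are deliberately left untouched, matching the diff.

```python
import json

# Sketch of the terminology migration described above; the top-level
# "entries" key is an assumption -- the diff does not show it.
MODE_MAP = {"benchmark": "unguided", "user": "guided"}
COND_MAP = {"low_diversity_control": "low_variety_control"}

with open("leaderboard_data.json") as f:
    data = json.load(f)

for entry in data["entries"]:  # hypothetical key name
    if entry.get("mode") in MODE_MAP:
        entry["mode"] = MODE_MAP[entry["mode"]]
    entry["organization"] = "RomeroLab"  # records who ran the agent

for row in data["interventions"]["rows"]:
    row["condition"] = COND_MAP.get(row["condition"], row["condition"])

with open("leaderboard_data.json", "w") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
```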

Files changed (3)
  1. README.md +1 -1
  2. app.py +61 -61
  3. leaderboard_data.json +36 -36
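
After applying the commit, a quick sanity check (same "entries" assumption as the sketch above) can confirm the invariants the message claims:

```python
import json

with open("leaderboard_data.json") as f:
    data = json.load(f)

# All 11 entries attributed to the lab that ran them.
assert all(e["organization"] == "RomeroLab" for e in data["entries"])
# Only the paper-standard mode names (baselines carry mode=None).
assert {e["mode"] for e in data["entries"]} <= {None, "unguided", "guided"}
# No stale condition names in the intervention rows.
assert all(r["condition"] != "low_diversity_control"
           for r in data["interventions"]["rows"])
```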
README.md CHANGED
@@ -36,7 +36,7 @@ Novelty, and Diversity. See the *About* tab for the full methodology and the
36
  - **Taxonomy Heatmap** — Per-cell scores across the 9 occupied cells of the 2 × 5 design matrix
37
  - **Component Analysis** — Radar and bar charts comparing the 6 scoring components
38
  - **Guidance Effect** — Paired comparison of the same LLM in unguided (atomic tools) vs guided (composite workflows) mode
39
- - **Depth Gap** — Forced-depth and low-diversity intervention results
40
  - **About** — Methodology, submission guide, and citation info
41
 
42
  ## Bringing your own MCP tools
 
36
  - **Taxonomy Heatmap** — Per-cell scores across the 9 occupied cells of the 2 × 5 design matrix
37
  - **Component Analysis** — Radar and bar charts comparing the 6 scoring components
38
  - **Guidance Effect** — Paired comparison of the same LLM in unguided (atomic tools) vs guided (composite workflows) mode
39
+ - **Depth Gap** — Forced-depth and low-variety intervention results
40
  - **About** — Methodology, submission guide, and citation info
41
 
42
  ## Bringing your own MCP tools
app.py CHANGED
@@ -209,9 +209,9 @@ def build_header(last_updated: str, n_entries: int) -> str:
209
  <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
210
  font-weight:400;font-style:italic;max-width:680px;
211
  margin-left:auto;margin-right:auto;line-height:1.5">
212
- Top-tier agents now surpass a deterministic pipeline &mdash;
213
- but invoke evaluation tools at only <strong>14% of expert depth</strong>.
214
- Guidance rescues coverage, not depth.</p>
215
  <div style="margin-top:1rem;display:flex;justify-content:center;
216
  gap:0.6rem;flex-wrap:wrap">
217
  <a href="{PAPER_URL}" target="_blank"
@@ -360,17 +360,17 @@ def build_leaderboard_table(
360
  # ── Mode badge ──
361
  if is_bl:
362
  mode = f'<td style="{TD};color:#718096">\u2014</td>'
363
- elif e.get("mode") == "benchmark":
364
  mode = (
365
  f'<td style="{TD}"><span style="background:#fed7d7;'
366
  "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
367
- 'font-size:0.75rem;font-weight:600">benchmark</span></td>'
368
  )
369
  else:
370
  mode = (
371
  f'<td style="{TD}"><span style="background:#c6f6d5;'
372
  "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
373
- 'font-size:0.75rem;font-weight:600">user</span></td>'
374
  )
375
 
376
  # ── MCP ──
@@ -503,7 +503,7 @@ def build_heatmap(entry: dict) -> str:
503
 
504
 
505
  def build_mode_cards(entries: list) -> str:
506
- """Per-LLM cards showing benchmark vs user delta."""
507
  by_name: dict[str, dict] = {}
508
  for e in entries:
509
  if e["submission_type"] != "llm":
@@ -512,14 +512,14 @@ def build_mode_cards(entries: list) -> str:
512
 
513
  ordered = sorted(
514
  by_name.items(),
515
- key=lambda x: x[1].get("user", {}).get("overall_score", 0),
516
  reverse=True,
517
  )
518
 
519
  cards = []
520
  for name, modes in ordered:
521
- bench = modes.get("benchmark")
522
- user = modes.get("user")
523
  if not bench or not user:
524
  continue
525
  delta = user["overall_score"] - bench["overall_score"]
@@ -528,12 +528,12 @@ def build_mode_cards(entries: list) -> str:
528
  lines = [
529
  '<div style="display:flex;justify-content:space-between;'
530
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
531
- "<span>Benchmark</span>"
532
  f'<span style="font-weight:700;color:#e53e3e">'
533
  f'{bench["overall_score"]:.1f}</span></div>',
534
  '<div style="display:flex;justify-content:space-between;'
535
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
536
- "<span>User</span>"
537
  f'<span style="font-weight:700;color:#d69e2e">'
538
  f'{user["overall_score"]:.1f}</span></div>',
539
  '<div style="display:flex;justify-content:space-between;'
@@ -603,12 +603,13 @@ def build_headline_findings(findings: list) -> str:
603
 
604
 
605
  def build_intervention_section(interventions: dict) -> str:
606
- """Show forced-depth and low-diversity intervention results.
607
 
608
- The forced-depth condition mandates ≥3 evaluation passes per design
609
- candidate; the low-diversity control constrains the candidate pool
610
- without forcing depth. Together they isolate evaluation depth as the
611
- causal driver of the 'surface competence' gap reported in the paper.
 
612
  """
613
  if not interventions or not interventions.get("rows"):
614
  return '<p style="color:#718096">No intervention data available.</p>'
@@ -618,7 +619,7 @@ def build_intervention_section(interventions: dict) -> str:
618
  cond_meta = {
619
  "baseline": ("#64748b", "Baseline"),
620
  "forced_depth": ("#38a169", "Forced Depth"),
621
- "low_diversity_control": ("#d69e2e", "Low-Diversity Control"),
622
  }
623
 
624
  TH = (
@@ -683,14 +684,14 @@ def build_intervention_section(interventions: dict) -> str:
683
  <span style="color:#52340d">
684
  Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
685
  <strong>GPT-5 by +15.9</strong> points without any change to
686
- the underlying model or tools, while the low-diversity control
687
  <em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
688
  cleanest on the strongest agent, where it provides direct
689
  causal evidence that
690
- <strong>evaluation depth &mdash; not the mere act of process
691
- intervention &mdash; drives the gain</strong>. GPT-5's
692
- response is more uniform across both interventions; we
693
- report the raw deltas without smoothing.
694
  </span>
695
  </div>
696
 
@@ -755,10 +756,9 @@ def build_about() -> str:
755
  repeated sampling from generative tools (RFdiffusion,
756
  ProteinMPNN) and iterative cross-validation through several
757
  biophysical metrics. We test the full agentic loop &mdash;
758
- <strong>plan &rarr; sample &rarr; evaluate across multiple
759
- metrics &rarr; iterate</strong> &mdash; over 76 expert-curated
760
- tasks drawn from 2024&ndash;2026 literature, exposed through
761
- 17 MCP-integrated tools.
762
  </p>
763
  <div style="display:grid;grid-template-columns:
764
  repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
@@ -789,32 +789,32 @@ def build_about() -> str:
789
 
790
  <div {card}>
791
  <h2 {h2}>Three principal findings</h2>
792
- <h3 {h3}>1. Top-tier agents now beat a deterministic pipeline</h3>
793
  <p {p}>
794
- DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded
795
- pipeline (54.2) under both modes. Autonomous protein-design
796
  orchestration is no longer infeasible &mdash; but a substantial
797
- gap to the human expert (61.3) and oracle (74.9) remains.
798
  </p>
799
  <h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
800
  <p {p}>
801
- Workflow guidance closes the <em>coverage</em> gap (Rescue
802
- Index up to +3.01) but leaves <em>utilisation depth</em>
803
- unchanged (Rescue Index \u2248 0). Better tool documentation
804
- can teach agents <em>which</em> tools to call, but cannot
805
- teach them to call those tools with the iterative depth that
806
- expert practice demands.
807
  </p>
808
- <h3 {h3}>3. Evaluation depth, not tool knowledge, is the bottleneck</h3>
809
  <p {p}>
810
- Across 836 task&ndash;condition observations, evaluation depth
811
- per candidate correlates with total score at
812
- <strong>&rho; = 0.685</strong>
813
- (<em>p</em> &lt; 10<sup>-117</sup>). LLM agents generate
814
- backbone candidates at expert-level rates but evaluate each
815
- one at only <strong>14% of expert depth</strong>. Forced-depth
816
- interventions confirm this is causal &mdash; see the
817
- <em>Depth Gap</em> tab.
 
818
  </p>
819
  </div>
820
 
@@ -904,8 +904,8 @@ def build_about() -> str:
904
  ordering, intermediate validation, and adaptive iteration.</p>
905
  <p {p}>
906
  <strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
907
- Continuous 4-band interpolation over Boltz-2 re-prediction
908
- metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement
909
  variance on biophysical quantities.</p>
910
  <p {p}>
911
  <strong>Feasibility (15 pts)</strong> &mdash; valid amino
@@ -1102,7 +1102,7 @@ def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
1102
 
1103
 
1104
  def chart_mode_comparison(entries: list) -> go.Figure:
1105
- """Grouped bar chart: benchmark vs user mode for each LLM."""
1106
  by_name: dict[str, dict[str, float]] = {}
1107
  for e in entries:
1108
  if e["submission_type"] != "llm":
@@ -1111,23 +1111,23 @@ def chart_mode_comparison(entries: list) -> go.Figure:
1111
 
1112
  ordered = sorted(
1113
  by_name.items(),
1114
- key=lambda x: x[1].get("user", 0),
1115
  reverse=True,
1116
  )
1117
  names = [n for n, _ in ordered]
1118
- bench = [m.get("benchmark", 0) for _, m in ordered]
1119
- user = [m.get("user", 0) for _, m in ordered]
1120
 
1121
  fig = go.Figure()
1122
  fig.add_trace(
1123
  go.Bar(
1124
- x=names, y=bench, name="Benchmark Mode",
1125
  marker_color="rgba(229,62,62,0.6)",
1126
  )
1127
  )
1128
  fig.add_trace(
1129
  go.Bar(
1130
- x=names, y=user, name="User Mode",
1131
  marker_color="rgba(56,161,105,0.6)",
1132
  )
1133
  )
@@ -1137,7 +1137,7 @@ def chart_mode_comparison(entries: list) -> go.Figure:
1137
  yaxis=dict(range=[0, 80], title="Overall hybrid score"),
1138
  xaxis=dict(title=""),
1139
  title=dict(
1140
- text=("Unguided (Benchmark) vs Guided (User) modes \u2014 "
1141
  "guidance lifts coverage but rarely shifts overall score"),
1142
  font_size=13,
1143
  ),
@@ -1189,7 +1189,7 @@ def create_app() -> gr.Blocks:
1189
  with gr.Tab("\U0001f4ca Overall"):
1190
  with gr.Row():
1191
  f_mode = gr.Dropdown(
1192
- ["All", "Benchmark", "User"],
1193
  value="All", label="Mode", scale=1,
1194
  )
1195
  f_mcp = gr.Dropdown(
@@ -1274,13 +1274,13 @@ def create_app() -> gr.Blocks:
1274
  'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
1275
  'line-height:1.55">'
1276
  '<strong>Mode semantics:</strong> '
1277
- '<em>Benchmark mode</em> exposes atomic tools without '
1278
- 'pipeline hints (unguided); <em>User mode</em> packages '
1279
  'them into composite workflows with explicit pipeline '
1280
- 'structure (guided). Guidance lifts the lowest-tier '
1281
  'agents but does not consistently help capable ones, '
1282
- 'and never closes the depth gap (see <em>Depth Gap</em> '
1283
- 'tab).</div>'
1284
  )
1285
  gr.Plot(chart_mode_comparison(entries))
1286
  gr.HTML(build_mode_cards(entries))
 
209
  <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
210
  font-weight:400;font-style:italic;max-width:680px;
211
  margin-left:auto;margin-right:auto;line-height:1.5">
212
+ Top-tier agents now surpass a deterministic hardcoded pipeline &mdash;
213
+ but invoke evaluation tools at only <strong>14% of expert intensity</strong>.
214
+ Guidance closes the coverage gap, not the evaluation-depth gap.</p>
215
  <div style="margin-top:1rem;display:flex;justify-content:center;
216
  gap:0.6rem;flex-wrap:wrap">
217
  <a href="{PAPER_URL}" target="_blank"
 
360
  # ── Mode badge ──
361
  if is_bl:
362
  mode = f'<td style="{TD};color:#718096">\u2014</td>'
363
+ elif e.get("mode") == "unguided":
364
  mode = (
365
  f'<td style="{TD}"><span style="background:#fed7d7;'
366
  "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
367
+ 'font-size:0.75rem;font-weight:600">unguided</span></td>'
368
  )
369
  else:
370
  mode = (
371
  f'<td style="{TD}"><span style="background:#c6f6d5;'
372
  "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
373
+ 'font-size:0.75rem;font-weight:600">guided</span></td>'
374
  )
375
 
376
  # ── MCP ──
 
503
 
504
 
505
  def build_mode_cards(entries: list) -> str:
506
+ """Per-LLM cards showing unguided vs guided delta."""
507
  by_name: dict[str, dict] = {}
508
  for e in entries:
509
  if e["submission_type"] != "llm":
 
512
 
513
  ordered = sorted(
514
  by_name.items(),
515
+ key=lambda x: x[1].get("guided", {}).get("overall_score", 0),
516
  reverse=True,
517
  )
518
 
519
  cards = []
520
  for name, modes in ordered:
521
+ bench = modes.get("unguided")
522
+ user = modes.get("guided")
523
  if not bench or not user:
524
  continue
525
  delta = user["overall_score"] - bench["overall_score"]
 
528
  lines = [
529
  '<div style="display:flex;justify-content:space-between;'
530
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
531
+ "<span>Unguided</span>"
532
  f'<span style="font-weight:700;color:#e53e3e">'
533
  f'{bench["overall_score"]:.1f}</span></div>',
534
  '<div style="display:flex;justify-content:space-between;'
535
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
536
+ "<span>Guided</span>"
537
  f'<span style="font-weight:700;color:#d69e2e">'
538
  f'{user["overall_score"]:.1f}</span></div>',
539
  '<div style="display:flex;justify-content:space-between;'
 
603
 
604
 
605
  def build_intervention_section(interventions: dict) -> str:
606
+ """Show forced-depth and low-variety intervention results.
607
 
608
+ The forced-depth condition mandates ≥3 evaluation metric categories
609
+ per design candidate; the low-variety control performs comparable
610
+ compute with a narrow range of evaluation metrics. Together they
611
+ isolate evaluation depth as the causal driver of the evaluation-depth
612
+ gap reported in the paper.
613
  """
614
  if not interventions or not interventions.get("rows"):
615
  return '<p style="color:#718096">No intervention data available.</p>'
 
619
  cond_meta = {
620
  "baseline": ("#64748b", "Baseline"),
621
  "forced_depth": ("#38a169", "Forced Depth"),
622
+ "low_variety_control": ("#d69e2e", "Low-Variety Control"),
623
  }
624
 
625
  TH = (
 
684
  <span style="color:#52340d">
685
  Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
686
  <strong>GPT-5 by +15.9</strong> points without any change to
687
+ the underlying model or tools, while the low-variety control
688
  <em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
689
  cleanest on the strongest agent, where it provides direct
690
  causal evidence that
691
+ <strong>evaluation variety &mdash; not raw compute &mdash;
692
+ drives the gain</strong>. GPT-5's response is more uniform
693
+ across both interventions; we report the raw deltas without
694
+ smoothing.
695
  </span>
696
  </div>
697
 
 
756
  repeated sampling from generative tools (RFdiffusion,
757
  ProteinMPNN) and iterative cross-validation through several
758
  biophysical metrics. We test the full agentic loop &mdash;
759
+ <strong>plan &rarr; call &rarr; evaluate &rarr; iterate</strong>
760
+ &mdash; over 76 expert-curated tasks drawn from 2024&ndash;2026
761
+ literature, exposed through 17 MCP-integrated tools.
 
762
  </p>
763
  <div style="display:grid;grid-template-columns:
764
  repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
 
789
 
790
  <div {card}>
791
  <h2 {h2}>Three principal findings</h2>
792
+ <h3 {h3}>1. Top-tier agents now beat the hardcoded pipeline</h3>
793
  <p {p}>
794
+ DeepSeek V3 and GPT-5 surpass the deterministic hardcoded
795
+ pipeline (54.5) under both modes. Autonomous protein-design
796
  orchestration is no longer infeasible &mdash; but a substantial
797
+ gap to the human expert (61.7) and oracle (75.2) remains.
798
  </p>
799
  <h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
800
  <p {p}>
801
+ Workflow guidance closes the <em>coverage</em> gap, bringing
802
+ agent tool selection closer to the human expert, but leaves
803
+ <em>evaluation depth</em> unchanged. Better tool documentation
804
+ can teach agents <em>which</em> tools to call, but not how
805
+ thoroughly to use them on each generated candidate.
 
806
  </p>
807
+ <h3 {h3}>3. Evaluation variety, not tool knowledge, is the bottleneck</h3>
808
  <p {p}>
809
+ Across 836 task&ndash;condition observations, the number of
810
+ distinct evaluation metric categories per candidate correlates
811
+ with total score at
812
+ <strong>&rho; = 0.68</strong>
813
+ (<em>p</em> &lt; 10<sup>-115</sup>). LLM agents generate
814
+ backbone candidates at expert-level rates but invoke scoring
815
+ tools at only <strong>~14% of expert intensity</strong>.
816
+ Forced-depth interventions confirm this is causal &mdash; see
817
+ the <em>Depth Gap</em> tab.
818
  </p>
819
  </div>
820
 
 
904
  ordering, intermediate validation, and adaptive iteration.</p>
905
  <p {p}>
906
  <strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
907
+ Continuous four-band interpolation over Boltz-2 re-prediction
908
+ metrics (pLDDT, pTM, ipTM, ipAE), eliminating LLM judgement
909
  variance on biophysical quantities.</p>
910
  <p {p}>
911
  <strong>Feasibility (15 pts)</strong> &mdash; valid amino
 
1102
 
1103
 
1104
  def chart_mode_comparison(entries: list) -> go.Figure:
1105
+ """Grouped bar chart: unguided vs guided mode for each LLM."""
1106
  by_name: dict[str, dict[str, float]] = {}
1107
  for e in entries:
1108
  if e["submission_type"] != "llm":
 
1111
 
1112
  ordered = sorted(
1113
  by_name.items(),
1114
+ key=lambda x: x[1].get("guided", 0),
1115
  reverse=True,
1116
  )
1117
  names = [n for n, _ in ordered]
1118
+ bench = [m.get("unguided", 0) for _, m in ordered]
1119
+ user = [m.get("guided", 0) for _, m in ordered]
1120
 
1121
  fig = go.Figure()
1122
  fig.add_trace(
1123
  go.Bar(
1124
+ x=names, y=bench, name="Unguided",
1125
  marker_color="rgba(229,62,62,0.6)",
1126
  )
1127
  )
1128
  fig.add_trace(
1129
  go.Bar(
1130
+ x=names, y=user, name="Guided",
1131
  marker_color="rgba(56,161,105,0.6)",
1132
  )
1133
  )
 
1137
  yaxis=dict(range=[0, 80], title="Overall hybrid score"),
1138
  xaxis=dict(title=""),
1139
  title=dict(
1140
+ text=("Unguided vs Guided modes \u2014 "
1141
  "guidance lifts coverage but rarely shifts overall score"),
1142
  font_size=13,
1143
  ),
 
1189
  with gr.Tab("\U0001f4ca Overall"):
1190
  with gr.Row():
1191
  f_mode = gr.Dropdown(
1192
+ ["All", "Unguided", "Guided"],
1193
  value="All", label="Mode", scale=1,
1194
  )
1195
  f_mcp = gr.Dropdown(
 
1274
  'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
1275
  'line-height:1.55">'
1276
  '<strong>Mode semantics:</strong> '
1277
+ '<em>Unguided mode</em> exposes atomic tools without '
1278
+ 'pipeline hints; <em>guided mode</em> packages '
1279
  'them into composite workflows with explicit pipeline '
1280
+ 'structure. Guidance lifts the lowest-tier '
1281
  'agents but does not consistently help capable ones, '
1282
+ 'and never closes the evaluation-depth gap (see '
1283
+ '<em>Depth Gap</em> tab).</div>'
1284
  )
1285
  gr.Plot(chart_mode_comparison(entries))
1286
  gr.HTML(build_mode_cards(entries))
leaderboard_data.json CHANGED
@@ -2,11 +2,11 @@
2
  "last_updated": "2026-04-14",
3
  "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
4
  "headline_findings": [
5
- "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass a deterministic hardcoded pipeline.",
6
- "All agents show a critical evaluation depth gap \u2014 they invoke evaluation tools at only 14% of expert frequency.",
7
- "Workflow guidance rescues tool coverage (Rescue Index up to +3.01) but not utilisation depth (Rescue Index \u2248 0).",
8
- "Evaluation depth predicts design quality (\u03c1 = 0.685, p < 10\u207b\u00b9\u00b9\u2077) beyond binary tool selection.",
9
- "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-diversity control hurts it (-2.3) \u2014 evidence that depth, not process change alone, drives the gain."
10
  ],
11
  "scoring": {
12
  "rubric_max": 100,
@@ -26,7 +26,7 @@
26
  "agent_id": "oracle",
27
  "mode": null,
28
  "submission_type": "human_oracle",
29
- "organization": "Romero Lab",
30
  "mcp_custom": false,
31
  "overall_score": 74.85,
32
  "component_scores": {
@@ -63,7 +63,7 @@
63
  "agent_id": "human-expert",
64
  "mode": null,
65
  "submission_type": "human_expert",
66
- "organization": "Romero Lab",
67
  "mcp_custom": false,
68
  "overall_score": 61.25,
69
  "component_scores": {
@@ -98,9 +98,9 @@
98
  {
99
  "agent_name": "DeepSeek V3",
100
  "agent_id": "deepseek-v3-benchmark",
101
- "mode": "benchmark",
102
  "submission_type": "llm",
103
- "organization": "DeepSeek",
104
  "mcp_custom": false,
105
  "overall_score": 60.43,
106
  "component_scores": {
@@ -135,9 +135,9 @@
135
  {
136
  "agent_name": "DeepSeek V3",
137
  "agent_id": "deepseek-v3-user",
138
- "mode": "user",
139
  "submission_type": "llm",
140
- "organization": "DeepSeek",
141
  "mcp_custom": false,
142
  "overall_score": 58.46,
143
  "component_scores": {
@@ -172,9 +172,9 @@
172
  {
173
  "agent_name": "GPT-5",
174
  "agent_id": "gpt5-benchmark",
175
- "mode": "benchmark",
176
  "submission_type": "llm",
177
- "organization": "OpenAI",
178
  "mcp_custom": false,
179
  "overall_score": 55.61,
180
  "component_scores": {
@@ -209,9 +209,9 @@
209
  {
210
  "agent_name": "GPT-5",
211
  "agent_id": "gpt5-user",
212
- "mode": "user",
213
  "submission_type": "llm",
214
- "organization": "OpenAI",
215
  "mcp_custom": false,
216
  "overall_score": 55.26,
217
  "component_scores": {
@@ -248,7 +248,7 @@
248
  "agent_id": "hardcoded-pipeline",
249
  "mode": null,
250
  "submission_type": "hardcoded",
251
- "organization": "Deterministic",
252
  "mcp_custom": false,
253
  "overall_score": 54.2,
254
  "component_scores": {
@@ -283,9 +283,9 @@
283
  {
284
  "agent_name": "Claude Sonnet 4.5",
285
  "agent_id": "sonnet-4.5-user",
286
- "mode": "user",
287
  "submission_type": "llm",
288
- "organization": "Anthropic",
289
  "mcp_custom": false,
290
  "overall_score": 50.23,
291
  "component_scores": {
@@ -320,9 +320,9 @@
320
  {
321
  "agent_name": "Claude Sonnet 4.5",
322
  "agent_id": "sonnet-4.5-benchmark",
323
- "mode": "benchmark",
324
  "submission_type": "llm",
325
- "organization": "Anthropic",
326
  "mcp_custom": false,
327
  "overall_score": 41.17,
328
  "component_scores": {
@@ -357,9 +357,9 @@
357
  {
358
  "agent_name": "Gemini 2.5 Pro",
359
  "agent_id": "gemini-2.5-pro-user",
360
- "mode": "user",
361
  "submission_type": "llm",
362
- "organization": "Google",
363
  "mcp_custom": false,
364
  "overall_score": 8.75,
365
  "component_scores": {
@@ -394,9 +394,9 @@
394
  {
395
  "agent_name": "Gemini 2.5 Pro",
396
  "agent_id": "gemini-2.5-pro-benchmark",
397
- "mode": "benchmark",
398
  "submission_type": "llm",
399
- "organization": "Google",
400
  "mcp_custom": false,
401
  "overall_score": 8.11,
402
  "component_scores": {
@@ -430,11 +430,11 @@
430
  }
431
  ],
432
  "interventions": {
433
- "description": "Causal intervention experiments on the depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate \u22653 evaluation passes per candidate), and low_diversity_control (constrain candidate count without forcing depth).",
434
  "n_tasks": 18,
435
  "rows": [
436
  {
437
- "label": "DeepSeek V3 \u2014 baseline",
438
  "condition": "baseline",
439
  "agent": "deepseek-v3-tools-benchmark",
440
  "n_tasks": 18,
@@ -446,7 +446,7 @@
446
  "diversity": 3.56
447
  },
448
  {
449
- "label": "GPT-5 \u2014 baseline",
450
  "condition": "baseline",
451
  "agent": "gpt5-tools-benchmark",
452
  "n_tasks": 18,
@@ -458,7 +458,7 @@
458
  "diversity": 3.94
459
  },
460
  {
461
- "label": "Human Expert \u2014 baseline",
462
  "condition": "baseline",
463
  "agent": "human-expert-agent",
464
  "n_tasks": 18,
@@ -470,7 +470,7 @@
470
  "diversity": 2.28
471
  },
472
  {
473
- "label": "DeepSeek V3 \u2014 forced depth",
474
  "condition": "forced_depth",
475
  "agent": "deepseek-v3-forced-depth",
476
  "n_tasks": 18,
@@ -482,7 +482,7 @@
482
  "diversity": 3.94
483
  },
484
  {
485
- "label": "GPT-5 \u2014 forced depth",
486
  "condition": "forced_depth",
487
  "agent": "gpt5-tools-forced-depth",
488
  "n_tasks": 18,
@@ -494,8 +494,8 @@
494
  "diversity": 3.06
495
  },
496
  {
497
- "label": "DeepSeek V3 \u2014 low diversity",
498
- "condition": "low_diversity_control",
499
  "agent": "deepseek-v3-low-diversity",
500
  "n_tasks": 18,
501
  "score": 56.39,
@@ -506,8 +506,8 @@
506
  "diversity": 3.22
507
  },
508
  {
509
- "label": "GPT-5 \u2014 low diversity",
510
- "condition": "low_diversity_control",
511
  "agent": "gpt5-tools-low-diversity",
512
  "n_tasks": 18,
513
  "score": 61.5,
@@ -518,8 +518,8 @@
518
  "diversity": 3.22
519
  },
520
  {
521
- "label": "Human Expert \u2014 shallow",
522
- "condition": "low_diversity_control",
523
  "agent": "human-expert-shallow",
524
  "n_tasks": 18,
525
  "score": 55.06,
 
2
  "last_updated": "2026-04-14",
3
  "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
4
  "headline_findings": [
5
+ "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass the deterministic hardcoded pipeline.",
6
+ "All agents show a critical evaluation-depth gap β€” they invoke evaluation tools at only ~14% of expert intensity.",
7
+ "Workflow guidance closes the coverage gap but leaves the evaluation-depth gap unchanged.",
8
+ "Evaluation variety (distinct metric categories per candidate) predicts design quality (ρ = 0.68, p < 10⁻¹¹⁡) beyond binary tool selection.",
9
+ "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a compute-matched low-variety control hurts it (-2.3) β€” evidence that variety, not raw compute, drives the gain."
10
  ],
11
  "scoring": {
12
  "rubric_max": 100,
 
26
  "agent_id": "oracle",
27
  "mode": null,
28
  "submission_type": "human_oracle",
29
+ "organization": "RomeroLab",
30
  "mcp_custom": false,
31
  "overall_score": 74.85,
32
  "component_scores": {
 
63
  "agent_id": "human-expert",
64
  "mode": null,
65
  "submission_type": "human_expert",
66
+ "organization": "RomeroLab",
67
  "mcp_custom": false,
68
  "overall_score": 61.25,
69
  "component_scores": {
 
98
  {
99
  "agent_name": "DeepSeek V3",
100
  "agent_id": "deepseek-v3-benchmark",
101
+ "mode": "unguided",
102
  "submission_type": "llm",
103
+ "organization": "RomeroLab",
104
  "mcp_custom": false,
105
  "overall_score": 60.43,
106
  "component_scores": {
 
135
  {
136
  "agent_name": "DeepSeek V3",
137
  "agent_id": "deepseek-v3-user",
138
+ "mode": "guided",
139
  "submission_type": "llm",
140
+ "organization": "RomeroLab",
141
  "mcp_custom": false,
142
  "overall_score": 58.46,
143
  "component_scores": {
 
172
  {
173
  "agent_name": "GPT-5",
174
  "agent_id": "gpt5-benchmark",
175
+ "mode": "unguided",
176
  "submission_type": "llm",
177
+ "organization": "RomeroLab",
178
  "mcp_custom": false,
179
  "overall_score": 55.61,
180
  "component_scores": {
 
209
  {
210
  "agent_name": "GPT-5",
211
  "agent_id": "gpt5-user",
212
+ "mode": "guided",
213
  "submission_type": "llm",
214
+ "organization": "RomeroLab",
215
  "mcp_custom": false,
216
  "overall_score": 55.26,
217
  "component_scores": {
 
248
  "agent_id": "hardcoded-pipeline",
249
  "mode": null,
250
  "submission_type": "hardcoded",
251
+ "organization": "RomeroLab",
252
  "mcp_custom": false,
253
  "overall_score": 54.2,
254
  "component_scores": {
 
283
  {
284
  "agent_name": "Claude Sonnet 4.5",
285
  "agent_id": "sonnet-4.5-user",
286
+ "mode": "guided",
287
  "submission_type": "llm",
288
+ "organization": "RomeroLab",
289
  "mcp_custom": false,
290
  "overall_score": 50.23,
291
  "component_scores": {
 
320
  {
321
  "agent_name": "Claude Sonnet 4.5",
322
  "agent_id": "sonnet-4.5-benchmark",
323
+ "mode": "unguided",
324
  "submission_type": "llm",
325
+ "organization": "RomeroLab",
326
  "mcp_custom": false,
327
  "overall_score": 41.17,
328
  "component_scores": {
 
357
  {
358
  "agent_name": "Gemini 2.5 Pro",
359
  "agent_id": "gemini-2.5-pro-user",
360
+ "mode": "guided",
361
  "submission_type": "llm",
362
+ "organization": "RomeroLab",
363
  "mcp_custom": false,
364
  "overall_score": 8.75,
365
  "component_scores": {
 
394
  {
395
  "agent_name": "Gemini 2.5 Pro",
396
  "agent_id": "gemini-2.5-pro-benchmark",
397
+ "mode": "unguided",
398
  "submission_type": "llm",
399
+ "organization": "RomeroLab",
400
  "mcp_custom": false,
401
  "overall_score": 8.11,
402
  "component_scores": {
 
430
  }
431
  ],
432
  "interventions": {
433
+ "description": "Causal intervention experiments on the evaluation-depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate β‰₯3 evaluation metric categories per candidate), and low_variety_control (compute-matched control restricted to a narrow range of evaluation metrics).",
434
  "n_tasks": 18,
435
  "rows": [
436
  {
437
+ "label": "DeepSeek V3 β€” baseline",
438
  "condition": "baseline",
439
  "agent": "deepseek-v3-tools-benchmark",
440
  "n_tasks": 18,
 
446
  "diversity": 3.56
447
  },
448
  {
449
+ "label": "GPT-5 β€” baseline",
450
  "condition": "baseline",
451
  "agent": "gpt5-tools-benchmark",
452
  "n_tasks": 18,
 
458
  "diversity": 3.94
459
  },
460
  {
461
+ "label": "Human Expert β€” baseline",
462
  "condition": "baseline",
463
  "agent": "human-expert-agent",
464
  "n_tasks": 18,
 
470
  "diversity": 2.28
471
  },
472
  {
473
+ "label": "DeepSeek V3 β€” forced depth",
474
  "condition": "forced_depth",
475
  "agent": "deepseek-v3-forced-depth",
476
  "n_tasks": 18,
 
482
  "diversity": 3.94
483
  },
484
  {
485
+ "label": "GPT-5 β€” forced depth",
486
  "condition": "forced_depth",
487
  "agent": "gpt5-tools-forced-depth",
488
  "n_tasks": 18,
 
494
  "diversity": 3.06
495
  },
496
  {
497
+ "label": "DeepSeek V3 β€” low variety",
498
+ "condition": "low_variety_control",
499
  "agent": "deepseek-v3-low-diversity",
500
  "n_tasks": 18,
501
  "score": 56.39,
 
506
  "diversity": 3.22
507
  },
508
  {
509
+ "label": "GPT-5 β€” low variety",
510
+ "condition": "low_variety_control",
511
  "agent": "gpt5-tools-low-diversity",
512
  "n_tasks": 18,
513
  "score": 61.5,
 
518
  "diversity": 3.22
519
  },
520
  {
521
+ "label": "Human Expert β€” shallow",
522
+ "condition": "low_variety_control",
523
  "agent": "human-expert-shallow",
524
  "n_tasks": 18,
525
  "score": 55.06,