Jasonkim8652 committed
Commit dcf17b1 · 1 Parent(s): 73e1720

align jargon with paper; set organization to RomeroLab


- low-diversity -> low-variety (paper Section 2.5 terminology;
avoids collision with rubric "Diversity" component)
- Benchmark/User mode -> Unguided/Guided (paper-standard naming)
- remove "Rescue Index" (not defined in paper); replace with
"coverage gap / evaluation-depth gap" phrasing
- "utilisation depth" -> "evaluation depth"
- "surface competence" -> "evaluation-depth gap"
- "hand-engineered" -> "deterministic" hardcoded pipeline
- i_pAE -> ipAE; "plan -> sample -> evaluate" -> "plan -> call -> evaluate -> iterate"
- baseline scores aligned to paper text (54.5 / 61.7 / 75.2)
- correlation values aligned to paper (rho = 0.68, p < 10^-115)
- set organization=RomeroLab for all 11 entries (model name
already disambiguates the agent; field now reflects who ran it)
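
A minimal sketch of the data-file side of these renames, for anyone mirroring the migration against a local copy of leaderboard_data.json. The top-level key holding the agent entries is not visible in the diff below, so "entries" is an assumption; agent IDs such as deepseek-v3-low-diversity are deliberately left untouched, matching the diff.

```python
import json

# Sketch of the terminology migration described above; the top-level
# "entries" key is an assumption -- the diff does not show it.
MODE_MAP = {"benchmark": "unguided", "user": "guided"}
COND_MAP = {"low_diversity_control": "low_variety_control"}

with open("leaderboard_data.json") as f:
    data = json.load(f)

for entry in data["entries"]:  # hypothetical key name
    if entry.get("mode") in MODE_MAP:
        entry["mode"] = MODE_MAP[entry["mode"]]
    entry["organization"] = "RomeroLab"  # records who ran the agent

for row in data["interventions"]["rows"]:
    row["condition"] = COND_MAP.get(row["condition"], row["condition"])

with open("leaderboard_data.json", "w") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
```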

Files changed (3)
  1. README.md +1 -1
  2. app.py +61 -61
  3. leaderboard_data.json +36 -36
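
After applying the commit, a quick sanity check (same "entries" assumption as the sketch above) can confirm the invariants the message claims:

```python
import json

with open("leaderboard_data.json") as f:
    data = json.load(f)

# All 11 entries attributed to the lab that ran them.
assert all(e["organization"] == "RomeroLab" for e in data["entries"])
# Only the paper-standard mode names (baselines carry mode=None).
assert {e["mode"] for e in data["entries"]} <= {None, "unguided", "guided"}
# No stale condition names in the intervention rows.
assert all(r["condition"] != "low_diversity_control"
           for r in data["interventions"]["rows"])
```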
README.md CHANGED
@@ -36,7 +36,7 @@ Novelty, and Diversity. See the *About* tab for the full methodology and the
36
  - **Taxonomy Heatmap** — Per-cell scores across the 9 occupied cells of the 2 × 5 design matrix
37
  - **Component Analysis** — Radar and bar charts comparing the 6 scoring components
38
  - **Guidance Effect** — Paired comparison of the same LLM in unguided (atomic tools) vs guided (composite workflows) mode
39
- - **Depth Gap** — Forced-depth and low-diversity intervention results
40
  - **About** — Methodology, submission guide, and citation info
41
 
42
  ## Bringing your own MCP tools
 
36
  - **Taxonomy Heatmap** — Per-cell scores across the 9 occupied cells of the 2 × 5 design matrix
37
  - **Component Analysis** — Radar and bar charts comparing the 6 scoring components
38
  - **Guidance Effect** — Paired comparison of the same LLM in unguided (atomic tools) vs guided (composite workflows) mode
39
+ - **Depth Gap** — Forced-depth and low-variety intervention results
40
  - **About** — Methodology, submission guide, and citation info
41
 
42
  ## Bringing your own MCP tools
app.py CHANGED
@@ -209,9 +209,9 @@ def build_header(last_updated: str, n_entries: int) -> str:
209
  <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
210
  font-weight:400;font-style:italic;max-width:680px;
211
  margin-left:auto;margin-right:auto;line-height:1.5">
212
- Top-tier agents now surpass a deterministic pipeline &mdash;
213
- but invoke evaluation tools at only <strong>14% of expert depth</strong>.
214
- Guidance rescues coverage, not depth.</p>
215
  <div style="margin-top:1rem;display:flex;justify-content:center;
216
  gap:0.6rem;flex-wrap:wrap">
217
  <a href="{PAPER_URL}" target="_blank"
@@ -360,17 +360,17 @@ def build_leaderboard_table(
360
  # ── Mode badge ──
361
  if is_bl:
362
  mode = f'<td style="{TD};color:#718096">\u2014</td>'
363
- elif e.get("mode") == "benchmark":
364
  mode = (
365
  f'<td style="{TD}"><span style="background:#fed7d7;'
366
  "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
367
- 'font-size:0.75rem;font-weight:600">benchmark</span></td>'
368
  )
369
  else:
370
  mode = (
371
  f'<td style="{TD}"><span style="background:#c6f6d5;'
372
  "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
373
- 'font-size:0.75rem;font-weight:600">user</span></td>'
374
  )
375
 
376
  # ── MCP ──
@@ -503,7 +503,7 @@ def build_heatmap(entry: dict) -> str:
503
 
504
 
505
  def build_mode_cards(entries: list) -> str:
506
- """Per-LLM cards showing benchmark vs user delta."""
507
  by_name: dict[str, dict] = {}
508
  for e in entries:
509
  if e["submission_type"] != "llm":
@@ -512,14 +512,14 @@ def build_mode_cards(entries: list) -> str:
512
 
513
  ordered = sorted(
514
  by_name.items(),
515
- key=lambda x: x[1].get("user", {}).get("overall_score", 0),
516
  reverse=True,
517
  )
518
 
519
  cards = []
520
  for name, modes in ordered:
521
- bench = modes.get("benchmark")
522
- user = modes.get("user")
523
  if not bench or not user:
524
  continue
525
  delta = user["overall_score"] - bench["overall_score"]
@@ -528,12 +528,12 @@ def build_mode_cards(entries: list) -> str:
528
  lines = [
529
  '<div style="display:flex;justify-content:space-between;'
530
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
531
- "<span>Benchmark</span>"
532
  f'<span style="font-weight:700;color:#e53e3e">'
533
  f'{bench["overall_score"]:.1f}</span></div>',
534
  '<div style="display:flex;justify-content:space-between;'
535
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
536
- "<span>User</span>"
537
  f'<span style="font-weight:700;color:#d69e2e">'
538
  f'{user["overall_score"]:.1f}</span></div>',
539
  '<div style="display:flex;justify-content:space-between;'
@@ -603,12 +603,13 @@ def build_headline_findings(findings: list) -> str:
603
 
604
 
605
  def build_intervention_section(interventions: dict) -> str:
606
- """Show forced-depth and low-diversity intervention results.
607
 
608
- The forced-depth condition mandates ≥3 evaluation passes per design
609
- candidate; the low-diversity control constrains the candidate pool
610
- without forcing depth. Together they isolate evaluation depth as the
611
- causal driver of the 'surface competence' gap reported in the paper.
 
612
  """
613
  if not interventions or not interventions.get("rows"):
614
  return '<p style="color:#718096">No intervention data available.</p>'
@@ -618,7 +619,7 @@ def build_intervention_section(interventions: dict) -> str:
618
  cond_meta = {
619
  "baseline": ("#64748b", "Baseline"),
620
  "forced_depth": ("#38a169", "Forced Depth"),
621
- "low_diversity_control": ("#d69e2e", "Low-Diversity Control"),
622
  }
623
 
624
  TH = (
@@ -683,14 +684,14 @@ def build_intervention_section(interventions: dict) -> str:
683
  <span style="color:#52340d">
684
  Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
685
  <strong>GPT-5 by +15.9</strong> points without any change to
686
- the underlying model or tools, while the low-diversity control
687
  <em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
688
  cleanest on the strongest agent, where it provides direct
689
  causal evidence that
690
- <strong>evaluation depth &mdash; not the mere act of process
691
- intervention &mdash; drives the gain</strong>. GPT-5's
692
- response is more uniform across both interventions; we
693
- report the raw deltas without smoothing.
694
  </span>
695
  </div>
696
 
@@ -755,10 +756,9 @@ def build_about() -> str:
755
  repeated sampling from generative tools (RFdiffusion,
756
  ProteinMPNN) and iterative cross-validation through several
757
  biophysical metrics. We test the full agentic loop &mdash;
758
- <strong>plan &rarr; sample &rarr; evaluate across multiple
759
- metrics &rarr; iterate</strong> &mdash; over 76 expert-curated
760
- tasks drawn from 2024&ndash;2026 literature, exposed through
761
- 17 MCP-integrated tools.
762
  </p>
763
  <div style="display:grid;grid-template-columns:
764
  repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
@@ -789,32 +789,32 @@ def build_about() -> str:
789
 
790
  <div {card}>
791
  <h2 {h2}>Three principal findings</h2>
792
- <h3 {h3}>1. Top-tier agents now beat a deterministic pipeline</h3>
793
  <p {p}>
794
- DeepSeek V3 and GPT-5 surpass a hand-engineered hardcoded
795
- pipeline (54.2) under both modes. Autonomous protein-design
796
  orchestration is no longer infeasible &mdash; but a substantial
797
- gap to the human expert (61.3) and oracle (74.9) remains.
798
  </p>
799
  <h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
800
  <p {p}>
801
- Workflow guidance closes the <em>coverage</em> gap (Rescue
802
- Index up to +3.01) but leaves <em>utilisation depth</em>
803
- unchanged (Rescue Index \u2248 0). Better tool documentation
804
- can teach agents <em>which</em> tools to call, but cannot
805
- teach them to call those tools with the iterative depth that
806
- expert practice demands.
807
  </p>
808
- <h3 {h3}>3. Evaluation depth, not tool knowledge, is the bottleneck</h3>
809
  <p {p}>
810
- Across 836 task&ndash;condition observations, evaluation depth
811
- per candidate correlates with total score at
812
- <strong>&rho; = 0.685</strong>
813
- (<em>p</em> &lt; 10<sup>-117</sup>). LLM agents generate
814
- backbone candidates at expert-level rates but evaluate each
815
- one at only <strong>14% of expert depth</strong>. Forced-depth
816
- interventions confirm this is causal &mdash; see the
817
- <em>Depth Gap</em> tab.
 
818
  </p>
819
  </div>
820
 
@@ -904,8 +904,8 @@ def build_about() -> str:
904
  ordering, intermediate validation, and adaptive iteration.</p>
905
  <p {p}>
906
  <strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
907
- Continuous 4-band interpolation over Boltz-2 re-prediction
908
- metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement
909
  variance on biophysical quantities.</p>
910
  <p {p}>
911
  <strong>Feasibility (15 pts)</strong> &mdash; valid amino
@@ -1102,7 +1102,7 @@ def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
1102
 
1103
 
1104
  def chart_mode_comparison(entries: list) -> go.Figure:
1105
- """Grouped bar chart: benchmark vs user mode for each LLM."""
1106
  by_name: dict[str, dict[str, float]] = {}
1107
  for e in entries:
1108
  if e["submission_type"] != "llm":
@@ -1111,23 +1111,23 @@ def chart_mode_comparison(entries: list) -> go.Figure:
1111
 
1112
  ordered = sorted(
1113
  by_name.items(),
1114
- key=lambda x: x[1].get("user", 0),
1115
  reverse=True,
1116
  )
1117
  names = [n for n, _ in ordered]
1118
- bench = [m.get("benchmark", 0) for _, m in ordered]
1119
- user = [m.get("user", 0) for _, m in ordered]
1120
 
1121
  fig = go.Figure()
1122
  fig.add_trace(
1123
  go.Bar(
1124
- x=names, y=bench, name="Benchmark Mode",
1125
  marker_color="rgba(229,62,62,0.6)",
1126
  )
1127
  )
1128
  fig.add_trace(
1129
  go.Bar(
1130
- x=names, y=user, name="User Mode",
1131
  marker_color="rgba(56,161,105,0.6)",
1132
  )
1133
  )
@@ -1137,7 +1137,7 @@ def chart_mode_comparison(entries: list) -> go.Figure:
1137
  yaxis=dict(range=[0, 80], title="Overall hybrid score"),
1138
  xaxis=dict(title=""),
1139
  title=dict(
1140
- text=("Unguided (Benchmark) vs Guided (User) modes \u2014 "
1141
  "guidance lifts coverage but rarely shifts overall score"),
1142
  font_size=13,
1143
  ),
@@ -1189,7 +1189,7 @@ def create_app() -> gr.Blocks:
1189
  with gr.Tab("\U0001f4ca Overall"):
1190
  with gr.Row():
1191
  f_mode = gr.Dropdown(
1192
- ["All", "Benchmark", "User"],
1193
  value="All", label="Mode", scale=1,
1194
  )
1195
  f_mcp = gr.Dropdown(
@@ -1274,13 +1274,13 @@ def create_app() -> gr.Blocks:
1274
  'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
1275
  'line-height:1.55">'
1276
  '<strong>Mode semantics:</strong> '
1277
- '<em>Benchmark mode</em> exposes atomic tools without '
1278
- 'pipeline hints (unguided); <em>User mode</em> packages '
1279
  'them into composite workflows with explicit pipeline '
1280
- 'structure (guided). Guidance lifts the lowest-tier '
1281
  'agents but does not consistently help capable ones, '
1282
- 'and never closes the depth gap (see <em>Depth Gap</em> '
1283
- 'tab).</div>'
1284
  )
1285
  gr.Plot(chart_mode_comparison(entries))
1286
  gr.HTML(build_mode_cards(entries))
 
209
  <p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
210
  font-weight:400;font-style:italic;max-width:680px;
211
  margin-left:auto;margin-right:auto;line-height:1.5">
212
+ Top-tier agents now surpass a deterministic hardcoded pipeline &mdash;
213
+ but invoke evaluation tools at only <strong>14% of expert intensity</strong>.
214
+ Guidance closes the coverage gap, not the evaluation-depth gap.</p>
215
  <div style="margin-top:1rem;display:flex;justify-content:center;
216
  gap:0.6rem;flex-wrap:wrap">
217
  <a href="{PAPER_URL}" target="_blank"
 
360
  # ── Mode badge ──
361
  if is_bl:
362
  mode = f'<td style="{TD};color:#718096">\u2014</td>'
363
+ elif e.get("mode") == "unguided":
364
  mode = (
365
  f'<td style="{TD}"><span style="background:#fed7d7;'
366
  "color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
367
+ 'font-size:0.75rem;font-weight:600">unguided</span></td>'
368
  )
369
  else:
370
  mode = (
371
  f'<td style="{TD}"><span style="background:#c6f6d5;'
372
  "color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
373
+ 'font-size:0.75rem;font-weight:600">guided</span></td>'
374
  )
375
 
376
  # ── MCP ──
 
503
 
504
 
505
  def build_mode_cards(entries: list) -> str:
506
+ """Per-LLM cards showing unguided vs guided delta."""
507
  by_name: dict[str, dict] = {}
508
  for e in entries:
509
  if e["submission_type"] != "llm":
 
512
 
513
  ordered = sorted(
514
  by_name.items(),
515
+ key=lambda x: x[1].get("guided", {}).get("overall_score", 0),
516
  reverse=True,
517
  )
518
 
519
  cards = []
520
  for name, modes in ordered:
521
+ bench = modes.get("unguided")
522
+ user = modes.get("guided")
523
  if not bench or not user:
524
  continue
525
  delta = user["overall_score"] - bench["overall_score"]
 
528
  lines = [
529
  '<div style="display:flex;justify-content:space-between;'
530
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
531
+ "<span>Unguided</span>"
532
  f'<span style="font-weight:700;color:#e53e3e">'
533
  f'{bench["overall_score"]:.1f}</span></div>',
534
  '<div style="display:flex;justify-content:space-between;'
535
  'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
536
+ "<span>Guided</span>"
537
  f'<span style="font-weight:700;color:#d69e2e">'
538
  f'{user["overall_score"]:.1f}</span></div>',
539
  '<div style="display:flex;justify-content:space-between;'
 
603
 
604
 
605
  def build_intervention_section(interventions: dict) -> str:
606
+ """Show forced-depth and low-variety intervention results.
607
 
608
+ The forced-depth condition mandates ≥3 evaluation metric categories
609
+ per design candidate; the low-variety control performs comparable
610
+ compute with a narrow range of evaluation metrics. Together they
611
+ isolate evaluation depth as the causal driver of the evaluation-depth
612
+ gap reported in the paper.
613
  """
614
  if not interventions or not interventions.get("rows"):
615
  return '<p style="color:#718096">No intervention data available.</p>'
 
619
  cond_meta = {
620
  "baseline": ("#64748b", "Baseline"),
621
  "forced_depth": ("#38a169", "Forced Depth"),
622
+ "low_variety_control": ("#d69e2e", "Low-Variety Control"),
623
  }
624
 
625
  TH = (
 
684
  <span style="color:#52340d">
685
  Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
686
  <strong>GPT-5 by +15.9</strong> points without any change to
687
+ the underlying model or tools, while the low-variety control
688
  <em>hurts</em> DeepSeek V3 (&minus;2.3). The dissociation is
689
  cleanest on the strongest agent, where it provides direct
690
  causal evidence that
691
+ <strong>evaluation variety &mdash; not raw compute &mdash;
692
+ drives the gain</strong>. GPT-5's response is more uniform
693
+ across both interventions; we report the raw deltas without
694
+ smoothing.
695
  </span>
696
  </div>
697
 
 
756
  repeated sampling from generative tools (RFdiffusion,
757
  ProteinMPNN) and iterative cross-validation through several
758
  biophysical metrics. We test the full agentic loop &mdash;
759
+ <strong>plan &rarr; call &rarr; evaluate &rarr; iterate</strong>
760
+ &mdash; over 76 expert-curated tasks drawn from 2024&ndash;2026
761
+ literature, exposed through 17 MCP-integrated tools.
 
762
  </p>
763
  <div style="display:grid;grid-template-columns:
764
  repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
 
789
 
790
  <div {card}>
791
  <h2 {h2}>Three principal findings</h2>
792
+ <h3 {h3}>1. Top-tier agents now beat the hardcoded pipeline</h3>
793
  <p {p}>
794
+ DeepSeek V3 and GPT-5 surpass the deterministic hardcoded
795
+ pipeline (54.5) under both modes. Autonomous protein-design
796
  orchestration is no longer infeasible &mdash; but a substantial
797
+ gap to the human expert (61.7) and oracle (75.2) remains.
798
  </p>
799
  <h3 {h3}>2. Coverage&ndash;depth dissociation</h3>
800
  <p {p}>
801
+ Workflow guidance closes the <em>coverage</em> gap, bringing
802
+ agent tool selection closer to the human expert, but leaves
803
+ <em>evaluation depth</em> unchanged. Better tool documentation
804
+ can teach agents <em>which</em> tools to call, but not how
805
+ thoroughly to use them on each generated candidate.
 
806
  </p>
807
+ <h3 {h3}>3. Evaluation variety, not tool knowledge, is the bottleneck</h3>
808
  <p {p}>
809
+ Across 836 task&ndash;condition observations, the number of
810
+ distinct evaluation metric categories per candidate correlates
811
+ with total score at
812
+ <strong>&rho; = 0.68</strong>
813
+ (<em>p</em> &lt; 10<sup>-115</sup>). LLM agents generate
814
+ backbone candidates at expert-level rates but invoke scoring
815
+ tools at only <strong>~14% of expert intensity</strong>.
816
+ Forced-depth interventions confirm this is causal &mdash; see
817
+ the <em>Depth Gap</em> tab.
818
  </p>
819
  </div>
820
 
 
904
  ordering, intermediate validation, and adaptive iteration.</p>
905
  <p {p}>
906
  <strong>Quality (35 pts)</strong> &mdash; 100% algorithmic.
907
+ Continuous four-band interpolation over Boltz-2 re-prediction
908
+ metrics (pLDDT, pTM, ipTM, ipAE), eliminating LLM judgement
909
  variance on biophysical quantities.</p>
910
  <p {p}>
911
  <strong>Feasibility (15 pts)</strong> &mdash; valid amino
 
1102
 
1103
 
1104
  def chart_mode_comparison(entries: list) -> go.Figure:
1105
+ """Grouped bar chart: unguided vs guided mode for each LLM."""
1106
  by_name: dict[str, dict[str, float]] = {}
1107
  for e in entries:
1108
  if e["submission_type"] != "llm":
 
1111
 
1112
  ordered = sorted(
1113
  by_name.items(),
1114
+ key=lambda x: x[1].get("guided", 0),
1115
  reverse=True,
1116
  )
1117
  names = [n for n, _ in ordered]
1118
+ bench = [m.get("unguided", 0) for _, m in ordered]
1119
+ user = [m.get("guided", 0) for _, m in ordered]
1120
 
1121
  fig = go.Figure()
1122
  fig.add_trace(
1123
  go.Bar(
1124
+ x=names, y=bench, name="Unguided",
1125
  marker_color="rgba(229,62,62,0.6)",
1126
  )
1127
  )
1128
  fig.add_trace(
1129
  go.Bar(
1130
+ x=names, y=user, name="Guided",
1131
  marker_color="rgba(56,161,105,0.6)",
1132
  )
1133
  )
 
1137
  yaxis=dict(range=[0, 80], title="Overall hybrid score"),
1138
  xaxis=dict(title=""),
1139
  title=dict(
1140
+ text=("Unguided vs Guided modes \u2014 "
1141
  "guidance lifts coverage but rarely shifts overall score"),
1142
  font_size=13,
1143
  ),
 
1189
  with gr.Tab("\U0001f4ca Overall"):
1190
  with gr.Row():
1191
  f_mode = gr.Dropdown(
1192
+ ["All", "Unguided", "Guided"],
1193
  value="All", label="Mode", scale=1,
1194
  )
1195
  f_mcp = gr.Dropdown(
 
1274
  'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
1275
  'line-height:1.55">'
1276
  '<strong>Mode semantics:</strong> '
1277
+ '<em>Unguided mode</em> exposes atomic tools without '
1278
+ 'pipeline hints; <em>guided mode</em> packages '
1279
  'them into composite workflows with explicit pipeline '
1280
+ 'structure. Guidance lifts the lowest-tier '
1281
  'agents but does not consistently help capable ones, '
1282
+ 'and never closes the evaluation-depth gap (see '
1283
+ '<em>Depth Gap</em> tab).</div>'
1284
  )
1285
  gr.Plot(chart_mode_comparison(entries))
1286
  gr.HTML(build_mode_cards(entries))
leaderboard_data.json CHANGED
@@ -2,11 +2,11 @@
2
  "last_updated": "2026-04-14",
3
  "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
4
  "headline_findings": [
5
- "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass a deterministic hardcoded pipeline.",
6
- "All agents show a critical evaluation depth gap \u2014 they invoke evaluation tools at only 14% of expert frequency.",
7
- "Workflow guidance rescues tool coverage (Rescue Index up to +3.01) but not utilisation depth (Rescue Index \u2248 0).",
8
- "Evaluation depth predicts design quality (\u03c1 = 0.685, p < 10\u207b\u00b9\u00b9\u2077) beyond binary tool selection.",
9
- "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-diversity control hurts it (-2.3) \u2014 evidence that depth, not process change alone, drives the gain."
10
  ],
11
  "scoring": {
12
  "rubric_max": 100,
@@ -26,7 +26,7 @@
26
  "agent_id": "oracle",
27
  "mode": null,
28
  "submission_type": "human_oracle",
29
- "organization": "Romero Lab",
30
  "mcp_custom": false,
31
  "overall_score": 74.85,
32
  "component_scores": {
@@ -63,7 +63,7 @@
63
  "agent_id": "human-expert",
64
  "mode": null,
65
  "submission_type": "human_expert",
66
- "organization": "Romero Lab",
67
  "mcp_custom": false,
68
  "overall_score": 61.25,
69
  "component_scores": {
@@ -98,9 +98,9 @@
98
  {
99
  "agent_name": "DeepSeek V3",
100
  "agent_id": "deepseek-v3-benchmark",
101
- "mode": "benchmark",
102
  "submission_type": "llm",
103
- "organization": "DeepSeek",
104
  "mcp_custom": false,
105
  "overall_score": 60.43,
106
  "component_scores": {
@@ -135,9 +135,9 @@
135
  {
136
  "agent_name": "DeepSeek V3",
137
  "agent_id": "deepseek-v3-user",
138
- "mode": "user",
139
  "submission_type": "llm",
140
- "organization": "DeepSeek",
141
  "mcp_custom": false,
142
  "overall_score": 58.46,
143
  "component_scores": {
@@ -172,9 +172,9 @@
172
  {
173
  "agent_name": "GPT-5",
174
  "agent_id": "gpt5-benchmark",
175
- "mode": "benchmark",
176
  "submission_type": "llm",
177
- "organization": "OpenAI",
178
  "mcp_custom": false,
179
  "overall_score": 55.61,
180
  "component_scores": {
@@ -209,9 +209,9 @@
209
  {
210
  "agent_name": "GPT-5",
211
  "agent_id": "gpt5-user",
212
- "mode": "user",
213
  "submission_type": "llm",
214
- "organization": "OpenAI",
215
  "mcp_custom": false,
216
  "overall_score": 55.26,
217
  "component_scores": {
@@ -248,7 +248,7 @@
248
  "agent_id": "hardcoded-pipeline",
249
  "mode": null,
250
  "submission_type": "hardcoded",
251
- "organization": "Deterministic",
252
  "mcp_custom": false,
253
  "overall_score": 54.2,
254
  "component_scores": {
@@ -283,9 +283,9 @@
283
  {
284
  "agent_name": "Claude Sonnet 4.5",
285
  "agent_id": "sonnet-4.5-user",
286
- "mode": "user",
287
  "submission_type": "llm",
288
- "organization": "Anthropic",
289
  "mcp_custom": false,
290
  "overall_score": 50.23,
291
  "component_scores": {
@@ -320,9 +320,9 @@
320
  {
321
  "agent_name": "Claude Sonnet 4.5",
322
  "agent_id": "sonnet-4.5-benchmark",
323
- "mode": "benchmark",
324
  "submission_type": "llm",
325
- "organization": "Anthropic",
326
  "mcp_custom": false,
327
  "overall_score": 41.17,
328
  "component_scores": {
@@ -357,9 +357,9 @@
357
  {
358
  "agent_name": "Gemini 2.5 Pro",
359
  "agent_id": "gemini-2.5-pro-user",
360
- "mode": "user",
361
  "submission_type": "llm",
362
- "organization": "Google",
363
  "mcp_custom": false,
364
  "overall_score": 8.75,
365
  "component_scores": {
@@ -394,9 +394,9 @@
394
  {
395
  "agent_name": "Gemini 2.5 Pro",
396
  "agent_id": "gemini-2.5-pro-benchmark",
397
- "mode": "benchmark",
398
  "submission_type": "llm",
399
- "organization": "Google",
400
  "mcp_custom": false,
401
  "overall_score": 8.11,
402
  "component_scores": {
@@ -430,11 +430,11 @@
430
  }
431
  ],
432
  "interventions": {
433
- "description": "Causal intervention experiments on the depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate \u22653 evaluation passes per candidate), and low_diversity_control (constrain candidate count without forcing depth).",
434
  "n_tasks": 18,
435
  "rows": [
436
  {
437
- "label": "DeepSeek V3 \u2014 baseline",
438
  "condition": "baseline",
439
  "agent": "deepseek-v3-tools-benchmark",
440
  "n_tasks": 18,
@@ -446,7 +446,7 @@
446
  "diversity": 3.56
447
  },
448
  {
449
- "label": "GPT-5 \u2014 baseline",
450
  "condition": "baseline",
451
  "agent": "gpt5-tools-benchmark",
452
  "n_tasks": 18,
@@ -458,7 +458,7 @@
458
  "diversity": 3.94
459
  },
460
  {
461
- "label": "Human Expert \u2014 baseline",
462
  "condition": "baseline",
463
  "agent": "human-expert-agent",
464
  "n_tasks": 18,
@@ -470,7 +470,7 @@
470
  "diversity": 2.28
471
  },
472
  {
473
- "label": "DeepSeek V3 \u2014 forced depth",
474
  "condition": "forced_depth",
475
  "agent": "deepseek-v3-forced-depth",
476
  "n_tasks": 18,
@@ -482,7 +482,7 @@
482
  "diversity": 3.94
483
  },
484
  {
485
- "label": "GPT-5 \u2014 forced depth",
486
  "condition": "forced_depth",
487
  "agent": "gpt5-tools-forced-depth",
488
  "n_tasks": 18,
@@ -494,8 +494,8 @@
494
  "diversity": 3.06
495
  },
496
  {
497
- "label": "DeepSeek V3 \u2014 low diversity",
498
- "condition": "low_diversity_control",
499
  "agent": "deepseek-v3-low-diversity",
500
  "n_tasks": 18,
501
  "score": 56.39,
@@ -506,8 +506,8 @@
506
  "diversity": 3.22
507
  },
508
  {
509
- "label": "GPT-5 \u2014 low diversity",
510
- "condition": "low_diversity_control",
511
  "agent": "gpt5-tools-low-diversity",
512
  "n_tasks": 18,
513
  "score": 61.5,
@@ -518,8 +518,8 @@
518
  "diversity": 3.22
519
  },
520
  {
521
- "label": "Human Expert \u2014 shallow",
522
- "condition": "low_diversity_control",
523
  "agent": "human-expert-shallow",
524
  "n_tasks": 18,
525
  "score": 55.06,
 
2
  "last_updated": "2026-04-14",
3
  "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
4
  "headline_findings": [
5
+ "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass the deterministic hardcoded pipeline.",
6
+ "All agents show a critical evaluation-depth gap β€” they invoke evaluation tools at only ~14% of expert intensity.",
7
+ "Workflow guidance closes the coverage gap but leaves the evaluation-depth gap unchanged.",
8
+ "Evaluation variety (distinct metric categories per candidate) predicts design quality (ρ = 0.68, p < 10⁻¹¹⁡) beyond binary tool selection.",
9
+ "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a compute-matched low-variety control hurts it (-2.3) β€” evidence that variety, not raw compute, drives the gain."
10
  ],
11
  "scoring": {
12
  "rubric_max": 100,
 
26
  "agent_id": "oracle",
27
  "mode": null,
28
  "submission_type": "human_oracle",
29
+ "organization": "RomeroLab",
30
  "mcp_custom": false,
31
  "overall_score": 74.85,
32
  "component_scores": {
 
63
  "agent_id": "human-expert",
64
  "mode": null,
65
  "submission_type": "human_expert",
66
+ "organization": "RomeroLab",
67
  "mcp_custom": false,
68
  "overall_score": 61.25,
69
  "component_scores": {
 
98
  {
99
  "agent_name": "DeepSeek V3",
100
  "agent_id": "deepseek-v3-benchmark",
101
+ "mode": "unguided",
102
  "submission_type": "llm",
103
+ "organization": "RomeroLab",
104
  "mcp_custom": false,
105
  "overall_score": 60.43,
106
  "component_scores": {
 
135
  {
136
  "agent_name": "DeepSeek V3",
137
  "agent_id": "deepseek-v3-user",
138
+ "mode": "guided",
139
  "submission_type": "llm",
140
+ "organization": "RomeroLab",
141
  "mcp_custom": false,
142
  "overall_score": 58.46,
143
  "component_scores": {
 
172
  {
173
  "agent_name": "GPT-5",
174
  "agent_id": "gpt5-benchmark",
175
+ "mode": "unguided",
176
  "submission_type": "llm",
177
+ "organization": "RomeroLab",
178
  "mcp_custom": false,
179
  "overall_score": 55.61,
180
  "component_scores": {
 
209
  {
210
  "agent_name": "GPT-5",
211
  "agent_id": "gpt5-user",
212
+ "mode": "guided",
213
  "submission_type": "llm",
214
+ "organization": "RomeroLab",
215
  "mcp_custom": false,
216
  "overall_score": 55.26,
217
  "component_scores": {
 
248
  "agent_id": "hardcoded-pipeline",
249
  "mode": null,
250
  "submission_type": "hardcoded",
251
+ "organization": "RomeroLab",
252
  "mcp_custom": false,
253
  "overall_score": 54.2,
254
  "component_scores": {
 
283
  {
284
  "agent_name": "Claude Sonnet 4.5",
285
  "agent_id": "sonnet-4.5-user",
286
+ "mode": "guided",
287
  "submission_type": "llm",
288
+ "organization": "RomeroLab",
289
  "mcp_custom": false,
290
  "overall_score": 50.23,
291
  "component_scores": {
 
320
  {
321
  "agent_name": "Claude Sonnet 4.5",
322
  "agent_id": "sonnet-4.5-benchmark",
323
+ "mode": "unguided",
324
  "submission_type": "llm",
325
+ "organization": "RomeroLab",
326
  "mcp_custom": false,
327
  "overall_score": 41.17,
328
  "component_scores": {
 
357
  {
358
  "agent_name": "Gemini 2.5 Pro",
359
  "agent_id": "gemini-2.5-pro-user",
360
+ "mode": "guided",
361
  "submission_type": "llm",
362
+ "organization": "RomeroLab",
363
  "mcp_custom": false,
364
  "overall_score": 8.75,
365
  "component_scores": {
 
394
  {
395
  "agent_name": "Gemini 2.5 Pro",
396
  "agent_id": "gemini-2.5-pro-benchmark",
397
+ "mode": "unguided",
398
  "submission_type": "llm",
399
+ "organization": "RomeroLab",
400
  "mcp_custom": false,
401
  "overall_score": 8.11,
402
  "component_scores": {
 
430
  }
431
  ],
432
  "interventions": {
433
+ "description": "Causal intervention experiments on the evaluation-depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate β‰₯3 evaluation metric categories per candidate), and low_variety_control (compute-matched control restricted to a narrow range of evaluation metrics).",
434
  "n_tasks": 18,
435
  "rows": [
436
  {
437
+ "label": "DeepSeek V3 β€” baseline",
438
  "condition": "baseline",
439
  "agent": "deepseek-v3-tools-benchmark",
440
  "n_tasks": 18,
 
446
  "diversity": 3.56
447
  },
448
  {
449
+ "label": "GPT-5 β€” baseline",
450
  "condition": "baseline",
451
  "agent": "gpt5-tools-benchmark",
452
  "n_tasks": 18,
 
458
  "diversity": 3.94
459
  },
460
  {
461
+ "label": "Human Expert β€” baseline",
462
  "condition": "baseline",
463
  "agent": "human-expert-agent",
464
  "n_tasks": 18,
 
470
  "diversity": 2.28
471
  },
472
  {
473
+ "label": "DeepSeek V3 β€” forced depth",
474
  "condition": "forced_depth",
475
  "agent": "deepseek-v3-forced-depth",
476
  "n_tasks": 18,
 
482
  "diversity": 3.94
483
  },
484
  {
485
+ "label": "GPT-5 β€” forced depth",
486
  "condition": "forced_depth",
487
  "agent": "gpt5-tools-forced-depth",
488
  "n_tasks": 18,
 
494
  "diversity": 3.06
495
  },
496
  {
497
+ "label": "DeepSeek V3 β€” low variety",
498
+ "condition": "low_variety_control",
499
  "agent": "deepseek-v3-low-diversity",
500
  "n_tasks": 18,
501
  "score": 56.39,
 
506
  "diversity": 3.22
507
  },
508
  {
509
+ "label": "GPT-5 β€” low variety",
510
+ "condition": "low_variety_control",
511
  "agent": "gpt5-tools-low-diversity",
512
  "n_tasks": 18,
513
  "score": 61.5,
 
518
  "diversity": 3.22
519
  },
520
  {
521
+ "label": "Human Expert β€” shallow",
522
+ "condition": "low_variety_control",
523
  "agent": "human-expert-shallow",
524
  "n_tasks": 18,
525
  "score": 55.06,