align jargon with paper; set organization to RomeroLab
- low-diversity -> low-variety (paper Section 2.5 terminology;
avoids collision with rubric "Diversity" component)
- Benchmark/User mode -> Unguided/Guided (paper-standard naming)
- remove "Rescue Index" (not defined in paper); replace with
"coverage gap / evaluation-depth gap" phrasing
- "utilisation depth" -> "evaluation depth"
- "surface competence" -> "evaluation-depth gap"
- "hand-engineered" -> "deterministic" hardcoded pipeline
- i_pAE -> ipAE; "plan -> sample -> evaluate" -> "plan -> call -> evaluate -> iterate"
- baseline scores aligned to paper text (54.5 / 61.7 / 75.2)
- correlation values aligned to paper (rho = 0.68, p < 10^-115)
- set organization=RomeroLab for all 11 entries (model name
already disambiguates the agent; field now reflects who ran it)
- README.md +1 -1
- app.py +61 -61
- leaderboard_data.json +36 -36
|
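The renames are mechanical, so the JSON side of this commit can be reproduced with a short script. A minimal sketch, assuming a hypothetical top-level `entries` key (never shown in full on this page) and the pre-commit values implied by the diff below (`benchmark`/`user` modes, a `low_diversity_control` condition inferred from the `*-low-diversity` agent_ids):

```python
# Sketch of the data migration this commit performs on leaderboard_data.json.
# "entries" as the top-level key is an assumption; the old values
# "benchmark"/"user" and "low_diversity_control" are inferred from the diff.
import json

MODE_RENAME = {"benchmark": "unguided", "user": "guided"}
COND_RENAME = {"low_diversity_control": "low_variety_control"}

with open("leaderboard_data.json") as fh:
    data = json.load(fh)

for entry in data["entries"]:
    if entry.get("mode") in MODE_RENAME:
        entry["mode"] = MODE_RENAME[entry["mode"]]
    # Field now records who ran the agent, not who trained the model.
    entry["organization"] = "RomeroLab"

for row in data["interventions"]["rows"]:
    row["condition"] = COND_RENAME.get(row["condition"], row["condition"])

with open("leaderboard_data.json", "w") as fh:
    json.dump(data, fh, indent=2, ensure_ascii=False)
```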
@@ -36,7 +36,7 @@ Novelty, and Diversity. See the *About* tab for the full methodology and the
|
|
| 36 |
- **Taxonomy Heatmap** — Per-cell scores across the 9 occupied cells of the 2 × 5 design matrix
|
| 37 |
- **Component Analysis** — Radar and bar charts comparing the 6 scoring components
|
| 38 |
- **Guidance Effect** — Paired comparison of the same LLM in unguided (atomic tools) vs guided (composite workflows) mode
|
| 39 |
-
- **Depth Gap** — Forced-depth and low-diversity intervention results
|
| 40 |
- **About** — Methodology, submission guide, and citation info
|
| 41 |
|
| 42 |
## Bringing your own MCP tools
|
|
|
|
| 36 |
- **Taxonomy Heatmap** — Per-cell scores across the 9 occupied cells of the 2 × 5 design matrix
|
| 37 |
- **Component Analysis** — Radar and bar charts comparing the 6 scoring components
|
| 38 |
- **Guidance Effect** — Paired comparison of the same LLM in unguided (atomic tools) vs guided (composite workflows) mode
|
| 39 |
+
- **Depth Gap** — Forced-depth and low-variety intervention results
|
| 40 |
- **About** — Methodology, submission guide, and citation info
|
| 41 |
|
| 42 |
## Bringing your own MCP tools
|
|
@@ -209,9 +209,9 @@ def build_header(last_updated: str, n_entries: int) -> str:
|
|
| 209 |
<p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
|
| 210 |
font-weight:400;font-style:italic;max-width:680px;
|
| 211 |
margin-left:auto;margin-right:auto;line-height:1.5">
|
| 212 |
-
Top-tier agents now surpass a deterministic pipeline —
|
| 213 |
-
but invoke evaluation tools at only <strong>14% of expert
|
| 214 |
-
Guidance closes the coverage gap, not surface competence.</p>
|
| 215 |
<div style="margin-top:1rem;display:flex;justify-content:center;
|
| 216 |
gap:0.6rem;flex-wrap:wrap">
|
| 217 |
<a href="{PAPER_URL}" target="_blank"
|
|
@@ -360,17 +360,17 @@ def build_leaderboard_table(
|
|
| 360 |
# ── Mode badge ──
|
| 361 |
if is_bl:
|
| 362 |
mode = f'<td style="{TD};color:#718096">\u2014</td>'
|
| 363 |
-
elif e.get("mode") == "
|
| 364 |
mode = (
|
| 365 |
f'<td style="{TD}"><span style="background:#fed7d7;'
|
| 366 |
"color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
|
| 367 |
-
'font-size:0.75rem;font-weight:600">benchmark</span></td>'
|
| 368 |
)
|
| 369 |
else:
|
| 370 |
mode = (
|
| 371 |
f'<td style="{TD}"><span style="background:#c6f6d5;'
|
| 372 |
"color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
|
| 373 |
-
'font-size:0.75rem;font-weight:600">user</span></td>'
|
| 374 |
)
|
| 375 |
|
| 376 |
# ── MCP ──
|
|
@@ -503,7 +503,7 @@ def build_heatmap(entry: dict) -> str:
|
|
| 503 |
|
| 504 |
|
| 505 |
def build_mode_cards(entries: list) -> str:
|
| 506 |
-
"""Per-LLM cards showing
|
| 507 |
by_name: dict[str, dict] = {}
|
| 508 |
for e in entries:
|
| 509 |
if e["submission_type"] != "llm":
|
|
@@ -512,14 +512,14 @@ def build_mode_cards(entries: list) -> str:
|
|
| 512 |
|
| 513 |
ordered = sorted(
|
| 514 |
by_name.items(),
|
| 515 |
-
key=lambda x: x[1].get("
|
| 516 |
reverse=True,
|
| 517 |
)
|
| 518 |
|
| 519 |
cards = []
|
| 520 |
for name, modes in ordered:
|
| 521 |
-
bench = modes.get("
|
| 522 |
-
user = modes.get("
|
| 523 |
if not bench or not user:
|
| 524 |
continue
|
| 525 |
delta = user["overall_score"] - bench["overall_score"]
|
|
@@ -528,12 +528,12 @@ def build_mode_cards(entries: list) -> str:
|
|
| 528 |
lines = [
|
| 529 |
'<div style="display:flex;justify-content:space-between;'
|
| 530 |
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
|
| 531 |
-
"<span>
|
| 532 |
f'<span style="font-weight:700;color:#e53e3e">'
|
| 533 |
f'{bench["overall_score"]:.1f}</span></div>',
|
| 534 |
'<div style="display:flex;justify-content:space-between;'
|
| 535 |
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
|
| 536 |
-
"<span>
|
| 537 |
f'<span style="font-weight:700;color:#d69e2e">'
|
| 538 |
f'{user["overall_score"]:.1f}</span></div>',
|
| 539 |
'<div style="display:flex;justify-content:space-between;'
|
|
@@ -603,12 +603,13 @@ def build_headline_findings(findings: list) -> str:
|
|
| 603 |
|
| 604 |
|
| 605 |
def build_intervention_section(interventions: dict) -> str:
|
| 606 |
-
"""Show forced-depth and low-
|
| 607 |
|
| 608 |
-
The forced-depth condition mandates ≥3 evaluation metric categories per design
|
| 609 |
-
candidate; the low-diversity control performs comparable
|
| 610 |
-
|
| 611 |
-
|
|
|
|
| 612 |
"""
|
| 613 |
if not interventions or not interventions.get("rows"):
|
| 614 |
return '<p style="color:#718096">No intervention data available.</p>'
|
|
@@ -618,7 +619,7 @@ def build_intervention_section(interventions: dict) -> str:
|
|
| 618 |
cond_meta = {
|
| 619 |
"baseline": ("#64748b", "Baseline"),
|
| 620 |
"forced_depth": ("#38a169", "Forced Depth"),
|
| 621 |
-
"
|
| 622 |
}
|
| 623 |
|
| 624 |
TH = (
|
|
@@ -683,14 +684,14 @@ def build_intervention_section(interventions: dict) -> str:
|
|
| 683 |
<span style="color:#52340d">
|
| 684 |
Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
|
| 685 |
<strong>GPT-5 by +15.9</strong> points without any change to
|
| 686 |
-
the underlying model or tools, while the low-diversity control
|
| 687 |
<em>hurts</em> DeepSeek V3 (−2.3). The dissociation is
|
| 688 |
cleanest on the strongest agent, where it provides direct
|
| 689 |
causal evidence that
|
| 690 |
-
<strong>evaluation diversity — not raw compute —
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
</span>
|
| 695 |
</div>
|
| 696 |
|
|
@@ -755,10 +756,9 @@ def build_about() -> str:
|
|
| 755 |
repeated sampling from generative tools (RFdiffusion,
|
| 756 |
ProteinMPNN) and iterative cross-validation through several
|
| 757 |
biophysical metrics. We test the full agentic loop —
|
| 758 |
-
<strong>plan → sample → evaluate</strong>
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
17 MCP-integrated tools.
|
| 762 |
</p>
|
| 763 |
<div style="display:grid;grid-template-columns:
|
| 764 |
repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
|
|
@@ -789,32 +789,32 @@ def build_about() -> str:
|
|
| 789 |
|
| 790 |
<div {card}>
|
| 791 |
<h2 {h2}>Three principal findings</h2>
|
| 792 |
-
<h3 {h3}>1. Top-tier agents now beat the hand-engineered pipeline</h3>
|
| 793 |
<p {p}>
|
| 794 |
-
DeepSeek V3 and GPT-5 surpass the hand-engineered
|
| 795 |
-
pipeline (54.2) under both modes. Autonomous protein-design
|
| 796 |
orchestration is no longer infeasible — but a substantial
|
| 797 |
-
gap to the human expert (61.25) and oracle (74.85) remains.
|
| 798 |
</p>
|
| 799 |
<h3 {h3}>2. Coverage–depth dissociation</h3>
|
| 800 |
<p {p}>
|
| 801 |
-
Workflow guidance closes the <em>coverage</em> gap
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
can teach agents <em>which</em> tools to call, but
|
| 805 |
-
|
| 806 |
-
expert practice demands.
|
| 807 |
</p>
|
| 808 |
-
<h3 {h3}>3. Evaluation diversity, not tool knowledge, is the bottleneck</h3>
|
| 809 |
<p {p}>
|
| 810 |
-
Across 836 task–condition observations,
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
|
|
|
| 818 |
</p>
|
| 819 |
</div>
|
| 820 |
|
|
@@ -904,8 +904,8 @@ def build_about() -> str:
|
|
| 904 |
ordering, intermediate validation, and adaptive iteration.</p>
|
| 905 |
<p {p}>
|
| 906 |
<strong>Quality (35 pts)</strong> — 100% algorithmic.
|
| 907 |
-
Continuous four-band interpolation over Boltz-2 re-prediction
|
| 908 |
-
metrics (pLDDT, pTM, ipTM, i_pAE), eliminating LLM judgement
|
| 909 |
variance on biophysical quantities.</p>
|
| 910 |
<p {p}>
|
| 911 |
<strong>Feasibility (15 pts)</strong> — valid amino
|
|
@@ -1102,7 +1102,7 @@ def chart_component_bar(e1: dict, e2: dict) -> go.Figure:
|
|
| 1102 |
|
| 1103 |
|
| 1104 |
def chart_mode_comparison(entries: list) -> go.Figure:
|
| 1105 |
-
"""Grouped bar chart:
|
| 1106 |
by_name: dict[str, dict[str, float]] = {}
|
| 1107 |
for e in entries:
|
| 1108 |
if e["submission_type"] != "llm":
|
|
@@ -1111,23 +1111,23 @@ def chart_mode_comparison(entries: list) -> go.Figure:
|
|
| 1111 |
|
| 1112 |
ordered = sorted(
|
| 1113 |
by_name.items(),
|
| 1114 |
-
key=lambda x: x[1].get("
|
| 1115 |
reverse=True,
|
| 1116 |
)
|
| 1117 |
names = [n for n, _ in ordered]
|
| 1118 |
-
bench = [m.get("
|
| 1119 |
-
user = [m.get("
|
| 1120 |
|
| 1121 |
fig = go.Figure()
|
| 1122 |
fig.add_trace(
|
| 1123 |
go.Bar(
|
| 1124 |
-
x=names, y=bench, name="
|
| 1125 |
marker_color="rgba(229,62,62,0.6)",
|
| 1126 |
)
|
| 1127 |
)
|
| 1128 |
fig.add_trace(
|
| 1129 |
go.Bar(
|
| 1130 |
-
x=names, y=user, name="
|
| 1131 |
marker_color="rgba(56,161,105,0.6)",
|
| 1132 |
)
|
| 1133 |
)
|
|
@@ -1137,7 +1137,7 @@ def chart_mode_comparison(entries: list) -> go.Figure:
|
|
| 1137 |
yaxis=dict(range=[0, 80], title="Overall hybrid score"),
|
| 1138 |
xaxis=dict(title=""),
|
| 1139 |
title=dict(
|
| 1140 |
-
text=("Unguided
|
| 1141 |
"guidance lifts coverage but rarely shifts overall score"),
|
| 1142 |
font_size=13,
|
| 1143 |
),
|
|
@@ -1189,7 +1189,7 @@ def create_app() -> gr.Blocks:
|
|
| 1189 |
with gr.Tab("\U0001f4ca Overall"):
|
| 1190 |
with gr.Row():
|
| 1191 |
f_mode = gr.Dropdown(
|
| 1192 |
-
["All", "
|
| 1193 |
value="All", label="Mode", scale=1,
|
| 1194 |
)
|
| 1195 |
f_mcp = gr.Dropdown(
|
|
@@ -1274,13 +1274,13 @@ def create_app() -> gr.Blocks:
|
|
| 1274 |
'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
|
| 1275 |
'line-height:1.55">'
|
| 1276 |
'<strong>Mode semantics:</strong> '
|
| 1277 |
-
'<em>Benchmark mode</em> exposes atomic tools without '
|
| 1278 |
-
'pipeline hints; <em>user mode</em> packages '
|
| 1279 |
'them into composite workflows with explicit pipeline '
|
| 1280 |
-
'structure. Guidance lifts the lowest-tier '
|
| 1281 |
'agents but does not consistently help capable ones, '
|
| 1282 |
-
'and never closes the depth gap (see <em>Depth Gap</em> '
|
| 1283 |
-
'tab).</div>'
|
| 1284 |
)
|
| 1285 |
gr.Plot(chart_mode_comparison(entries))
|
| 1286 |
gr.HTML(build_mode_cards(entries))
|
|
|
|
| 209 |
<p style="color:#64748b;margin:0.2rem 0 0;font-size:0.95rem;
|
| 210 |
font-weight:400;font-style:italic;max-width:680px;
|
| 211 |
margin-left:auto;margin-right:auto;line-height:1.5">
|
| 212 |
+
Top-tier agents now surpass a deterministic hardcoded pipeline —
|
| 213 |
+
but invoke evaluation tools at only <strong>14% of expert intensity</strong>.
|
| 214 |
+
Guidance closes the coverage gap, not the evaluation-depth gap.</p>
|
| 215 |
<div style="margin-top:1rem;display:flex;justify-content:center;
|
| 216 |
gap:0.6rem;flex-wrap:wrap">
|
| 217 |
<a href="{PAPER_URL}" target="_blank"
|
|
|
|
| 360 |
# ── Mode badge ──
|
| 361 |
if is_bl:
|
| 362 |
mode = f'<td style="{TD};color:#718096">\u2014</td>'
|
| 363 |
+
elif e.get("mode") == "unguided":
|
| 364 |
mode = (
|
| 365 |
f'<td style="{TD}"><span style="background:#fed7d7;'
|
| 366 |
"color:#c53030;padding:0.15rem 0.5rem;border-radius:4px;"
|
| 367 |
+
'font-size:0.75rem;font-weight:600">unguided</span></td>'
|
| 368 |
)
|
| 369 |
else:
|
| 370 |
mode = (
|
| 371 |
f'<td style="{TD}"><span style="background:#c6f6d5;'
|
| 372 |
"color:#276749;padding:0.15rem 0.5rem;border-radius:4px;"
|
| 373 |
+
'font-size:0.75rem;font-weight:600">guided</span></td>'
|
| 374 |
)
|
| 375 |
|
| 376 |
# ── MCP ──
|
|
|
|
| 503 |
|
| 504 |
|
| 505 |
def build_mode_cards(entries: list) -> str:
|
| 506 |
+
"""Per-LLM cards showing unguided vs guided delta."""
|
| 507 |
by_name: dict[str, dict] = {}
|
| 508 |
for e in entries:
|
| 509 |
if e["submission_type"] != "llm":
|
|
|
|
| 512 |
|
| 513 |
ordered = sorted(
|
| 514 |
by_name.items(),
|
| 515 |
+
key=lambda x: x[1].get("guided", {}).get("overall_score", 0),
|
| 516 |
reverse=True,
|
| 517 |
)
|
| 518 |
|
| 519 |
cards = []
|
| 520 |
for name, modes in ordered:
|
| 521 |
+
bench = modes.get("unguided")
|
| 522 |
+
user = modes.get("guided")
|
| 523 |
if not bench or not user:
|
| 524 |
continue
|
| 525 |
delta = user["overall_score"] - bench["overall_score"]
|
|
|
|
| 528 |
lines = [
|
| 529 |
'<div style="display:flex;justify-content:space-between;'
|
| 530 |
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
|
| 531 |
+
"<span>Unguided</span>"
|
| 532 |
f'<span style="font-weight:700;color:#e53e3e">'
|
| 533 |
f'{bench["overall_score"]:.1f}</span></div>',
|
| 534 |
'<div style="display:flex;justify-content:space-between;'
|
| 535 |
'padding:0.4rem 0;border-bottom:1px solid #e2e8f0">'
|
| 536 |
+
"<span>Guided</span>"
|
| 537 |
f'<span style="font-weight:700;color:#d69e2e">'
|
| 538 |
f'{user["overall_score"]:.1f}</span></div>',
|
| 539 |
'<div style="display:flex;justify-content:space-between;'
|
|
|
|
| 603 |
|
| 604 |
|
| 605 |
def build_intervention_section(interventions: dict) -> str:
|
| 606 |
+
"""Show forced-depth and low-variety intervention results.
|
| 607 |
|
| 608 |
+
The forced-depth condition mandates ≥3 evaluation metric categories
|
| 609 |
+
per design candidate; the low-variety control performs comparable
|
| 610 |
+
compute with a narrow range of evaluation metrics. Together they
|
| 611 |
+
isolate evaluation depth as the causal driver of the evaluation-depth
|
| 612 |
+
gap reported in the paper.
|
| 613 |
"""
|
| 614 |
if not interventions or not interventions.get("rows"):
|
| 615 |
return '<p style="color:#718096">No intervention data available.</p>'
|
|
|
|
| 619 |
cond_meta = {
|
| 620 |
"baseline": ("#64748b", "Baseline"),
|
| 621 |
"forced_depth": ("#38a169", "Forced Depth"),
|
| 622 |
+
"low_variety_control": ("#d69e2e", "Low-Variety Control"),
|
| 623 |
}
|
| 624 |
|
| 625 |
TH = (
|
|
|
|
| 684 |
<span style="color:#52340d">
|
| 685 |
Forced-depth lifts <strong>DeepSeek V3 by +9.3</strong> and
|
| 686 |
<strong>GPT-5 by +15.9</strong> points without any change to
|
| 687 |
+
the underlying model or tools, while the low-variety control
|
| 688 |
<em>hurts</em> DeepSeek V3 (−2.3). The dissociation is
|
| 689 |
cleanest on the strongest agent, where it provides direct
|
| 690 |
causal evidence that
|
| 691 |
+
<strong>evaluation variety — not raw compute —
|
| 692 |
+
drives the gain</strong>. GPT-5's response is more uniform
|
| 693 |
+
across both interventions; we report the raw deltas without
|
| 694 |
+
smoothing.
|
| 695 |
</span>
|
| 696 |
</div>
|
| 697 |
|
|
|
|
| 756 |
repeated sampling from generative tools (RFdiffusion,
|
| 757 |
ProteinMPNN) and iterative cross-validation through several
|
| 758 |
biophysical metrics. We test the full agentic loop —
|
| 759 |
+
<strong>plan → call → evaluate → iterate</strong>
|
| 760 |
+
— over 76 expert-curated tasks drawn from 2024–2026
|
| 761 |
+
literature, exposed through 17 MCP-integrated tools.
|
|
|
|
| 762 |
</p>
|
| 763 |
<div style="display:grid;grid-template-columns:
|
| 764 |
repeat(auto-fit,minmax(140px,1fr));gap:0.8rem;
|
|
|
|
| 789 |
|
| 790 |
<div {card}>
|
| 791 |
<h2 {h2}>Three principal findings</h2>
|
| 792 |
+
<h3 {h3}>1. Top-tier agents now beat the hardcoded pipeline</h3>
|
| 793 |
<p {p}>
|
| 794 |
+
DeepSeek V3 and GPT-5 surpass the deterministic hardcoded
|
| 795 |
+
pipeline (54.5) under both modes. Autonomous protein-design
|
| 796 |
orchestration is no longer infeasible — but a substantial
|
| 797 |
+
gap to the human expert (61.7) and oracle (75.2) remains.
|
| 798 |
</p>
|
| 799 |
<h3 {h3}>2. Coverage–depth dissociation</h3>
|
| 800 |
<p {p}>
|
| 801 |
+
Workflow guidance closes the <em>coverage</em> gap, bringing
|
| 802 |
+
agent tool selection closer to the human expert, but leaves
|
| 803 |
+
<em>evaluation depth</em> unchanged. Better tool documentation
|
| 804 |
+
can teach agents <em>which</em> tools to call, but not how
|
| 805 |
+
thoroughly to use them on each generated candidate.
|
|
|
|
| 806 |
</p>
|
| 807 |
+
<h3 {h3}>3. Evaluation variety, not tool knowledge, is the bottleneck</h3>
|
| 808 |
<p {p}>
|
| 809 |
+
Across 836 task–condition observations, the number of
|
| 810 |
+
distinct evaluation metric categories per candidate correlates
|
| 811 |
+
with total score at
|
| 812 |
+
<strong>ρ = 0.68</strong>
|
| 813 |
+
(<em>p</em> < 10<sup>-115</sup>). LLM agents generate
|
| 814 |
+
backbone candidates at expert-level rates but invoke scoring
|
| 815 |
+
tools at only <strong>~14% of expert intensity</strong>.
|
| 816 |
+
Forced-depth interventions confirm this is causal — see
|
| 817 |
+
the <em>Depth Gap</em> tab.
|
| 818 |
</p>
|
| 819 |
</div>
|
| 820 |
|
|
|
|
| 904 |
ordering, intermediate validation, and adaptive iteration.</p>
|
| 905 |
<p {p}>
|
| 906 |
<strong>Quality (35 pts)</strong> — 100% algorithmic.
|
| 907 |
+
Continuous four-band interpolation over Boltz-2 re-prediction
|
| 908 |
+
metrics (pLDDT, pTM, ipTM, ipAE), eliminating LLM judgement
|
| 909 |
variance on biophysical quantities.</p>
|
| 910 |
<p {p}>
|
| 911 |
<strong>Feasibility (15 pts)</strong> — valid amino
|
|
|
|
| 1102 |
|
| 1103 |
|
| 1104 |
def chart_mode_comparison(entries: list) -> go.Figure:
|
| 1105 |
+
"""Grouped bar chart: unguided vs guided mode for each LLM."""
|
| 1106 |
by_name: dict[str, dict[str, float]] = {}
|
| 1107 |
for e in entries:
|
| 1108 |
if e["submission_type"] != "llm":
|
|
|
|
| 1111 |
|
| 1112 |
ordered = sorted(
|
| 1113 |
by_name.items(),
|
| 1114 |
+
key=lambda x: x[1].get("guided", 0),
|
| 1115 |
reverse=True,
|
| 1116 |
)
|
| 1117 |
names = [n for n, _ in ordered]
|
| 1118 |
+
bench = [m.get("unguided", 0) for _, m in ordered]
|
| 1119 |
+
user = [m.get("guided", 0) for _, m in ordered]
|
| 1120 |
|
| 1121 |
fig = go.Figure()
|
| 1122 |
fig.add_trace(
|
| 1123 |
go.Bar(
|
| 1124 |
+
x=names, y=bench, name="Unguided",
|
| 1125 |
marker_color="rgba(229,62,62,0.6)",
|
| 1126 |
)
|
| 1127 |
)
|
| 1128 |
fig.add_trace(
|
| 1129 |
go.Bar(
|
| 1130 |
+
x=names, y=user, name="Guided",
|
| 1131 |
marker_color="rgba(56,161,105,0.6)",
|
| 1132 |
)
|
| 1133 |
)
|
|
|
|
| 1137 |
yaxis=dict(range=[0, 80], title="Overall hybrid score"),
|
| 1138 |
xaxis=dict(title=""),
|
| 1139 |
title=dict(
|
| 1140 |
+
text=("Unguided vs Guided modes \u2014 "
|
| 1141 |
"guidance lifts coverage but rarely shifts overall score"),
|
| 1142 |
font_size=13,
|
| 1143 |
),
|
|
|
|
| 1189 |
with gr.Tab("\U0001f4ca Overall"):
|
| 1190 |
with gr.Row():
|
| 1191 |
f_mode = gr.Dropdown(
|
| 1192 |
+
["All", "Unguided", "Guided"],
|
| 1193 |
value="All", label="Mode", scale=1,
|
| 1194 |
)
|
| 1195 |
f_mcp = gr.Dropdown(
|
|
|
|
| 1274 |
'margin:0.4rem 0 0.9rem;color:#1e3a8a;font-size:0.88rem;'
|
| 1275 |
'line-height:1.55">'
|
| 1276 |
'<strong>Mode semantics:</strong> '
|
| 1277 |
+
'<em>Unguided mode</em> exposes atomic tools without '
|
| 1278 |
+
'pipeline hints; <em>guided mode</em> packages '
|
| 1279 |
'them into composite workflows with explicit pipeline '
|
| 1280 |
+
'structure. Guidance lifts the lowest-tier '
|
| 1281 |
'agents but does not consistently help capable ones, '
|
| 1282 |
+
'and never closes the evaluation-depth gap (see '
|
| 1283 |
+
'<em>Depth Gap</em> tab).</div>'
|
| 1284 |
)
|
| 1285 |
gr.Plot(chart_mode_comparison(entries))
|
| 1286 |
gr.HTML(build_mode_cards(entries))
|
|
@@ -2,11 +2,11 @@
|
|
| 2 |
"last_updated": "2026-04-14",
|
| 3 |
"paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
|
| 4 |
"headline_findings": [
|
| 5 |
-
"Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass
|
| 6 |
-
"All agents show a critical evaluation
|
| 7 |
-
"Workflow guidance
|
| 8 |
-
"Evaluation
|
| 9 |
-
"Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a low-
|
| 10 |
],
|
| 11 |
"scoring": {
|
| 12 |
"rubric_max": 100,
|
|
@@ -26,7 +26,7 @@
|
|
| 26 |
"agent_id": "oracle",
|
| 27 |
"mode": null,
|
| 28 |
"submission_type": "human_oracle",
|
| 29 |
-
"organization": "
|
| 30 |
"mcp_custom": false,
|
| 31 |
"overall_score": 74.85,
|
| 32 |
"component_scores": {
|
|
@@ -63,7 +63,7 @@
|
|
| 63 |
"agent_id": "human-expert",
|
| 64 |
"mode": null,
|
| 65 |
"submission_type": "human_expert",
|
| 66 |
-
"organization": "
|
| 67 |
"mcp_custom": false,
|
| 68 |
"overall_score": 61.25,
|
| 69 |
"component_scores": {
|
|
@@ -98,9 +98,9 @@
|
|
| 98 |
{
|
| 99 |
"agent_name": "DeepSeek V3",
|
| 100 |
"agent_id": "deepseek-v3-benchmark",
|
| 101 |
-
"mode": "
|
| 102 |
"submission_type": "llm",
|
| 103 |
-
"organization": "
|
| 104 |
"mcp_custom": false,
|
| 105 |
"overall_score": 60.43,
|
| 106 |
"component_scores": {
|
|
@@ -135,9 +135,9 @@
|
|
| 135 |
{
|
| 136 |
"agent_name": "DeepSeek V3",
|
| 137 |
"agent_id": "deepseek-v3-user",
|
| 138 |
-
"mode": "
|
| 139 |
"submission_type": "llm",
|
| 140 |
-
"organization": "
|
| 141 |
"mcp_custom": false,
|
| 142 |
"overall_score": 58.46,
|
| 143 |
"component_scores": {
|
|
@@ -172,9 +172,9 @@
|
|
| 172 |
{
|
| 173 |
"agent_name": "GPT-5",
|
| 174 |
"agent_id": "gpt5-benchmark",
|
| 175 |
-
"mode": "
|
| 176 |
"submission_type": "llm",
|
| 177 |
-
"organization": "
|
| 178 |
"mcp_custom": false,
|
| 179 |
"overall_score": 55.61,
|
| 180 |
"component_scores": {
|
|
@@ -209,9 +209,9 @@
|
|
| 209 |
{
|
| 210 |
"agent_name": "GPT-5",
|
| 211 |
"agent_id": "gpt5-user",
|
| 212 |
-
"mode": "
|
| 213 |
"submission_type": "llm",
|
| 214 |
-
"organization": "
|
| 215 |
"mcp_custom": false,
|
| 216 |
"overall_score": 55.26,
|
| 217 |
"component_scores": {
|
|
@@ -248,7 +248,7 @@
|
|
| 248 |
"agent_id": "hardcoded-pipeline",
|
| 249 |
"mode": null,
|
| 250 |
"submission_type": "hardcoded",
|
| 251 |
-
"organization": "
|
| 252 |
"mcp_custom": false,
|
| 253 |
"overall_score": 54.2,
|
| 254 |
"component_scores": {
|
|
@@ -283,9 +283,9 @@
|
|
| 283 |
{
|
| 284 |
"agent_name": "Claude Sonnet 4.5",
|
| 285 |
"agent_id": "sonnet-4.5-user",
|
| 286 |
-
"mode": "
|
| 287 |
"submission_type": "llm",
|
| 288 |
-
"organization": "
|
| 289 |
"mcp_custom": false,
|
| 290 |
"overall_score": 50.23,
|
| 291 |
"component_scores": {
|
|
@@ -320,9 +320,9 @@
|
|
| 320 |
{
|
| 321 |
"agent_name": "Claude Sonnet 4.5",
|
| 322 |
"agent_id": "sonnet-4.5-benchmark",
|
| 323 |
-
"mode": "
|
| 324 |
"submission_type": "llm",
|
| 325 |
-
"organization": "
|
| 326 |
"mcp_custom": false,
|
| 327 |
"overall_score": 41.17,
|
| 328 |
"component_scores": {
|
|
@@ -357,9 +357,9 @@
|
|
| 357 |
{
|
| 358 |
"agent_name": "Gemini 2.5 Pro",
|
| 359 |
"agent_id": "gemini-2.5-pro-user",
|
| 360 |
-
"mode": "
|
| 361 |
"submission_type": "llm",
|
| 362 |
-
"organization": "
|
| 363 |
"mcp_custom": false,
|
| 364 |
"overall_score": 8.75,
|
| 365 |
"component_scores": {
|
|
@@ -394,9 +394,9 @@
|
|
| 394 |
{
|
| 395 |
"agent_name": "Gemini 2.5 Pro",
|
| 396 |
"agent_id": "gemini-2.5-pro-benchmark",
|
| 397 |
-
"mode": "
|
| 398 |
"submission_type": "llm",
|
| 399 |
-
"organization": "
|
| 400 |
"mcp_custom": false,
|
| 401 |
"overall_score": 8.11,
|
| 402 |
"component_scores": {
|
|
@@ -430,11 +430,11 @@
|
|
| 430 |
}
|
| 431 |
],
|
| 432 |
"interventions": {
|
| 433 |
-
"description": "Causal intervention experiments on the depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate
|
| 434 |
"n_tasks": 18,
|
| 435 |
"rows": [
|
| 436 |
{
|
| 437 |
-
"label": "DeepSeek V3
|
| 438 |
"condition": "baseline",
|
| 439 |
"agent": "deepseek-v3-tools-benchmark",
|
| 440 |
"n_tasks": 18,
|
|
@@ -446,7 +446,7 @@
|
|
| 446 |
"diversity": 3.56
|
| 447 |
},
|
| 448 |
{
|
| 449 |
-
"label": "GPT-5
|
| 450 |
"condition": "baseline",
|
| 451 |
"agent": "gpt5-tools-benchmark",
|
| 452 |
"n_tasks": 18,
|
|
@@ -458,7 +458,7 @@
|
|
| 458 |
"diversity": 3.94
|
| 459 |
},
|
| 460 |
{
|
| 461 |
-
"label": "Human Expert
|
| 462 |
"condition": "baseline",
|
| 463 |
"agent": "human-expert-agent",
|
| 464 |
"n_tasks": 18,
|
|
@@ -470,7 +470,7 @@
|
|
| 470 |
"diversity": 2.28
|
| 471 |
},
|
| 472 |
{
|
| 473 |
-
"label": "DeepSeek V3
|
| 474 |
"condition": "forced_depth",
|
| 475 |
"agent": "deepseek-v3-forced-depth",
|
| 476 |
"n_tasks": 18,
|
|
@@ -482,7 +482,7 @@
|
|
| 482 |
"diversity": 3.94
|
| 483 |
},
|
| 484 |
{
|
| 485 |
-
"label": "GPT-5
|
| 486 |
"condition": "forced_depth",
|
| 487 |
"agent": "gpt5-tools-forced-depth",
|
| 488 |
"n_tasks": 18,
|
|
@@ -494,8 +494,8 @@
|
|
| 494 |
"diversity": 3.06
|
| 495 |
},
|
| 496 |
{
|
| 497 |
-
"label": "DeepSeek V3
|
| 498 |
-
"condition": "
|
| 499 |
"agent": "deepseek-v3-low-diversity",
|
| 500 |
"n_tasks": 18,
|
| 501 |
"score": 56.39,
|
|
@@ -506,8 +506,8 @@
|
|
| 506 |
"diversity": 3.22
|
| 507 |
},
|
| 508 |
{
|
| 509 |
-
"label": "GPT-5
|
| 510 |
-
"condition": "
|
| 511 |
"agent": "gpt5-tools-low-diversity",
|
| 512 |
"n_tasks": 18,
|
| 513 |
"score": 61.5,
|
|
@@ -518,8 +518,8 @@
|
|
| 518 |
"diversity": 3.22
|
| 519 |
},
|
| 520 |
{
|
| 521 |
-
"label": "Human Expert
|
| 522 |
-
"condition": "
|
| 523 |
"agent": "human-expert-shallow",
|
| 524 |
"n_tasks": 18,
|
| 525 |
"score": 55.06,
|
|
|
|
| 2 |
"last_updated": "2026-04-14",
|
| 3 |
"paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth",
|
| 4 |
"headline_findings": [
|
| 5 |
+
"Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass the deterministic hardcoded pipeline.",
|
| 6 |
+
"All agents show a critical evaluation-depth gap β they invoke evaluation tools at only ~14% of expert intensity.",
|
| 7 |
+
"Workflow guidance closes the coverage gap but leaves the evaluation-depth gap unchanged.",
|
| 8 |
+
"Evaluation variety (distinct metric categories per candidate) predicts design quality (Ο = 0.68, p < 10β»ΒΉΒΉβ΅) beyond binary tool selection.",
|
| 9 |
+
"Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a compute-matched low-variety control hurts it (-2.3) β evidence that variety, not raw compute, drives the gain."
|
| 10 |
],
|
| 11 |
"scoring": {
|
| 12 |
"rubric_max": 100,
|
|
|
|
| 26 |
"agent_id": "oracle",
|
| 27 |
"mode": null,
|
| 28 |
"submission_type": "human_oracle",
|
| 29 |
+
"organization": "RomeroLab",
|
| 30 |
"mcp_custom": false,
|
| 31 |
"overall_score": 74.85,
|
| 32 |
"component_scores": {
|
|
|
|
| 63 |
"agent_id": "human-expert",
|
| 64 |
"mode": null,
|
| 65 |
"submission_type": "human_expert",
|
| 66 |
+
"organization": "RomeroLab",
|
| 67 |
"mcp_custom": false,
|
| 68 |
"overall_score": 61.25,
|
| 69 |
"component_scores": {
|
|
|
|
| 98 |
{
|
| 99 |
"agent_name": "DeepSeek V3",
|
| 100 |
"agent_id": "deepseek-v3-benchmark",
|
| 101 |
+
"mode": "unguided",
|
| 102 |
"submission_type": "llm",
|
| 103 |
+
"organization": "RomeroLab",
|
| 104 |
"mcp_custom": false,
|
| 105 |
"overall_score": 60.43,
|
| 106 |
"component_scores": {
|
|
|
|
| 135 |
{
|
| 136 |
"agent_name": "DeepSeek V3",
|
| 137 |
"agent_id": "deepseek-v3-user",
|
| 138 |
+
"mode": "guided",
|
| 139 |
"submission_type": "llm",
|
| 140 |
+
"organization": "RomeroLab",
|
| 141 |
"mcp_custom": false,
|
| 142 |
"overall_score": 58.46,
|
| 143 |
"component_scores": {
|
|
|
|
| 172 |
{
|
| 173 |
"agent_name": "GPT-5",
|
| 174 |
"agent_id": "gpt5-benchmark",
|
| 175 |
+
"mode": "unguided",
|
| 176 |
"submission_type": "llm",
|
| 177 |
+
"organization": "RomeroLab",
|
| 178 |
"mcp_custom": false,
|
| 179 |
"overall_score": 55.61,
|
| 180 |
"component_scores": {
|
|
|
|
| 209 |
{
|
| 210 |
"agent_name": "GPT-5",
|
| 211 |
"agent_id": "gpt5-user",
|
| 212 |
+
"mode": "guided",
|
| 213 |
"submission_type": "llm",
|
| 214 |
+
"organization": "RomeroLab",
|
| 215 |
"mcp_custom": false,
|
| 216 |
"overall_score": 55.26,
|
| 217 |
"component_scores": {
|
|
|
|
| 248 |
"agent_id": "hardcoded-pipeline",
|
| 249 |
"mode": null,
|
| 250 |
"submission_type": "hardcoded",
|
| 251 |
+
"organization": "RomeroLab",
|
| 252 |
"mcp_custom": false,
|
| 253 |
"overall_score": 54.2,
|
| 254 |
"component_scores": {
|
|
|
|
| 283 |
{
|
| 284 |
"agent_name": "Claude Sonnet 4.5",
|
| 285 |
"agent_id": "sonnet-4.5-user",
|
| 286 |
+
"mode": "guided",
|
| 287 |
"submission_type": "llm",
|
| 288 |
+
"organization": "RomeroLab",
|
| 289 |
"mcp_custom": false,
|
| 290 |
"overall_score": 50.23,
|
| 291 |
"component_scores": {
|
|
|
|
| 320 |
{
|
| 321 |
"agent_name": "Claude Sonnet 4.5",
|
| 322 |
"agent_id": "sonnet-4.5-benchmark",
|
| 323 |
+
"mode": "unguided",
|
| 324 |
"submission_type": "llm",
|
| 325 |
+
"organization": "RomeroLab",
|
| 326 |
"mcp_custom": false,
|
| 327 |
"overall_score": 41.17,
|
| 328 |
"component_scores": {
|
|
|
|
| 357 |
{
|
| 358 |
"agent_name": "Gemini 2.5 Pro",
|
| 359 |
"agent_id": "gemini-2.5-pro-user",
|
| 360 |
+
"mode": "guided",
|
| 361 |
"submission_type": "llm",
|
| 362 |
+
"organization": "RomeroLab",
|
| 363 |
"mcp_custom": false,
|
| 364 |
"overall_score": 8.75,
|
| 365 |
"component_scores": {
|
|
|
|
| 394 |
{
|
| 395 |
"agent_name": "Gemini 2.5 Pro",
|
| 396 |
"agent_id": "gemini-2.5-pro-benchmark",
|
| 397 |
+
"mode": "unguided",
|
| 398 |
"submission_type": "llm",
|
| 399 |
+
"organization": "RomeroLab",
|
| 400 |
"mcp_custom": false,
|
| 401 |
"overall_score": 8.11,
|
| 402 |
"component_scores": {
|
|
|
|
| 430 |
}
|
| 431 |
],
|
| 432 |
"interventions": {
|
| 433 |
+
"description": "Causal intervention experiments on the evaluation-depth gap. 18 representative tasks rerun under three conditions: baseline (no intervention), forced_depth (mandate β₯3 evaluation metric categories per candidate), and low_variety_control (compute-matched control restricted to a narrow range of evaluation metrics).",
|
| 434 |
"n_tasks": 18,
|
| 435 |
"rows": [
|
| 436 |
{
|
| 437 |
+
"label": "DeepSeek V3 β baseline",
|
| 438 |
"condition": "baseline",
|
| 439 |
"agent": "deepseek-v3-tools-benchmark",
|
| 440 |
"n_tasks": 18,
|
|
|
|
| 446 |
"diversity": 3.56
|
| 447 |
},
|
| 448 |
{
|
| 449 |
+
"label": "GPT-5 β baseline",
|
| 450 |
"condition": "baseline",
|
| 451 |
"agent": "gpt5-tools-benchmark",
|
| 452 |
"n_tasks": 18,
|
|
|
|
| 458 |
"diversity": 3.94
|
| 459 |
},
|
| 460 |
{
|
| 461 |
+
"label": "Human Expert β baseline",
|
| 462 |
"condition": "baseline",
|
| 463 |
"agent": "human-expert-agent",
|
| 464 |
"n_tasks": 18,
|
|
|
|
| 470 |
"diversity": 2.28
|
| 471 |
},
|
| 472 |
{
|
| 473 |
+
"label": "DeepSeek V3 β forced depth",
|
| 474 |
"condition": "forced_depth",
|
| 475 |
"agent": "deepseek-v3-forced-depth",
|
| 476 |
"n_tasks": 18,
|
|
|
|
| 482 |
"diversity": 3.94
|
| 483 |
},
|
| 484 |
{
|
| 485 |
+
"label": "GPT-5 β forced depth",
|
| 486 |
"condition": "forced_depth",
|
| 487 |
"agent": "gpt5-tools-forced-depth",
|
| 488 |
"n_tasks": 18,
|
|
|
|
| 494 |
"diversity": 3.06
|
| 495 |
},
|
| 496 |
{
|
| 497 |
+
"label": "DeepSeek V3 β low variety",
|
| 498 |
+
"condition": "low_variety_control",
|
| 499 |
"agent": "deepseek-v3-low-diversity",
|
| 500 |
"n_tasks": 18,
|
| 501 |
"score": 56.39,
|
|
|
|
| 506 |
"diversity": 3.22
|
| 507 |
},
|
| 508 |
{
|
| 509 |
+
"label": "GPT-5 β low variety",
|
| 510 |
+
"condition": "low_variety_control",
|
| 511 |
"agent": "gpt5-tools-low-diversity",
|
| 512 |
"n_tasks": 18,
|
| 513 |
"score": 61.5,
|
|
|
|
| 518 |
"diversity": 3.22
|
| 519 |
},
|
| 520 |
{
|
| 521 |
+
"label": "Human Expert β shallow",
|
| 522 |
+
"condition": "low_variety_control",
|
| 523 |
"agent": "human-expert-shallow",
|
| 524 |
"n_tasks": 18,
|
| 525 |
"score": 55.06,
|