| { |
| "last_updated": "2026-04-14", |
| "paper_title": "Evaluating LLM-Driven Protein Design: Agents Lack Iterative Evaluation Depth", |
| "headline_findings": [ |
| "Top-tier LLM agents (DeepSeek V3, GPT-5) now surpass the deterministic hardcoded pipeline.", |
| "All agents show a critical evaluation-depth gap — they invoke evaluation tools at only ~14% of expert intensity.", |
| "Workflow guidance closes the coverage gap but leaves the evaluation-depth gap unchanged.", |
| "Evaluation variety (distinct metric categories per candidate) predicts design quality (ρ = 0.68, p < 10⁻¹¹⁵) beyond binary tool selection.", |
| "Forced-depth intervention lifts the strongest agent (DeepSeek V3) by +9.3 points on 18 tasks, while a compute-matched low-variety control hurts it (−2.3 points) — evidence that variety, not raw compute, drives the gain." |
| ], |
| "scoring": { |
| "rubric_max": 100, |
| "components": { |
| "approach": 20, |
| "orchestration": 15, |
| "quality": 35, |
| "feasibility": 15, |
| "novelty": 5, |
| "diversity": 10 |
| }, |
| "method": "Hybrid: 72 algorithmic points (Boltz-2 verification) + 28 LLM-judge points (3-judge panel with self-exclusion)." |
| }, |
| "entries": [ |
| { |
| "agent_name": "Human Oracle", |
| "agent_id": "oracle", |
| "mode": null, |
| "submission_type": "human_oracle", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 74.85, |
| "component_scores": { |
| "approach": 20.0, |
| "orchestration": 15.0, |
| "quality": 26.24, |
| "feasibility": 10.26, |
| "novelty": 2.93, |
| "diversity": 0.43 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 79.2, |
| "binder": 71.8, |
| "enzyme": 75.6, |
| "fluorescent_protein": 78.7, |
| "scaffold": 75.8 |
| }, |
| "redesign": { |
| "antibody": 69.2, |
| "enzyme": 76.2, |
| "fluorescent_protein": 77.1, |
| "scaffold": 76.8 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 0, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "Human Expert", |
| "agent_id": "human-expert", |
| "mode": null, |
| "submission_type": "human_expert", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 61.25, |
| "component_scores": { |
| "approach": 13.81, |
| "orchestration": 8.86, |
| "quality": 20.91, |
| "feasibility": 10.79, |
| "novelty": 3.46, |
| "diversity": 3.43 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 65.6, |
| "binder": 65.0, |
| "enzyme": 55.3, |
| "fluorescent_protein": 57.2, |
| "scaffold": 65.4 |
| }, |
| "redesign": { |
| "antibody": 52.4, |
| "enzyme": 59.5, |
| "fluorescent_protein": 54.6, |
| "scaffold": 53.7 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 0, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "DeepSeek V3", |
| "agent_id": "deepseek-v3-benchmark", |
| "mode": "unguided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 60.43, |
| "component_scores": { |
| "approach": 11.4, |
| "orchestration": 9.36, |
| "quality": 22.07, |
| "feasibility": 10.77, |
| "novelty": 3.44, |
| "diversity": 3.38 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 65.0, |
| "binder": 63.4, |
| "enzyme": 53.9, |
| "fluorescent_protein": 72.3, |
| "scaffold": 57.8 |
| }, |
| "redesign": { |
| "antibody": 61.3, |
| "enzyme": 59.3, |
| "fluorescent_protein": 56.9, |
| "scaffold": 66.9 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 1, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "DeepSeek V3", |
| "agent_id": "deepseek-v3-user", |
| "mode": "guided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 58.46, |
| "component_scores": { |
| "approach": 11.09, |
| "orchestration": 9.14, |
| "quality": 21.74, |
| "feasibility": 9.91, |
| "novelty": 3.25, |
| "diversity": 3.33 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 65.6, |
| "binder": 63.0, |
| "enzyme": 64.2, |
| "fluorescent_protein": 64.2, |
| "scaffold": 60.4 |
| }, |
| "redesign": { |
| "antibody": 61.6, |
| "enzyme": 60.7, |
| "fluorescent_protein": 43.0, |
| "scaffold": 44.1 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 7, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "GPT-5", |
| "agent_id": "gpt5-benchmark", |
| "mode": "unguided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 55.61, |
| "component_scores": { |
| "approach": 8.76, |
| "orchestration": 6.84, |
| "quality": 22.96, |
| "feasibility": 10.03, |
| "novelty": 3.27, |
| "diversity": 3.75 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 62.6, |
| "binder": 59.9, |
| "enzyme": 55.9, |
| "fluorescent_protein": 53.9, |
| "scaffold": 56.1 |
| }, |
| "redesign": { |
| "antibody": 47.3, |
| "enzyme": 54.4, |
| "fluorescent_protein": 49.5, |
| "scaffold": 54.6 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 2, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "GPT-5", |
| "agent_id": "gpt5-user", |
| "mode": "guided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 55.26, |
| "component_scores": { |
| "approach": 9.46, |
| "orchestration": 8.29, |
| "quality": 20.83, |
| "feasibility": 9.9, |
| "novelty": 3.2, |
| "diversity": 3.58 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 61.2, |
| "binder": 56.1, |
| "enzyme": 57.9, |
| "fluorescent_protein": 61.3, |
| "scaffold": 55.6 |
| }, |
| "redesign": { |
| "antibody": 52.1, |
| "enzyme": 54.2, |
| "fluorescent_protein": 55.7, |
| "scaffold": 46.3 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 4, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "Hardcoded Pipeline", |
| "agent_id": "hardcoded-pipeline", |
| "mode": null, |
| "submission_type": "hardcoded", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 54.2, |
| "component_scores": { |
| "approach": 10.19, |
| "orchestration": 8.3, |
| "quality": 19.91, |
| "feasibility": 10.26, |
| "novelty": 2.48, |
| "diversity": 3.08 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 60.8, |
| "binder": 59.8, |
| "enzyme": 46.0, |
| "fluorescent_protein": 62.6, |
| "scaffold": 55.0 |
| }, |
| "redesign": { |
| "antibody": 45.4, |
| "enzyme": 50.7, |
| "fluorescent_protein": 49.5, |
| "scaffold": 50.3 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 0, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "Claude Sonnet 4.5", |
| "agent_id": "sonnet-4.5-user", |
| "mode": "guided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 50.23, |
| "component_scores": { |
| "approach": 9.63, |
| "orchestration": 8.54, |
| "quality": 17.31, |
| "feasibility": 9.03, |
| "novelty": 2.68, |
| "diversity": 3.05 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 66.3, |
| "binder": 56.5, |
| "enzyme": 56.9, |
| "fluorescent_protein": 62.8, |
| "scaffold": 57.9 |
| }, |
| "redesign": { |
| "antibody": 43.1, |
| "enzyme": 37.5, |
| "fluorescent_protein": 32.8, |
| "scaffold": 42.0 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 16, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "Claude Sonnet 4.5", |
| "agent_id": "sonnet-4.5-benchmark", |
| "mode": "unguided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 41.17, |
| "component_scores": { |
| "approach": 7.92, |
| "orchestration": 6.93, |
| "quality": 13.54, |
| "feasibility": 8.2, |
| "novelty": 2.25, |
| "diversity": 2.33 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 29.5, |
| "binder": 55.5, |
| "enzyme": 29.6, |
| "fluorescent_protein": 45.9, |
| "scaffold": 41.2 |
| }, |
| "redesign": { |
| "antibody": 34.6, |
| "enzyme": 29.5, |
| "fluorescent_protein": 35.3, |
| "scaffold": 40.9 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 23, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "Gemini 2.5 Pro", |
| "agent_id": "gemini-2.5-pro-user", |
| "mode": "guided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 8.75, |
| "component_scores": { |
| "approach": 3.37, |
| "orchestration": 2.79, |
| "quality": 0.55, |
| "feasibility": 1.15, |
| "novelty": 0.49, |
| "diversity": 0.41 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 10.8, |
| "binder": 9.3, |
| "enzyme": 30.2, |
| "fluorescent_protein": 3.1, |
| "scaffold": 9.2 |
| }, |
| "redesign": { |
| "antibody": 8.0, |
| "enzyme": 4.9, |
| "fluorescent_protein": 6.8, |
| "scaffold": 8.6 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 74, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| }, |
| { |
| "agent_name": "Gemini 2.5 Pro", |
| "agent_id": "gemini-2.5-pro-benchmark", |
| "mode": "unguided", |
| "submission_type": "llm", |
| "organization": "RomeroLab", |
| "mcp_custom": false, |
| "overall_score": 8.11, |
| "component_scores": { |
| "approach": 3.58, |
| "orchestration": 2.47, |
| "quality": 0.34, |
| "feasibility": 0.93, |
| "novelty": 0.42, |
| "diversity": 0.37 |
| }, |
| "taxonomy_scores": { |
| "de_novo": { |
| "antibody": 9.1, |
| "binder": 9.2, |
| "enzyme": 11.0, |
| "fluorescent_protein": 3.1, |
| "scaffold": 9.1 |
| }, |
| "redesign": { |
| "antibody": 7.3, |
| "enzyme": 4.4, |
| "fluorescent_protein": 6.2, |
| "scaffold": 11.4 |
| } |
| }, |
| "tasks_completed": 76, |
| "tasks_total": 76, |
| "tasks_with_zero": 75, |
| "avg_latency_sec": null, |
| "submission_date": "2026-04-06" |
| } |
| ], |
| "interventions": { |
| "description": "Causal intervention experiments on the evaluation-depth gap. A set of 18 representative tasks was rerun under three conditions: baseline (no intervention), forced_depth (≥3 evaluation metric categories mandated per candidate), and low_variety_control (a compute-matched control restricted to a narrow range of evaluation metrics).", |
| "n_tasks": 18, |
| "rows": [ |
| { |
| "label": "DeepSeek V3 — baseline", |
| "condition": "baseline", |
| "agent": "deepseek-v3-tools-benchmark", |
| "n_tasks": 18, |
| "score": 58.72, |
| "delta_vs_baseline": 0.0, |
| "approach": 13.44, |
| "orchestration": 11.17, |
| "quality": 16.11, |
| "diversity": 3.56 |
| }, |
| { |
| "label": "GPT-5 — baseline", |
| "condition": "baseline", |
| "agent": "gpt5-tools-benchmark", |
| "n_tasks": 18, |
| "score": 46.78, |
| "delta_vs_baseline": 0.0, |
| "approach": 8.33, |
| "orchestration": 6.22, |
| "quality": 15.39, |
| "diversity": 3.94 |
| }, |
| { |
| "label": "Human Expert — baseline", |
| "condition": "baseline", |
| "agent": "human-expert-agent", |
| "n_tasks": 18, |
| "score": 56.67, |
| "delta_vs_baseline": 0.0, |
| "approach": 18.28, |
| "orchestration": 9.28, |
| "quality": 11.06, |
| "diversity": 2.28 |
| }, |
| { |
| "label": "DeepSeek V3 — forced depth", |
| "condition": "forced_depth", |
| "agent": "deepseek-v3-forced-depth", |
| "n_tasks": 18, |
| "score": 68.06, |
| "delta_vs_baseline": 9.34, |
| "approach": 18.39, |
| "orchestration": 12.28, |
| "quality": 16.11, |
| "diversity": 3.94 |
| }, |
| { |
| "label": "GPT-5 — forced depth", |
| "condition": "forced_depth", |
| "agent": "gpt5-tools-forced-depth", |
| "n_tasks": 18, |
| "score": 62.67, |
| "delta_vs_baseline": 15.89, |
| "approach": 18.28, |
| "orchestration": 11.67, |
| "quality": 15.0, |
| "diversity": 3.06 |
| }, |
| { |
| "label": "DeepSeek V3 — low variety", |
| "condition": "low_variety_control", |
| "agent": "deepseek-v3-low-diversity", |
| "n_tasks": 18, |
| "score": 56.39, |
| "delta_vs_baseline": -2.33, |
| "approach": 13.11, |
| "orchestration": 11.11, |
| "quality": 16.0, |
| "diversity": 3.22 |
| }, |
| { |
| "label": "GPT-5 — low variety", |
| "condition": "low_variety_control", |
| "agent": "gpt5-tools-low-diversity", |
| "n_tasks": 18, |
| "score": 61.5, |
| "delta_vs_baseline": 14.72, |
| "approach": 13.06, |
| "orchestration": 12.0, |
| "quality": 16.22, |
| "diversity": 3.22 |
| }, |
| { |
| "label": "Human Expert — shallow", |
| "condition": "low_variety_control", |
| "agent": "human-expert-shallow", |
| "n_tasks": 18, |
| "score": 55.06, |
| "delta_vs_baseline": -1.61, |
| "approach": 18.22, |
| "orchestration": 9.28, |
| "quality": 11.17, |
| "diversity": 0.61 |
| } |
| ] |
| } |
| } |