# Spaces status: Sleeping
# v3.0 — Intelligence layer: failure classification, strategy detection,
# advanced metrics, self-improvement, multi-agent comparison, 3D visualizer
# commit dfbd16e
# server/multi_agent.py
"""
Multi-Agent Comparison Engine.

Runs multiple agent configurations against the SAME task variant
and produces a side-by-side comparison report.

Agent configurations:
- Deterministic (rule-based, no LLM) — baseline
- Test-first (forces reading tests before anything)
- Search-first (forces search_code before reads)
- LLM-based (if HF_TOKEN provided)

This is the key feature that answers: "Which agent strategy wins?"
"""
import time
import copy
from typing import List, Dict, Any, Optional, Callable
from dataclasses import dataclass, field
@dataclass
class AgentRunResult:
    """Result of one agent configuration running one episode.

    Plain data holder produced by MultiAgentComparison.compare(); the
    analysis fields (strategy, failure_type, reliability_index) are computed
    upstream and stored here verbatim.
    """

    agent_name: str            # Key into MultiAgentComparison.AGENT_CONFIGS
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    action_sequence: List[str]
    files_read: List[str]
    files_written: List[str]
    strategy: str              # Detected strategy label
    strategy_score: float
    failure_type: str
    reliability_index: float
    step_timeline: List[dict]

    def to_dict(self) -> dict:
        """Serialize to a JSON-friendly dict, rounding float metrics."""
        return {
            "agent_name": self.agent_name,
            "task": self.task,
            "variant_id": self.variant_id,
            "final_score": round(self.final_score, 3),
            "total_steps": self.total_steps,
            "cumulative_reward": round(self.cumulative_reward, 3),
            "duration_seconds": round(self.duration_seconds, 2),
            "action_sequence": self.action_sequence,
            "files_read": self.files_read,
            "files_written": self.files_written,
            "strategy": self.strategy,
            "strategy_score": round(self.strategy_score, 3),
            "failure_type": self.failure_type,
            "reliability_index": round(self.reliability_index, 3),
            "step_timeline": self.step_timeline,
        }
@dataclass
class ComparisonReport:
    """Side-by-side comparison of multiple agent configurations."""

    task: str
    variant_id: str
    # Forward reference: AgentRunResult is defined earlier in this module.
    runs: List["AgentRunResult"] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Serialize the report: ranking table, per-run details, insights.

        Returns {"error": ...} when there are no runs to compare.
        """
        if not self.runs:
            return {"error": "No runs to compare"}
        # Rank by score (descending), breaking ties by fewest steps.
        ranked = sorted(self.runs, key=lambda r: (-r.final_score, r.total_steps))
        winner = ranked[0]
        return {
            "task": self.task,
            "variant_id": self.variant_id,
            "winner": winner.agent_name,
            "winner_score": winner.final_score,
            "summary_table": [
                {
                    "rank": i + 1,
                    "agent": r.agent_name,
                    "score": round(r.final_score, 3),
                    "steps": r.total_steps,
                    "reward": round(r.cumulative_reward, 3),
                    "strategy": r.strategy,
                    "failure": r.failure_type,
                    "reliability": round(r.reliability_index, 3),
                }
                for i, r in enumerate(ranked)
            ],
            "detailed_runs": [r.to_dict() for r in self.runs],
            "insights": self._generate_insights(ranked),
        }

    def _generate_insights(self, ranked: List["AgentRunResult"]) -> List[str]:
        """Derive human-readable observations from the ranked runs.

        Expects `ranked` sorted best-first (as produced by to_dict); returns
        an empty list when there are fewer than two runs to compare.
        """
        insights = []
        if len(ranked) < 2:
            return insights
        best = ranked[0]
        worst = ranked[-1]
        # Flag a meaningful score gap (> 0.2) between best and worst.
        if best.final_score > worst.final_score + 0.2:
            insights.append(
                f"'{best.agent_name}' significantly outperformed '{worst.agent_name}' "
                f"({best.final_score:.2f} vs {worst.final_score:.2f})"
            )
        # Among "successful" runs (score >= 0.5), find the fewest-steps agent;
        # unsuccessful runs are pushed to +inf so they can never win.
        most_efficient = min(ranked, key=lambda r: r.total_steps if r.final_score >= 0.5 else float('inf'))
        if most_efficient.final_score >= 0.5:
            insights.append(
                f"Most step-efficient successful agent: '{most_efficient.agent_name}' "
                f"({most_efficient.total_steps} steps)"
            )
        strategies = [r.strategy for r in ranked]
        if len(set(strategies)) > 1:
            insights.append(
                f"Strategy variance observed: {set(strategies)} — "
                f"'{best.agent_name}' used {best.strategy} which proved most effective."
            )
        return insights
class MultiAgentComparison:
    """
    Runs multiple deterministic agent strategies against the same environment.

    Usage (in-process, no LLM required):
        from server.environment import CodebaseNavEnvironment
        from server.models import RepoAction
        env = CodebaseNavEnvironment()
        engine = MultiAgentComparison()
        report = engine.compare(env, task="task1")
    """

    # ── Built-in agent strategies ─────────────────────────────────────────
    # Each strategy is a pure policy: (observation dict, 1-based step number,
    # mutable per-run context) -> action dict understood by RepoAction.
    # They are declared @staticmethod so AGENT_CONFIGS below can unwrap the
    # raw functions via .__func__ (a bare function in the class body has no
    # __func__ attribute, which would raise AttributeError at class creation).

    @staticmethod
    def _agent_test_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read tests before any source file."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
        # Phase 1: Tests first
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        # Phase 2: Source files
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        # Phase 3: Run tests (once per run, tracked via context)
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_search_first(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Use search_code to locate the bug before reading."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        failing = obs.get("failing_tests", [])
        # Step 1: search for the failing test function name
        if step == 1 and failing:
            fn_name = failing[0].split(".")[-1] if failing else "bug"
            context["searched"] = True
            return {"action_type": "search_code", "query": fn_name}
        # Step 2+: read tests, then sources, then run tests once
        test_files = sorted([f for f in tree if f.startswith("tests/")])
        src_files = sorted([f for f in tree if f.startswith("src/") and f.endswith(".py")])
        for tf in test_files:
            if tf not in files_read:
                return {"action_type": "read_file", "path": tf}
        for sf in src_files:
            if sf not in files_read:
                return {"action_type": "read_file", "path": sf}
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_minimal(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Minimal effort — read one file, submit immediately."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        src_files = [f for f in tree if f.startswith("src/") and f.endswith(".py")]
        if src_files and not files_read:
            return {"action_type": "read_file", "path": src_files[0]}
        return {"action_type": "submit"}

    @staticmethod
    def _agent_exhaustive(obs: dict, step: int, context: dict) -> dict:
        """Strategy: Read everything, run tests twice, then submit."""
        tree = obs.get("repo_tree", [])
        files_read = set(obs.get("files_read", []))
        all_readable = [f for f in tree if f.endswith(".py") or f.endswith(".md")]
        for f in all_readable:
            if f not in files_read:
                return {"action_type": "read_file", "path": f}
        test_files = [f for f in tree if f.startswith("tests/")]
        if test_files and context.get("tests_run", 0) == 0:
            context["tests_run"] = 1
            return {"action_type": "run_tests", "path": test_files[0]}
        # Second, path-less test run before giving up.
        if test_files and context.get("tests_run2", 0) == 0:
            context["tests_run2"] = 1
            return {"action_type": "run_tests"}
        return {"action_type": "submit"}

    # Unwrap the staticmethods so the dict stores plain callables that can be
    # invoked as agent_fn(obs, step, context) without descriptor binding.
    AGENT_CONFIGS = {
        "test-first": _agent_test_first.__func__,
        "search-first": _agent_search_first.__func__,
        "minimal": _agent_minimal.__func__,
        "exhaustive": _agent_exhaustive.__func__,
    }

    def compare(
        self,
        env,  # CodebaseNavEnvironment instance
        task: str = "task1",
        agents: Optional[List[str]] = None,
        shared_variant: Optional[str] = None,
    ) -> "ComparisonReport":
        """
        Run all (or selected) agents against the same task and compare.

        The environment is reset for each agent; unknown agent names are
        silently skipped.

        Args:
            env: CodebaseNavEnvironment instance, reset once per agent.
            agents: Subset of AGENT_CONFIGS keys; defaults to all of them.
            shared_variant: NOTE(review) — accepted but never forwarded to
                env.reset(), so cross-agent variant pinning currently relies
                on the environment itself; confirm env.reset's signature.

        Returns:
            ComparisonReport containing one AgentRunResult per agent run.
        """
        # Project-local imports are deliberately lazy so importing this
        # module does not pull in the whole analysis stack.
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine

        agent_names = agents or list(self.AGENT_CONFIGS.keys())
        strategy_detector = StrategyDetector()
        failure_classifier = FailureClassifier()
        metrics_engine = AdvancedMetricsEngine()
        runs: List[AgentRunResult] = []
        variant_id = None
        for agent_name in agent_names:
            agent_fn = self.AGENT_CONFIGS.get(agent_name)
            if not agent_fn:
                continue  # Unknown agent name: skip rather than fail.
            # Fresh episode for this agent.
            reset_result = env.reset(task=task)
            obs = reset_result.observation
            variant_id = reset_result.info.get("variant_id", "unknown")
            context = {}
            start = time.time()
            max_steps = 15  # Hard step budget per episode.
            files_read = []
            files_written = []
            cumulative_reward = 0.0
            action_sequence = []
            step_timeline = []
            obs_dict = obs.model_dump()
            for step_num in range(1, max_steps + 1):
                if env.done:
                    break
                action_dict = agent_fn(obs_dict, step_num, context)
                action = RepoAction(
                    action_type=action_dict.get("action_type", "submit"),
                    path=action_dict.get("path"),
                    query=action_dict.get("query"),
                    content=action_dict.get("content"),
                )
                result = env.step(action)
                obs = result.observation
                obs_dict = obs.model_dump()
                cumulative_reward += result.reward
                action_sequence.append(action.action_type)
                if action.path and action.action_type == "read_file":
                    files_read.append(action.path)
                if action.path and action.action_type == "write_file":
                    files_written.append(action.path)
                step_timeline.append({
                    "step": step_num,
                    "action": action.action_type,
                    "path": action.path,
                    "reward": round(result.reward, 3),
                })
                if result.done:
                    break
            # Force submit if the budget ran out before the episode ended.
            if not env.done:
                result = env.step(RepoAction(action_type="submit"))
                cumulative_reward += result.reward
                action_sequence.append("submit")
            duration = time.time() - start
            final_score = env.final_score
            # Pull the trajectory for post-hoc analysis.
            trajectory = env.get_trajectory()
            traj_steps = trajectory.get("steps", []) if trajectory else []
            variant_meta = {}
            if env.variant:
                variant_meta = env.variant.meta
            # Detect strategy
            strategy_report = strategy_detector.detect(
                traj_steps, task, variant_meta, files_read, final_score
            )
            # Classify failure
            failure_report = failure_classifier.classify(
                episode_id=trajectory.get("episode_id", "") if trajectory else "",
                task=task,
                trajectory_steps=traj_steps,
                variant_meta=variant_meta,
                files_read=files_read,
                files_written=files_written,
                final_score=final_score,
            )
            # Advanced metrics
            adv_metrics = metrics_engine.compute(
                traj_steps, variant_meta, final_score, files_read, files_written
            )
            runs.append(AgentRunResult(
                agent_name=agent_name,
                task=task,
                variant_id=variant_id or "unknown",
                final_score=final_score,
                total_steps=len(action_sequence),
                cumulative_reward=cumulative_reward,
                duration_seconds=duration,
                action_sequence=action_sequence,
                files_read=files_read,
                files_written=files_written,
                strategy=strategy_report.strategy,
                strategy_score=strategy_report.score,
                failure_type=failure_report.primary_failure,
                reliability_index=adv_metrics.reliability_index,
                step_timeline=step_timeline,
            ))
        return ComparisonReport(
            task=task,
            variant_id=variant_id or "unknown",
            runs=runs,
        )