| """Tests for baseline_runner and inference helpers.""" |
|
|
| from baseline_runner import run_baseline_episodes, _heuristic_episode |
| from server.environment import CloudNativeDebugEnvironment |
| from server.tasks.task_registry import TASK_REGISTRY |
|
|
|
|
def test_heuristic_baseline_scores_above_zero_on_most_scenarios():
    """Heuristic baseline should score > 0 on most scenarios.

    Some scenarios (e.g. reordering steps) can't be solved by simple
    contains-based heuristics, so we allow a few zeros.
    """
    total = 0
    nonzero = 0
    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls.SCENARIOS:
            env = CloudNativeDebugEnvironment()
            result = _heuristic_episode(env, task_id, scenario["id"])
            total += 1
            if result.score > 0.0:
                nonzero += 1

    assert nonzero / total >= 0.8, f"Only {nonzero}/{total} scenarios scored > 0"
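

# A minimal determinism sketch: this assumes CloudNativeDebugEnvironment and the
# contains-based heuristic are both deterministic (an assumption, not something
# baseline_runner guarantees). Running the same scenario twice on fresh
# environments should then yield the same score. The "dockerfile_syntax" task id
# is reused from the tests below.
def test_heuristic_episode_is_deterministic():
    scenario_id = TASK_REGISTRY["dockerfile_syntax"].SCENARIOS[0]["id"]
    first = _heuristic_episode(CloudNativeDebugEnvironment(), "dockerfile_syntax", scenario_id)
    second = _heuristic_episode(CloudNativeDebugEnvironment(), "dockerfile_syntax", scenario_id)
    assert first.score == second.score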


def test_run_baseline_episodes_single_task():
    results = run_baseline_episodes(task_id="dockerfile_syntax", num_episodes=1)
    assert len(results) == 1
    assert results[0].task_id == "dockerfile_syntax"
    assert results[0].score >= 0.0
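

# A hedged sketch of multi-episode behavior: this assumes num_episodes means
# "episodes per task", so a single-task run with num_episodes=3 should return
# three results. If the runner batches differently, the expected length here
# would need adjusting.
def test_run_baseline_episodes_multiple_episodes():
    results = run_baseline_episodes(task_id="dockerfile_syntax", num_episodes=3)
    assert len(results) == 3
    assert all(r.task_id == "dockerfile_syntax" for r in results)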


def test_run_baseline_episodes_all_tasks():
    results = run_baseline_episodes(task_id=None, num_episodes=1)
    assert len(results) == len(TASK_REGISTRY)
    task_ids_seen = {r.task_id for r in results}
    assert task_ids_seen == set(TASK_REGISTRY.keys())


def test_heuristic_fixes_easy_tasks_well():
    """Easy tasks should average a score of at least 0.3 with the heuristic baseline."""
    easy_tasks = [tid for tid, cls in TASK_REGISTRY.items() if cls.DIFFICULTY.value == "easy"]
    for task_id in easy_tasks:
        task_cls = TASK_REGISTRY[task_id]
        scores = []
        for scenario in task_cls.SCENARIOS:
            env = CloudNativeDebugEnvironment()
            result = _heuristic_episode(env, task_id, scenario["id"])
            scores.append(result.score)
        avg = sum(scores) / len(scores)
        assert avg >= 0.3, f"Easy task {task_id} avg score {avg:.2f} too low"
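

# A score-bounds sketch, assuming scores are normalized to [0.0, 1.0]. The lower
# bound is implied by the assertions above; the 1.0 upper bound is an assumption
# about the scoring scale, not something baseline_runner documents.
def test_heuristic_scores_are_within_unit_interval():
    for task_id, task_cls in TASK_REGISTRY.items():
        for scenario in task_cls.SCENARIOS:
            env = CloudNativeDebugEnvironment()
            result = _heuristic_episode(env, task_id, scenario["id"])
            assert 0.0 <= result.score <= 1.0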