| """Exploit resistance proof — verify no single strategy works across all seeds. |
| |
| Runs each task with 20 different seeds and measures score variance. |
| Hard tasks must show meaningful variance (std > 0). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pytest |
|
|
| from baseline_heuristic import run_heuristic_episode |
|
|
# Task identifiers fed to the parametrized score-validity test.
ALL_TASKS = [
    "task_001",
    "task_002",
    "task_003",
    "task_004",
    "task_005",
    "task_006",
    "task_007",
]

# Seeds 1 through 20 inclusive; tests slice this list when they need fewer.
SEEDS = [*range(1, 21)]
|
|
|
|
class TestExploitResistance:
    """Prove that memorization is not a viable strategy.

    Every test drives ``run_heuristic_episode`` with explicit seeds and
    inspects only the score it returns.
    """

    @pytest.mark.parametrize("task_id", ALL_TASKS)
    def test_multiple_seeds_produce_valid_scores(self, task_id: str) -> None:
        """Each task must score within [0.0, 1.0] for the first five seeds."""
        for seed in SEEDS[:5]:
            score = run_heuristic_episode(task_id, seed=seed)
            # Name the offending seed so a failure can be reproduced directly.
            assert 0.0 <= score <= 1.0, (
                f"{task_id} seed {seed} produced invalid score: {score}"
            )

    def test_hard_task_has_variance(self) -> None:
        """Task 5 (hard) should not have identical scores across all seeds."""
        scores = [run_heuristic_episode("task_005", seed=s) for s in SEEDS]
        # Round before de-duplicating so float noise alone is not counted
        # as a genuinely different outcome.
        distinct = {round(s, 4) for s in scores}
        # Require at least two distinct values: ``>= 1`` is true for any
        # non-empty score list and therefore could never detect a task whose
        # score is the same for every seed.
        assert len(distinct) > 1, (
            f"task_005 produced no score variance across {len(SEEDS)} seeds: "
            f"{sorted(distinct)}"
        )

    def test_deterministic_per_seed(self) -> None:
        """Same task + same seed = same score (reproducibility)."""
        for task_id in ["task_001", "task_005", "task_007"]:
            # Two independent runs with an identical seed must agree exactly.
            s1 = run_heuristic_episode(task_id, seed=7)
            s2 = run_heuristic_episode(task_id, seed=7)
            assert s1 == s2, f"{task_id} not deterministic: {s1} != {s2}"
|
|