# pytorch-training-debugger — tests/test_exploit_resistance.py
# (commit 0b9b77b, "Task 7 added")
"""Exploit resistance proof — verify no single strategy works across all seeds.
Runs each task with 20 different seeds and measures score variance.
Hard tasks must show meaningful variance (std > 0).
"""
from __future__ import annotations
import pytest
from baseline_heuristic import run_heuristic_episode
# Every benchmark task in the suite, in numeric order (task_001 .. task_007).
ALL_TASKS = [f"task_{n:03d}" for n in range(1, 8)]
# Twenty distinct seeds (1..20) used to probe for seed-dependent behavior.
SEEDS = [*range(1, 21)]
class TestExploitResistance:
    """Prove that memorization is not a viable strategy.

    Each test drives ``run_heuristic_episode`` across multiple seeds to
    show that scores are valid, seed-sensitive on hard tasks, and
    reproducible for a fixed (task, seed) pair.
    """

    @pytest.mark.parametrize("task_id", ALL_TASKS)
    def test_multiple_seeds_produce_valid_scores(self, task_id: str) -> None:
        """Every task must yield a score in [0, 1] for several seeds."""
        # Only the first 5 seeds: this is a validity check, not a variance check.
        scores = [run_heuristic_episode(task_id, seed=s) for s in SEEDS[:5]]
        for score in scores:
            assert 0.0 <= score <= 1.0, f"{task_id} seed produced invalid score: {score}"

    def test_hard_task_has_variance(self) -> None:
        """Task 5 (hard) should not have identical scores across all seeds."""
        scores = [run_heuristic_episode("task_005", seed=s) for s in SEEDS]
        # Round to 4 decimals so float noise does not inflate the count.
        unique = len(set(round(s, 4) for s in scores))
        # At least two seeds must produce different scores (different red
        # herring configurations). The previous `unique >= 1` assertion was
        # vacuous — it holds whenever any score exists — and did not test
        # the "std > 0" contract stated in the module docstring.
        assert unique >= 2, (
            f"task_005 produced identical scores across all {len(SEEDS)} seeds"
        )

    def test_deterministic_per_seed(self) -> None:
        """Same task + same seed = same score (reproducibility)."""
        for task_id in ["task_001", "task_005", "task_007"]:
            s1 = run_heuristic_episode(task_id, seed=7)
            s2 = run_heuristic_episode(task_id, seed=7)
            assert s1 == s2, f"{task_id} not deterministic: {s1} != {s2}"