Spaces:
Running
Running
| """Tests for ENV 01–08 and JDG 04–05. | |
| TST 01: reset returns valid observations | |
| TST 02: valid step advances round, terminal path returns correct shape | |
| TST 03: invalid action returns structured error, env survives | |
| """ | |
| from __future__ import annotations | |
| import pytest | |
| from replicalab.env import ReplicaLabEnv | |
| from replicalab.models import ( | |
| Protocol, | |
| RewardBreakdown, | |
| ScientistAction, | |
| ) | |
| from replicalab.scenarios import generate_scenario | |
| from replicalab.scoring.rubric import build_reward_breakdown, compute_total_reward | |
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
def _scenario(
    template: str = "math_reasoning",
    difficulty: str = "easy",
    seed: int = 42,
):
    """Generate a deterministic scenario pack for the given parameters."""
    params = {"seed": seed, "template": template, "difficulty": difficulty}
    return generate_scenario(**params)
def _good_action(scenario) -> ScientistAction:
    """Return a valid propose_protocol action tailored to *scenario*."""
    lab = scenario.lab_manager_observation
    spec = scenario.hidden_reference_spec
    # Derive every scenario-dependent field up front for readability.
    technique = spec.summary[:60] if spec.summary else "replication_plan"
    equipment = list(lab.equipment_available[:1]) if lab.equipment_available else []
    reagents = list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
    rationale = (
        f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
        f"Target metric: {spec.target_metric}. "
        f"Target value: {spec.target_value}. "
        "Stay within budget and schedule."
    )
    return ScientistAction(
        action_type="propose_protocol",
        sample_size=10,
        controls=["baseline", "ablation"],
        technique=technique,
        duration_days=max(1, min(2, lab.time_limit_days)),
        required_equipment=equipment,
        required_reagents=reagents,
        questions=[],
        rationale=rationale,
    )
def _accept_action() -> ScientistAction:
    """Return a minimal, valid accept action (all payload fields empty)."""
    empty_payload = dict(
        sample_size=0,
        controls=[],
        technique="",
        duration_days=0,
        required_equipment=[],
        required_reagents=[],
        questions=[],
        rationale="",
    )
    return ScientistAction(action_type="accept", **empty_payload)
def _request_info_action() -> ScientistAction:
    """Return a valid request_info action carrying a single question."""
    return ScientistAction(
        action_type="request_info",
        questions=["What equipment is available?"],
        sample_size=0,
        duration_days=0,
        controls=[],
        technique="",
        required_equipment=[],
        required_reagents=[],
        rationale="",
    )
def _good_protocol(scenario) -> Protocol:
    """Return a well-formed Protocol aligned to *scenario*."""
    lab = scenario.lab_manager_observation
    spec = scenario.hidden_reference_spec
    # Mirror the fields produced by _good_action, minus the action wrapper.
    technique = spec.summary[:60] if spec.summary else "replication_plan"
    equipment = list(lab.equipment_available[:1]) if lab.equipment_available else []
    reagents = list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
    rationale = (
        f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
        f"Target metric: {spec.target_metric}. "
        f"Target value: {spec.target_value}. "
        "Stay within budget and schedule."
    )
    return Protocol(
        sample_size=10,
        controls=["baseline", "ablation"],
        technique=technique,
        duration_days=max(1, min(2, lab.time_limit_days)),
        required_equipment=equipment,
        required_reagents=reagents,
        rationale=rationale,
    )
def _bad_duration_action() -> ScientistAction:
    """Return a proposal whose 999-day duration cannot pass validation."""
    return ScientistAction(
        action_type="propose_protocol",
        duration_days=999,
        rationale="Duration is impossibly long for this scenario.",
        sample_size=5,
        controls=["baseline"],
        technique="some technique",
        required_equipment=[],
        required_reagents=[],
        questions=[],
    )
| def _canonical_step(result) -> dict: | |
| data = result.model_dump() | |
| data["info"].pop("episode_id", None) | |
| return data | |
def _run_seeded_sequence(
    *,
    seed: int,
    template: str,
    difficulty: str,
    action_builder,
):
    """Run one fully seeded episode and return comparable plain-dict data.

    Returns (initial observation dump, canonical step dumps, final state
    dump), suitable for direct equality comparison between replays.
    """
    env = ReplicaLabEnv()
    initial = env.reset(seed=seed, scenario=template, difficulty=difficulty)
    actions = action_builder(_scenario(template, difficulty, seed=seed))
    steps = []
    for action in actions:
        steps.append(_canonical_step(env.step(action)))
    return initial.model_dump(), steps, env.state().model_dump()
| # --------------------------------------------------------------------------- | |
| # TST 01 — reset returns valid observations | |
| # --------------------------------------------------------------------------- | |
class TestReset:
    """TST 01: reset() returns a well-formed Observation."""

    def test_reset_returns_observation_with_both_roles(self) -> None:
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="math_reasoning", difficulty="easy"
        )
        assert obs.scientist is not None
        assert obs.lab_manager is not None

    def test_reset_scientist_fields_populated(self) -> None:
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="ml_benchmark", difficulty="easy"
        )
        sci = obs.scientist
        assert sci.paper_title
        assert sci.paper_hypothesis
        assert sci.experiment_goal
        assert sci.round_number == 0
        assert sci.max_rounds > 0
        assert sci.current_protocol is None
        assert sci.conversation_history == []

    def test_reset_lab_manager_fields_populated(self) -> None:
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="finance_trading", difficulty="easy"
        )
        manager = obs.lab_manager
        assert manager.budget_total > 0
        assert manager.budget_remaining > 0
        assert manager.staff_count > 0
        assert manager.time_limit_days > 0
        assert manager.round_number == 0

    def test_reset_preserves_booked_and_out_of_stock(self) -> None:
        """ENV 02: booked/out-of-stock data comes from the scenario pack,
        not hardcoded empty lists."""
        # Hard difficulty is the most likely to carry unavailable resources.
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="ml_benchmark", difficulty="hard"
        )
        manager = obs.lab_manager
        # The lists must exist; their contents depend on the scenario pack.
        assert isinstance(manager.equipment_booked, list)
        assert isinstance(manager.reagents_out_of_stock, list)
        assert isinstance(manager.safety_restrictions, list)
        assert len(manager.safety_restrictions) > 0  # always has at least one

    def test_reset_state_round_zero(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        snapshot = env.state()
        assert snapshot.round_number == 0
        assert snapshot.done is False
        assert snapshot.agreement_reached is False

    def test_reset_generates_episode_id(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        episode = env.episode_id()
        assert episode
        assert len(episode) > 10  # UUID-sized

    def test_reset_clears_previous_episode(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1, scenario="math_reasoning")
        previous = env.episode_id()
        env.reset(seed=2, scenario="ml_benchmark")
        assert env.episode_id() != previous
        assert env.state().round_number == 0

    def test_reset_all_templates_and_difficulties(self) -> None:
        env = ReplicaLabEnv()
        for tpl in ("math_reasoning", "ml_benchmark", "finance_trading"):
            for level in ("easy", "medium", "hard"):
                obs = env.reset(seed=7, scenario=tpl, difficulty=level)
                assert obs.scientist is not None
                assert obs.lab_manager is not None
| # --------------------------------------------------------------------------- | |
| # TST 03 — invalid action returns structured error, env survives | |
| # --------------------------------------------------------------------------- | |
class TestInvalidAction:
    """TST 03: env returns structured error for invalid proposals.

    All invalid proposals are built via the shared ``_bad_duration_action``
    helper; the previous inline copies had drifted apart (three hand-built
    variants of the same 999-day action, one duplicating the helper's
    rationale verbatim).
    """

    def test_invalid_duration_returns_error_string(self) -> None:
        """A duration exceeding the time limit yields a structured error."""
        env = ReplicaLabEnv()
        env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
        result = env.step(_bad_duration_action())
        assert result.done is False
        assert result.info.error is not None
        assert "Validation errors" in result.info.error

    def test_env_survives_after_invalid_action(self) -> None:
        """After returning an error, the env still accepts valid actions."""
        env = ReplicaLabEnv()
        scenario = _scenario("math_reasoning", "easy")
        env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
        # Send invalid action first.
        error_result = env.step(_bad_duration_action())
        assert error_result.info.error is not None
        # A valid follow-up action must still be processed normally.
        result = env.step(_good_action(scenario))
        assert result.info.error is None
        assert result.done is False

    def test_invalid_action_does_not_advance_round(self) -> None:
        """Rejected actions leave the round counter untouched."""
        env = ReplicaLabEnv()
        env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
        result = env.step(_bad_duration_action())
        assert result.info.error is not None
        assert env.state().round_number == 0

    def test_request_info_always_passes_validation(self) -> None:
        """request_info actions are never rejected by protocol validation."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        result = env.step(_request_info_action())
        assert result.info.error is None
        assert result.done is False
| # --------------------------------------------------------------------------- | |
| # TST 02 — valid step advances round, terminal path | |
| # --------------------------------------------------------------------------- | |
class TestStep:
    """TST 02: step() advances rounds and terminal path returns correct shape."""

    def test_step_advances_round_number(self) -> None:
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        res = env.step(_good_action(scen))
        assert env.state().round_number == 1
        assert res.done is False
        assert res.reward > 0.0
        assert res.info.step_reward_components["protocol_delta_bonus"] > 0.0
        assert res.info.cumulative_reward == res.reward

    def test_step_returns_observations(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        res = env.step(_good_action(_scenario()))
        obs = res.observation
        assert obs is not None
        assert obs.scientist is not None
        assert obs.lab_manager is not None
        assert obs.scientist.round_number == 1

    def test_step_records_conversation_history(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        history = env.state().conversation_history
        # One entry per party: the scientist speaks, the lab manager replies.
        assert len(history) == 2
        assert [entry.role for entry in history] == ["scientist", "lab_manager"]

    def test_accept_with_protocol_terminates(self) -> None:
        """Accepting once a protocol exists ends the episode in agreement."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))  # propose first
        res = env.step(_accept_action())  # then accept
        assert res.done is True
        assert res.info.agreement_reached is True

    def test_accept_terminal_step_has_real_reward(self) -> None:
        """ENV 06: terminal accept computes real judge scores, not stub 0.8."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        res = env.step(_accept_action())
        assert res.done is True
        assert res.reward > 0.0
        breakdown = res.info.reward_breakdown
        assert breakdown is not None
        for score in (breakdown.rigor, breakdown.feasibility, breakdown.fidelity):
            assert 0.0 <= score <= 1.0
        # Guard against the old hard-coded stub that scored everything 0.8.
        assert (breakdown.rigor, breakdown.feasibility, breakdown.fidelity) != (
            0.8,
            0.8,
            0.8,
        )

    def test_max_rounds_terminates(self) -> None:
        """Reaching max_rounds terminates without agreement."""
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        res = None
        for _ in range(env.state().max_rounds):
            res = env.step(_good_action(scen))
        assert res.done is True
        assert res.info.agreement_reached is False
        assert res.reward < 0.0
        assert res.info.reward_breakdown is not None
        assert res.info.reward_breakdown.penalties["timeout"] > 0.0

    def test_step_info_has_round_and_episode_id(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        res = env.step(_good_action(_scenario()))
        assert res.info.round == 1
        assert res.info.episode_id == env.episode_id()

    def test_full_episode_propose_then_accept(self) -> None:
        """A minimal complete episode: one proposal followed by acceptance."""
        env = ReplicaLabEnv()
        scen = _scenario("ml_benchmark", "easy")
        env.reset(seed=42, scenario="ml_benchmark", difficulty="easy")
        first = env.step(_good_action(scen))
        assert not first.done
        final = env.step(_accept_action())
        assert final.done
        assert final.info.agreement_reached
        assert final.reward > 0
| # --------------------------------------------------------------------------- | |
| # ENV 07 — state() returns deep snapshot | |
| # --------------------------------------------------------------------------- | |
class TestStateSnapshot:
    """ENV 07: state() returns a deep copy, not a reference."""

    def test_state_is_deep_copy(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        snapshot = env.state()
        snapshot.round_number = 999  # mutate the copy...
        assert env.state().round_number == 0  # ...without touching the env

    def test_state_history_is_independent(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        snapshot = env.state()
        before = len(snapshot.conversation_history)
        snapshot.conversation_history.clear()
        # Clearing the snapshot's list must not drain the env's history.
        assert len(env.state().conversation_history) == before
| # --------------------------------------------------------------------------- | |
| # ENV 08 — close() and _ensure_open() | |
| # --------------------------------------------------------------------------- | |
class TestCloseReopen:
    """ENV 08: close/reopen lifecycle."""

    def test_close_is_idempotent(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        # Closing twice must be a no-op, not an error.
        env.close()
        env.close()

    def test_step_after_close_raises(self) -> None:
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=1)
        env.close()
        with pytest.raises(RuntimeError, match="closed"):
            env.step(_good_action(scen))

    def test_reset_reopens_closed_env(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        env.close()
        # reset() implicitly reopens the environment...
        obs = env.reset(seed=2)
        assert obs.scientist is not None
        # ...and stepping works again afterwards.
        res = env.step(_good_action(_scenario()))
        assert res.info.error is None
| # --------------------------------------------------------------------------- | |
| # JDG 04-05 — rubric unit tests | |
| # --------------------------------------------------------------------------- | |
class TestRubric:
    """JDG 04-05: compute_total_reward and build_reward_breakdown."""

    def test_compute_total_reward_formula(self) -> None:
        """10 × rigor × feasibility × fidelity + bonuses − penalties."""
        breakdown = RewardBreakdown(
            rigor=1.0,
            feasibility=1.0,
            fidelity=1.0,
            efficiency_bonus=0.5,
            communication_bonus=0.0,
            penalties={},
        )
        assert compute_total_reward(breakdown) == 10.5  # 10*1*1*1 + 0.5

    def test_compute_total_reward_with_penalties(self) -> None:
        breakdown = RewardBreakdown(
            rigor=0.8,
            feasibility=0.9,
            fidelity=0.7,
            efficiency_bonus=0.0,
            communication_bonus=0.0,
            penalties={"timeout": 1.0, "invalid": 0.5},
        )
        # Core 10 * 0.8 * 0.9 * 0.7 = 5.04; minus 1.5 in penalties = 3.54
        expected = 10 * 0.8 * 0.9 * 0.7 - 1.5
        assert abs(compute_total_reward(breakdown) - expected) < 0.001

    def test_compute_total_reward_zero_scores(self) -> None:
        # A single zero component zeroes the whole multiplicative core.
        zeroed = RewardBreakdown(rigor=0.0, feasibility=0.5, fidelity=0.5)
        assert compute_total_reward(zeroed) == 0.0

    def test_build_reward_breakdown_returns_valid_scores(self) -> None:
        scen = _scenario("ml_benchmark", "easy")
        breakdown = build_reward_breakdown(
            protocol=_good_protocol(scen),
            scenario=scen,
            rounds_used=1,
            max_rounds=6,
        )
        for score in (breakdown.rigor, breakdown.feasibility, breakdown.fidelity):
            assert 0.0 <= score <= 1.0
        assert breakdown.efficiency_bonus >= 0.0

    def test_build_reward_breakdown_efficiency_bonus(self) -> None:
        """Finishing in fewer rounds gives a higher bonus."""
        scen = _scenario()
        plan = _good_protocol(scen)
        quick = build_reward_breakdown(plan, scen, rounds_used=1, max_rounds=6)
        late = build_reward_breakdown(plan, scen, rounds_used=5, max_rounds=6)
        assert quick.efficiency_bonus > late.efficiency_bonus

    def test_build_reward_breakdown_is_deterministic(self) -> None:
        scen = _scenario("finance_trading", "medium")
        plan = _good_protocol(scen)
        first = build_reward_breakdown(plan, scen, rounds_used=2, max_rounds=6)
        second = build_reward_breakdown(plan, scen, rounds_used=2, max_rounds=6)
        assert first.rigor == second.rigor
        assert first.feasibility == second.feasibility
        assert first.fidelity == second.fidelity
        assert first.efficiency_bonus == second.efficiency_bonus

    def test_total_reward_matches_manual_calculation(self) -> None:
        scen = _scenario("math_reasoning", "easy")
        breakdown = build_reward_breakdown(
            _good_protocol(scen), scen, rounds_used=2, max_rounds=6
        )
        core = (
            10.0
            * breakdown.rigor
            * breakdown.feasibility
            * breakdown.fidelity
            * breakdown.parsimony
        )
        expected = (
            core
            + breakdown.efficiency_bonus
            + breakdown.communication_bonus
            + breakdown.domain_emphasis_bonus
            - sum(breakdown.penalties.values())
        )
        assert abs(compute_total_reward(breakdown) - expected) < 0.0001
| # --------------------------------------------------------------------------- | |
| # ENV 06 — terminal reward wiring | |
| # --------------------------------------------------------------------------- | |
class TestEnvReward:
    """ENV 06: real judge scoring at terminal steps."""

    def test_agreement_terminal_has_breakdown_notes_verdict(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        res = env.step(_accept_action())
        assert res.done
        assert res.info.reward_breakdown is not None
        assert res.info.judge_notes is not None
        assert res.info.verdict == "accept"
        assert "rigor" in res.info.judge_notes

    def test_no_agreement_terminal_is_deterministic(self) -> None:
        def run_timeout_episode():
            env = ReplicaLabEnv()
            scen = _scenario()
            env.reset(seed=42)
            last = None
            for _ in range(env.state().max_rounds):
                last = env.step(_good_action(scen))
            return last

        first = run_timeout_episode()
        second = run_timeout_episode()
        assert first.reward == second.reward
        assert first.info.verdict == second.info.verdict

    def test_timeout_verdict(self) -> None:
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        last = None
        for _ in range(env.state().max_rounds):
            last = env.step(_good_action(scen))
        assert last.done
        assert last.info.verdict == "timeout"
        assert last.info.reward_breakdown is not None
        assert last.reward < 0.0
        assert last.info.reward_breakdown.penalties["timeout"] > 0.0

    def test_episode_state_stores_final_scores(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        env.step(_accept_action())
        final = env.state()
        assert final.done
        assert final.agreement_reached
        assert final.rigor_score > 0.0
        assert final.feasibility_score > 0.0
        assert final.fidelity_score > 0.0
        assert final.reward > 0.0
| # --------------------------------------------------------------------------- | |
| # ENV 11 — canonical judge audit payload in terminal outputs | |
| # --------------------------------------------------------------------------- | |
class TestJudgeAudit:
    """ENV 11: structured audit from JDG 11 threaded into terminal outputs."""

    def test_accept_terminal_has_full_audit(self) -> None:
        """Agreement terminal step exposes all audit fields."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        res = env.step(_accept_action())
        assert res.done
        assert res.info.verdict == "accept"
        notes = res.info.judge_notes
        assert notes is not None
        assert len(notes) > 0
        # judge_notes from explain_reward labels every rubric component.
        for label in ("Rigor:", "Feasibility:", "Fidelity:", "Total reward:"):
            assert label in notes
        assert isinstance(res.info.top_failure_reasons, list)

    def test_timeout_terminal_has_audit_with_timeout_reason(self) -> None:
        """Timeout terminal step has verdict=timeout and failure reason."""
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        last = None
        for _ in range(env.state().max_rounds):
            last = env.step(_good_action(scen))
        assert last.done
        assert last.info.verdict == "timeout"
        reasons = last.info.top_failure_reasons
        assert isinstance(reasons, list)
        assert any("round limit" in reason.lower() for reason in reasons)

    def test_non_terminal_step_has_empty_audit(self) -> None:
        """Non-terminal steps do not carry audit payload."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        res = env.step(_good_action(_scenario()))
        assert not res.done
        assert res.info.judge_notes is None
        assert res.info.verdict is None
        assert res.info.top_failure_reasons == []

    def test_state_after_accept_carries_audit_fields(self) -> None:
        """EpisodeState contains judge_notes, verdict, top_failure_reasons."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        env.step(_accept_action())
        final = env.state()
        assert final.verdict == "accept"
        assert len(final.judge_notes) > 0
        assert "Rigor:" in final.judge_notes
        assert isinstance(final.top_failure_reasons, list)

    def test_state_after_timeout_carries_audit_fields(self) -> None:
        """EpisodeState after timeout has correct verdict and reasons."""
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        for _ in range(env.state().max_rounds):
            env.step(_good_action(scen))
        final = env.state()
        assert final.verdict == "timeout"
        assert len(final.judge_notes) > 0
        assert any(
            "round limit" in reason.lower()
            for reason in final.top_failure_reasons
        )

    def test_audit_deterministic(self) -> None:
        """Same episode produces identical audit output."""

        def run_episode():
            env = ReplicaLabEnv()
            env.reset(seed=42)
            env.step(_good_action(_scenario()))
            return env.step(_accept_action())

        first = run_episode()
        second = run_episode()
        assert first.info.judge_notes == second.info.judge_notes
        assert first.info.verdict == second.info.verdict
        assert first.info.top_failure_reasons == second.info.top_failure_reasons

    def test_state_audit_fields_empty_before_terminal(self) -> None:
        """EpisodeState audit fields are empty before a terminal step."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        mid = env.state()
        assert mid.judge_notes == ""
        assert mid.verdict == ""
        assert mid.top_failure_reasons == []
| # --------------------------------------------------------------------------- | |
| # ENV 10 — deterministic replay and broader environment regression | |
| # --------------------------------------------------------------------------- | |
class TestReplayDeterminism:
    """ENV 10: same seed + same actions => same trajectory and final state.

    Fix: the test methods declared ``template``/``difficulty`` parameters but
    carried no ``pytest.mark.parametrize`` decorators, so pytest collection
    would fail with "fixture 'template' not found" (unless same-named fixtures
    exist in a conftest — TODO confirm). Parametrization is now explicit over
    the same template/difficulty sets exercised elsewhere in this module.
    """

    TEMPLATES = ("math_reasoning", "ml_benchmark", "finance_trading")
    DIFFICULTIES = ("easy", "medium", "hard")

    @pytest.mark.parametrize("difficulty", DIFFICULTIES)
    @pytest.mark.parametrize("template", TEMPLATES)
    def test_same_seed_same_initial_observation(
        self,
        template: str,
        difficulty: str,
    ) -> None:
        """Two envs reset with the same seed produce identical observations."""
        env1 = ReplicaLabEnv()
        env2 = ReplicaLabEnv()
        obs1 = env1.reset(seed=17, scenario=template, difficulty=difficulty)
        obs2 = env2.reset(seed=17, scenario=template, difficulty=difficulty)
        assert obs1.model_dump() == obs2.model_dump()

    @pytest.mark.parametrize("difficulty", DIFFICULTIES)
    @pytest.mark.parametrize("template", TEMPLATES)
    def test_same_seed_same_action_sequence_same_trajectory(
        self,
        template: str,
        difficulty: str,
    ) -> None:
        """Replaying the same action sequence yields an identical trajectory."""

        def build_actions(scenario):
            return [_good_action(scenario), _accept_action()]

        first = _run_seeded_sequence(
            seed=23,
            template=template,
            difficulty=difficulty,
            action_builder=build_actions,
        )
        second = _run_seeded_sequence(
            seed=23,
            template=template,
            difficulty=difficulty,
            action_builder=build_actions,
        )
        assert first == second

    @pytest.mark.parametrize("template", TEMPLATES)
    def test_timeout_replay_is_deterministic(self, template: str) -> None:
        """A timed-out episode replays identically, step for step."""

        def build_actions(scenario):
            return [
                _good_action(scenario)
                for _ in range(scenario.scientist_observation.max_rounds)
            ]

        first = _run_seeded_sequence(
            seed=31,
            template=template,
            difficulty="medium",
            action_builder=build_actions,
        )
        second = _run_seeded_sequence(
            seed=31,
            template=template,
            difficulty="medium",
            action_builder=build_actions,
        )
        _obs1, steps1, state1 = first
        _obs2, steps2, state2 = second
        assert steps1 == steps2
        assert state1 == state2
        assert steps1[-1]["done"] is True
        assert steps1[-1]["info"]["verdict"] == "timeout"

    def test_invalid_action_replay_is_deterministic(self) -> None:
        """Invalid actions error identically on replay and never advance."""

        def build_actions(_scenario):
            return [_bad_duration_action(), _bad_duration_action()]

        first = _run_seeded_sequence(
            seed=41,
            template="math_reasoning",
            difficulty="easy",
            action_builder=build_actions,
        )
        second = _run_seeded_sequence(
            seed=41,
            template="math_reasoning",
            difficulty="easy",
            action_builder=build_actions,
        )
        assert first == second
        _obs, steps, state = first
        assert steps[0]["info"]["error"] is not None
        assert steps[0]["done"] is False
        assert state["round_number"] == 0

    @pytest.mark.parametrize("template", TEMPLATES)
    def test_terminal_audit_payload_is_replay_stable(self, template: str) -> None:
        """Judge audit fields are identical across replays."""

        def build_actions(scenario):
            return [_good_action(scenario), _accept_action()]

        _obs1, steps1, state1 = _run_seeded_sequence(
            seed=59,
            template=template,
            difficulty="easy",
            action_builder=build_actions,
        )
        _obs2, steps2, state2 = _run_seeded_sequence(
            seed=59,
            template=template,
            difficulty="easy",
            action_builder=build_actions,
        )
        assert steps1[-1]["info"]["judge_notes"] == steps2[-1]["info"]["judge_notes"]
        assert steps1[-1]["info"]["verdict"] == steps2[-1]["info"]["verdict"]
        assert (
            steps1[-1]["info"]["top_failure_reasons"]
            == steps2[-1]["info"]["top_failure_reasons"]
        )
        assert state1["judge_notes"] == state2["judge_notes"]
        assert state1["verdict"] == state2["verdict"]
        assert state1["top_failure_reasons"] == state2["top_failure_reasons"]