Spaces:
Running
Running
| """Tests for ENV 01–08 and JDG 04–05. | |
| TST 01: reset returns valid observations | |
| TST 02: valid step advances round, terminal path returns correct shape | |
| TST 03: invalid action returns structured error, env survives | |
| """ | |
| from __future__ import annotations | |
| import pytest | |
| from replicalab.env import ReplicaLabEnv | |
| from replicalab.models import ( | |
| Protocol, | |
| RewardBreakdown, | |
| ScientistAction, | |
| ) | |
| from replicalab.scenarios import generate_scenario | |
| from replicalab.scoring.rubric import build_reward_breakdown, compute_total_reward | |
| # --------------------------------------------------------------------------- | |
| # Helpers | |
| # --------------------------------------------------------------------------- | |
def _scenario(
    template: str = "math_reasoning",
    difficulty: str = "easy",
    seed: int = 42,
):
    """Generate a deterministic scenario pack for the given parameters."""
    params = {"seed": seed, "template": template, "difficulty": difficulty}
    return generate_scenario(**params)
def _good_action(scenario) -> ScientistAction:
    """Return a valid propose_protocol action tailored to *scenario*."""
    lab = scenario.lab_manager_observation
    spec = scenario.hidden_reference_spec
    # Derive every scenario-dependent field up front for readability.
    technique = spec.summary[:60] if spec.summary else "replication_plan"
    equipment = list(lab.equipment_available[:1]) if lab.equipment_available else []
    reagents = list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
    rationale = (
        f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
        f"Target metric: {spec.target_metric}. "
        f"Target value: {spec.target_value}. "
        "Stay within budget and schedule."
    )
    return ScientistAction(
        action_type="propose_protocol",
        sample_size=10,
        controls=["baseline", "ablation"],
        technique=technique,
        duration_days=max(1, min(2, lab.time_limit_days)),
        required_equipment=equipment,
        required_reagents=reagents,
        questions=[],
        rationale=rationale,
    )
def _accept_action() -> ScientistAction:
    """Return a minimal, valid accept action (all payload fields empty)."""
    empty_payload = dict(
        sample_size=0,
        controls=[],
        technique="",
        duration_days=0,
        required_equipment=[],
        required_reagents=[],
        questions=[],
        rationale="",
    )
    return ScientistAction(action_type="accept", **empty_payload)
def _request_info_action() -> ScientistAction:
    """Return a valid request_info action carrying a single question."""
    return ScientistAction(
        action_type="request_info",
        questions=["What equipment is available?"],
        sample_size=0,
        duration_days=0,
        controls=[],
        technique="",
        required_equipment=[],
        required_reagents=[],
        rationale="",
    )
def _good_protocol(scenario) -> Protocol:
    """Return a well-formed Protocol aligned to *scenario*."""
    lab = scenario.lab_manager_observation
    spec = scenario.hidden_reference_spec
    # Mirror the fields produced by _good_action, minus the action wrapper.
    technique = spec.summary[:60] if spec.summary else "replication_plan"
    equipment = list(lab.equipment_available[:1]) if lab.equipment_available else []
    reagents = list(lab.reagents_in_stock[:1]) if lab.reagents_in_stock else []
    rationale = (
        f"Plan addresses: {', '.join(spec.required_elements[:2])}. "
        f"Target metric: {spec.target_metric}. "
        f"Target value: {spec.target_value}. "
        "Stay within budget and schedule."
    )
    return Protocol(
        sample_size=10,
        controls=["baseline", "ablation"],
        technique=technique,
        duration_days=max(1, min(2, lab.time_limit_days)),
        required_equipment=equipment,
        required_reagents=reagents,
        rationale=rationale,
    )
def _bad_duration_action() -> ScientistAction:
    """Return a proposal whose 999-day duration cannot pass validation."""
    return ScientistAction(
        action_type="propose_protocol",
        duration_days=999,
        rationale="Duration is impossibly long for this scenario.",
        sample_size=5,
        controls=["baseline"],
        technique="some technique",
        required_equipment=[],
        required_reagents=[],
        questions=[],
    )
| def _canonical_step(result) -> dict: | |
| data = result.model_dump() | |
| data["info"].pop("episode_id", None) | |
| return data | |
def _run_seeded_sequence(
    *,
    seed: int,
    template: str,
    difficulty: str,
    action_builder,
):
    """Run one fully seeded episode and return comparable plain-dict data.

    Returns (initial observation dump, canonical step dumps, final state
    dump), suitable for direct equality comparison between replays.
    """
    env = ReplicaLabEnv()
    initial = env.reset(seed=seed, scenario=template, difficulty=difficulty)
    actions = action_builder(_scenario(template, difficulty, seed=seed))
    steps = []
    for action in actions:
        steps.append(_canonical_step(env.step(action)))
    return initial.model_dump(), steps, env.state().model_dump()
| # --------------------------------------------------------------------------- | |
| # TST 01 — reset returns valid observations | |
| # --------------------------------------------------------------------------- | |
class TestReset:
    """TST 01: reset() returns a well-formed Observation."""

    def test_reset_returns_observation_with_both_roles(self) -> None:
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="math_reasoning", difficulty="easy"
        )
        assert obs.scientist is not None
        assert obs.lab_manager is not None

    def test_reset_scientist_fields_populated(self) -> None:
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="ml_benchmark", difficulty="easy"
        )
        sci = obs.scientist
        assert sci.paper_title
        assert sci.paper_hypothesis
        assert sci.experiment_goal
        assert sci.round_number == 0
        assert sci.max_rounds > 0
        assert sci.current_protocol is None
        assert sci.conversation_history == []

    def test_reset_lab_manager_fields_populated(self) -> None:
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="finance_trading", difficulty="easy"
        )
        manager = obs.lab_manager
        assert manager.budget_total > 0
        assert manager.budget_remaining > 0
        assert manager.staff_count > 0
        assert manager.time_limit_days > 0
        assert manager.round_number == 0

    def test_reset_preserves_booked_and_out_of_stock(self) -> None:
        """ENV 02: booked/out-of-stock data comes from the scenario pack,
        not hardcoded empty lists."""
        # Hard difficulty is the most likely to carry unavailable resources.
        obs = ReplicaLabEnv().reset(
            seed=42, scenario="ml_benchmark", difficulty="hard"
        )
        manager = obs.lab_manager
        # The lists must exist; their contents depend on the scenario pack.
        assert isinstance(manager.equipment_booked, list)
        assert isinstance(manager.reagents_out_of_stock, list)
        assert isinstance(manager.safety_restrictions, list)
        assert len(manager.safety_restrictions) > 0  # always has at least one

    def test_reset_state_round_zero(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        snapshot = env.state()
        assert snapshot.round_number == 0
        assert snapshot.done is False
        assert snapshot.agreement_reached is False

    def test_reset_generates_episode_id(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        episode = env.episode_id()
        assert episode
        assert len(episode) > 10  # UUID-sized

    def test_reset_clears_previous_episode(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1, scenario="math_reasoning")
        previous = env.episode_id()
        env.reset(seed=2, scenario="ml_benchmark")
        assert env.episode_id() != previous
        assert env.state().round_number == 0

    def test_reset_all_templates_and_difficulties(self) -> None:
        env = ReplicaLabEnv()
        for tpl in ("math_reasoning", "ml_benchmark", "finance_trading"):
            for level in ("easy", "medium", "hard"):
                obs = env.reset(seed=7, scenario=tpl, difficulty=level)
                assert obs.scientist is not None
                assert obs.lab_manager is not None
| # --------------------------------------------------------------------------- | |
| # TST 03 — invalid action returns structured error, env survives | |
| # --------------------------------------------------------------------------- | |
class TestInvalidAction:
    """TST 03: env returns structured error for invalid proposals.

    All invalid proposals are built via the shared ``_bad_duration_action``
    helper; the previous inline copies had drifted apart (three hand-built
    variants of the same 999-day action, one duplicating the helper's
    rationale verbatim).
    """

    def test_invalid_duration_returns_error_string(self) -> None:
        """A duration exceeding the time limit yields a structured error."""
        env = ReplicaLabEnv()
        env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
        result = env.step(_bad_duration_action())
        assert result.done is False
        assert result.info.error is not None
        assert "Validation errors" in result.info.error

    def test_env_survives_after_invalid_action(self) -> None:
        """After returning an error, the env still accepts valid actions."""
        env = ReplicaLabEnv()
        scenario = _scenario("math_reasoning", "easy")
        env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
        # Send invalid action first.
        error_result = env.step(_bad_duration_action())
        assert error_result.info.error is not None
        # A valid follow-up action must still be processed normally.
        result = env.step(_good_action(scenario))
        assert result.info.error is None
        assert result.done is False

    def test_invalid_action_does_not_advance_round(self) -> None:
        """Rejected actions leave the round counter untouched."""
        env = ReplicaLabEnv()
        env.reset(seed=42, scenario="math_reasoning", difficulty="easy")
        result = env.step(_bad_duration_action())
        assert result.info.error is not None
        assert env.state().round_number == 0

    def test_request_info_always_passes_validation(self) -> None:
        """request_info actions are never rejected by protocol validation."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        result = env.step(_request_info_action())
        assert result.info.error is None
        assert result.done is False
| # --------------------------------------------------------------------------- | |
| # TST 02 — valid step advances round, terminal path | |
| # --------------------------------------------------------------------------- | |
class TestStep:
    """TST 02: step() advances rounds and terminal path returns correct shape."""

    def test_step_advances_round_number(self) -> None:
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        res = env.step(_good_action(scen))
        assert env.state().round_number == 1
        assert res.done is False
        assert res.reward > 0.0
        assert res.info.step_reward_components["protocol_delta_bonus"] > 0.0
        assert res.info.cumulative_reward == res.reward

    def test_step_returns_observations(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        res = env.step(_good_action(_scenario()))
        obs = res.observation
        assert obs is not None
        assert obs.scientist is not None
        assert obs.lab_manager is not None
        assert obs.scientist.round_number == 1

    def test_step_records_conversation_history(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        history = env.state().conversation_history
        # One entry per party: the scientist speaks, the lab manager replies.
        assert len(history) == 2
        assert [entry.role for entry in history] == ["scientist", "lab_manager"]

    def test_accept_with_protocol_terminates(self) -> None:
        """Accepting once a protocol exists ends the episode in agreement."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))  # propose first
        res = env.step(_accept_action())  # then accept
        assert res.done is True
        assert res.info.agreement_reached is True

    def test_accept_terminal_step_has_real_reward(self) -> None:
        """ENV 06: terminal accept computes real judge scores, not stub 0.8."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        res = env.step(_accept_action())
        assert res.done is True
        assert res.reward > 0.0
        breakdown = res.info.reward_breakdown
        assert breakdown is not None
        for score in (breakdown.rigor, breakdown.feasibility, breakdown.fidelity):
            assert 0.0 <= score <= 1.0
        # Guard against the old hard-coded stub that scored everything 0.8.
        assert (breakdown.rigor, breakdown.feasibility, breakdown.fidelity) != (
            0.8,
            0.8,
            0.8,
        )

    def test_max_rounds_terminates(self) -> None:
        """Reaching max_rounds terminates without agreement."""
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        res = None
        for _ in range(env.state().max_rounds):
            res = env.step(_good_action(scen))
        assert res.done is True
        assert res.info.agreement_reached is False
        assert res.reward < 0.0
        assert res.info.reward_breakdown is not None
        assert res.info.reward_breakdown.penalties["timeout"] > 0.0

    def test_step_info_has_round_and_episode_id(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        res = env.step(_good_action(_scenario()))
        assert res.info.round == 1
        assert res.info.episode_id == env.episode_id()

    def test_full_episode_propose_then_accept(self) -> None:
        """A minimal complete episode: one proposal followed by acceptance."""
        env = ReplicaLabEnv()
        scen = _scenario("ml_benchmark", "easy")
        env.reset(seed=42, scenario="ml_benchmark", difficulty="easy")
        first = env.step(_good_action(scen))
        assert not first.done
        final = env.step(_accept_action())
        assert final.done
        assert final.info.agreement_reached
        assert final.reward > 0
| # --------------------------------------------------------------------------- | |
| # ENV 07 — state() returns deep snapshot | |
| # --------------------------------------------------------------------------- | |
class TestStateSnapshot:
    """ENV 07: state() returns a deep copy, not a reference."""

    def test_state_is_deep_copy(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        snapshot = env.state()
        snapshot.round_number = 999  # mutate the copy...
        assert env.state().round_number == 0  # ...without touching the env

    def test_state_history_is_independent(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        snapshot = env.state()
        before = len(snapshot.conversation_history)
        snapshot.conversation_history.clear()
        # Clearing the snapshot's list must not drain the env's history.
        assert len(env.state().conversation_history) == before
| # --------------------------------------------------------------------------- | |
| # ENV 08 — close() and _ensure_open() | |
| # --------------------------------------------------------------------------- | |
class TestCloseReopen:
    """ENV 08: close/reopen lifecycle."""

    def test_close_is_idempotent(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        # Closing twice must be a no-op, not an error.
        env.close()
        env.close()

    def test_step_after_close_raises(self) -> None:
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=1)
        env.close()
        with pytest.raises(RuntimeError, match="closed"):
            env.step(_good_action(scen))

    def test_reset_reopens_closed_env(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=1)
        env.close()
        # reset() implicitly reopens the environment...
        obs = env.reset(seed=2)
        assert obs.scientist is not None
        # ...and stepping works again afterwards.
        res = env.step(_good_action(_scenario()))
        assert res.info.error is None
| # --------------------------------------------------------------------------- | |
| # JDG 04-05 — rubric unit tests | |
| # --------------------------------------------------------------------------- | |
class TestRubric:
    """JDG 04-05: compute_total_reward and build_reward_breakdown."""

    def test_compute_total_reward_formula(self) -> None:
        """10 × rigor × feasibility × fidelity + bonuses − penalties."""
        breakdown = RewardBreakdown(
            rigor=1.0,
            feasibility=1.0,
            fidelity=1.0,
            efficiency_bonus=0.5,
            communication_bonus=0.0,
            penalties={},
        )
        assert compute_total_reward(breakdown) == 10.5  # 10*1*1*1 + 0.5

    def test_compute_total_reward_with_penalties(self) -> None:
        breakdown = RewardBreakdown(
            rigor=0.8,
            feasibility=0.9,
            fidelity=0.7,
            efficiency_bonus=0.0,
            communication_bonus=0.0,
            penalties={"timeout": 1.0, "invalid": 0.5},
        )
        # Core 10 * 0.8 * 0.9 * 0.7 = 5.04; minus 1.5 in penalties = 3.54
        expected = 10 * 0.8 * 0.9 * 0.7 - 1.5
        assert abs(compute_total_reward(breakdown) - expected) < 0.001

    def test_compute_total_reward_zero_scores(self) -> None:
        # A single zero component zeroes the whole multiplicative core.
        zeroed = RewardBreakdown(rigor=0.0, feasibility=0.5, fidelity=0.5)
        assert compute_total_reward(zeroed) == 0.0

    def test_build_reward_breakdown_returns_valid_scores(self) -> None:
        scen = _scenario("ml_benchmark", "easy")
        breakdown = build_reward_breakdown(
            protocol=_good_protocol(scen),
            scenario=scen,
            rounds_used=1,
            max_rounds=6,
        )
        for score in (breakdown.rigor, breakdown.feasibility, breakdown.fidelity):
            assert 0.0 <= score <= 1.0
        assert breakdown.efficiency_bonus >= 0.0

    def test_build_reward_breakdown_efficiency_bonus(self) -> None:
        """Finishing in fewer rounds gives a higher bonus."""
        scen = _scenario()
        plan = _good_protocol(scen)
        quick = build_reward_breakdown(plan, scen, rounds_used=1, max_rounds=6)
        late = build_reward_breakdown(plan, scen, rounds_used=5, max_rounds=6)
        assert quick.efficiency_bonus > late.efficiency_bonus

    def test_build_reward_breakdown_is_deterministic(self) -> None:
        scen = _scenario("finance_trading", "medium")
        plan = _good_protocol(scen)
        first = build_reward_breakdown(plan, scen, rounds_used=2, max_rounds=6)
        second = build_reward_breakdown(plan, scen, rounds_used=2, max_rounds=6)
        assert first.rigor == second.rigor
        assert first.feasibility == second.feasibility
        assert first.fidelity == second.fidelity
        assert first.efficiency_bonus == second.efficiency_bonus

    def test_total_reward_matches_manual_calculation(self) -> None:
        scen = _scenario("math_reasoning", "easy")
        breakdown = build_reward_breakdown(
            _good_protocol(scen), scen, rounds_used=2, max_rounds=6
        )
        core = (
            10.0
            * breakdown.rigor
            * breakdown.feasibility
            * breakdown.fidelity
            * breakdown.parsimony
        )
        expected = (
            core
            + breakdown.efficiency_bonus
            + breakdown.communication_bonus
            + breakdown.domain_emphasis_bonus
            - sum(breakdown.penalties.values())
        )
        assert abs(compute_total_reward(breakdown) - expected) < 0.0001
| # --------------------------------------------------------------------------- | |
| # ENV 06 — terminal reward wiring | |
| # --------------------------------------------------------------------------- | |
class TestEnvReward:
    """ENV 06: real judge scoring at terminal steps."""

    def test_agreement_terminal_has_breakdown_notes_verdict(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        res = env.step(_accept_action())
        assert res.done
        assert res.info.reward_breakdown is not None
        assert res.info.judge_notes is not None
        assert res.info.verdict == "accept"
        assert "rigor" in res.info.judge_notes

    def test_no_agreement_terminal_is_deterministic(self) -> None:
        def run_timeout_episode():
            env = ReplicaLabEnv()
            scen = _scenario()
            env.reset(seed=42)
            last = None
            for _ in range(env.state().max_rounds):
                last = env.step(_good_action(scen))
            return last

        first = run_timeout_episode()
        second = run_timeout_episode()
        assert first.reward == second.reward
        assert first.info.verdict == second.info.verdict

    def test_timeout_verdict(self) -> None:
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        last = None
        for _ in range(env.state().max_rounds):
            last = env.step(_good_action(scen))
        assert last.done
        assert last.info.verdict == "timeout"
        assert last.info.reward_breakdown is not None
        assert last.reward < 0.0
        assert last.info.reward_breakdown.penalties["timeout"] > 0.0

    def test_episode_state_stores_final_scores(self) -> None:
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        env.step(_accept_action())
        final = env.state()
        assert final.done
        assert final.agreement_reached
        assert final.rigor_score > 0.0
        assert final.feasibility_score > 0.0
        assert final.fidelity_score > 0.0
        assert final.reward > 0.0
| # --------------------------------------------------------------------------- | |
| # ENV 11 — canonical judge audit payload in terminal outputs | |
| # --------------------------------------------------------------------------- | |
class TestJudgeAudit:
    """ENV 11: structured audit from JDG 11 threaded into terminal outputs."""

    def test_accept_terminal_has_full_audit(self) -> None:
        """Agreement terminal step exposes all audit fields."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        res = env.step(_accept_action())
        assert res.done
        assert res.info.verdict == "accept"
        notes = res.info.judge_notes
        assert notes is not None
        assert len(notes) > 0
        # judge_notes from explain_reward labels every rubric component.
        for label in ("Rigor:", "Feasibility:", "Fidelity:", "Total reward:"):
            assert label in notes
        assert isinstance(res.info.top_failure_reasons, list)

    def test_timeout_terminal_has_audit_with_timeout_reason(self) -> None:
        """Timeout terminal step has verdict=timeout and failure reason."""
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        last = None
        for _ in range(env.state().max_rounds):
            last = env.step(_good_action(scen))
        assert last.done
        assert last.info.verdict == "timeout"
        reasons = last.info.top_failure_reasons
        assert isinstance(reasons, list)
        assert any("round limit" in reason.lower() for reason in reasons)

    def test_non_terminal_step_has_empty_audit(self) -> None:
        """Non-terminal steps do not carry audit payload."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        res = env.step(_good_action(_scenario()))
        assert not res.done
        assert res.info.judge_notes is None
        assert res.info.verdict is None
        assert res.info.top_failure_reasons == []

    def test_state_after_accept_carries_audit_fields(self) -> None:
        """EpisodeState contains judge_notes, verdict, top_failure_reasons."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        env.step(_accept_action())
        final = env.state()
        assert final.verdict == "accept"
        assert len(final.judge_notes) > 0
        assert "Rigor:" in final.judge_notes
        assert isinstance(final.top_failure_reasons, list)

    def test_state_after_timeout_carries_audit_fields(self) -> None:
        """EpisodeState after timeout has correct verdict and reasons."""
        env = ReplicaLabEnv()
        scen = _scenario()
        env.reset(seed=42)
        for _ in range(env.state().max_rounds):
            env.step(_good_action(scen))
        final = env.state()
        assert final.verdict == "timeout"
        assert len(final.judge_notes) > 0
        assert any(
            "round limit" in reason.lower()
            for reason in final.top_failure_reasons
        )

    def test_audit_deterministic(self) -> None:
        """Same episode produces identical audit output."""

        def run_episode():
            env = ReplicaLabEnv()
            env.reset(seed=42)
            env.step(_good_action(_scenario()))
            return env.step(_accept_action())

        first = run_episode()
        second = run_episode()
        assert first.info.judge_notes == second.info.judge_notes
        assert first.info.verdict == second.info.verdict
        assert first.info.top_failure_reasons == second.info.top_failure_reasons

    def test_state_audit_fields_empty_before_terminal(self) -> None:
        """EpisodeState audit fields are empty before a terminal step."""
        env = ReplicaLabEnv()
        env.reset(seed=42)
        env.step(_good_action(_scenario()))
        mid = env.state()
        assert mid.judge_notes == ""
        assert mid.verdict == ""
        assert mid.top_failure_reasons == []
| # --------------------------------------------------------------------------- | |
| # ENV 10 — deterministic replay and broader environment regression | |
| # --------------------------------------------------------------------------- | |
class TestReplayDeterminism:
    """ENV 10: same seed + same actions => same trajectory and final state.

    Fix: the test methods declared ``template``/``difficulty`` parameters but
    carried no ``pytest.mark.parametrize`` decorators, so pytest collection
    would fail with "fixture 'template' not found" (unless same-named fixtures
    exist in a conftest — TODO confirm). Parametrization is now explicit over
    the same template/difficulty sets exercised elsewhere in this module.
    """

    TEMPLATES = ("math_reasoning", "ml_benchmark", "finance_trading")
    DIFFICULTIES = ("easy", "medium", "hard")

    @pytest.mark.parametrize("difficulty", DIFFICULTIES)
    @pytest.mark.parametrize("template", TEMPLATES)
    def test_same_seed_same_initial_observation(
        self,
        template: str,
        difficulty: str,
    ) -> None:
        """Two envs reset with the same seed produce identical observations."""
        env1 = ReplicaLabEnv()
        env2 = ReplicaLabEnv()
        obs1 = env1.reset(seed=17, scenario=template, difficulty=difficulty)
        obs2 = env2.reset(seed=17, scenario=template, difficulty=difficulty)
        assert obs1.model_dump() == obs2.model_dump()

    @pytest.mark.parametrize("difficulty", DIFFICULTIES)
    @pytest.mark.parametrize("template", TEMPLATES)
    def test_same_seed_same_action_sequence_same_trajectory(
        self,
        template: str,
        difficulty: str,
    ) -> None:
        """Replaying the same action sequence yields an identical trajectory."""

        def build_actions(scenario):
            return [_good_action(scenario), _accept_action()]

        first = _run_seeded_sequence(
            seed=23,
            template=template,
            difficulty=difficulty,
            action_builder=build_actions,
        )
        second = _run_seeded_sequence(
            seed=23,
            template=template,
            difficulty=difficulty,
            action_builder=build_actions,
        )
        assert first == second

    @pytest.mark.parametrize("template", TEMPLATES)
    def test_timeout_replay_is_deterministic(self, template: str) -> None:
        """A timed-out episode replays identically, step for step."""

        def build_actions(scenario):
            return [
                _good_action(scenario)
                for _ in range(scenario.scientist_observation.max_rounds)
            ]

        first = _run_seeded_sequence(
            seed=31,
            template=template,
            difficulty="medium",
            action_builder=build_actions,
        )
        second = _run_seeded_sequence(
            seed=31,
            template=template,
            difficulty="medium",
            action_builder=build_actions,
        )
        _obs1, steps1, state1 = first
        _obs2, steps2, state2 = second
        assert steps1 == steps2
        assert state1 == state2
        assert steps1[-1]["done"] is True
        assert steps1[-1]["info"]["verdict"] == "timeout"

    def test_invalid_action_replay_is_deterministic(self) -> None:
        """Invalid actions error identically on replay and never advance."""

        def build_actions(_scenario):
            return [_bad_duration_action(), _bad_duration_action()]

        first = _run_seeded_sequence(
            seed=41,
            template="math_reasoning",
            difficulty="easy",
            action_builder=build_actions,
        )
        second = _run_seeded_sequence(
            seed=41,
            template="math_reasoning",
            difficulty="easy",
            action_builder=build_actions,
        )
        assert first == second
        _obs, steps, state = first
        assert steps[0]["info"]["error"] is not None
        assert steps[0]["done"] is False
        assert state["round_number"] == 0

    @pytest.mark.parametrize("template", TEMPLATES)
    def test_terminal_audit_payload_is_replay_stable(self, template: str) -> None:
        """Judge audit fields are identical across replays."""

        def build_actions(scenario):
            return [_good_action(scenario), _accept_action()]

        _obs1, steps1, state1 = _run_seeded_sequence(
            seed=59,
            template=template,
            difficulty="easy",
            action_builder=build_actions,
        )
        _obs2, steps2, state2 = _run_seeded_sequence(
            seed=59,
            template=template,
            difficulty="easy",
            action_builder=build_actions,
        )
        assert steps1[-1]["info"]["judge_notes"] == steps2[-1]["info"]["judge_notes"]
        assert steps1[-1]["info"]["verdict"] == steps2[-1]["info"]["verdict"]
        assert (
            steps1[-1]["info"]["top_failure_reasons"]
            == steps2[-1]["info"]["top_failure_reasons"]
        )
        assert state1["judge_notes"] == state2["judge_notes"]
        assert state1["verdict"] == state2["verdict"]
        assert state1["top_failure_reasons"] == state2["top_failure_reasons"]