# tests/test_public_benchmark_package_summary.py
import pytest

from eval.public_benchmark_package import build_public_eval_protocol, build_target_training_spec
from eval.run_public_benchmark_package import summarize_public_benchmark_package
| def _successes(num_success: int, total: int = 100) -> list[int]: | |
| return [1] * num_success + [0] * (total - num_success) | |
def _target_record(track_id: str, adapter_mode: str, seed: int, num_success: int, *, intervention: float, non_base: float) -> dict:
    """Build a complete target-track payload record for summarization tests.

    Includes the eval protocol, fixed step/disturbance proxies, and a
    training spec derived from the adapter mode.
    """
    outcomes = _successes(num_success)
    episode_count = len(outcomes)
    # NOTE(review): adapter_noop records reuse the adapter_active_ft training
    # spec — presumably so the fairness check compares matched training
    # configurations; confirm against summarize_public_benchmark_package.
    variant = "adapter_active_ft" if adapter_mode == "adapter_noop" else adapter_mode
    return {
        "track_id": track_id,
        "adapter_mode": adapter_mode,
        "successes": outcomes,
        "success_rate": sum(outcomes) / episode_count,
        "episodes": episode_count,
        "seed": seed,
        "eval_protocol": build_public_eval_protocol(
            track_id=track_id, eval_mode=adapter_mode, seed=seed, episodes=episode_count
        ),
        "intervention_rate": intervention,
        "non_base_selection_rate": non_base,
        "steps_to_first_reveal_or_access": 8.0,
        "steps_to_retrieve": 22.0,
        "disturbance_proxy": 0.3,
        "train_spec": build_target_training_spec(track_id=track_id, model_variant=variant, seed=seed),
    }
def _anchor_record(adapter_mode: str, seed: int, num_success: int) -> dict:
    """Build a minimal anchor-track payload record (no train spec or proxies)."""
    outcomes = _successes(num_success)
    episode_count = len(outcomes)
    record = {
        "track_id": "anchor_track",
        "adapter_mode": adapter_mode,
        "successes": outcomes,
        "success_rate": sum(outcomes) / episode_count,
        "episodes": episode_count,
        "seed": seed,
    }
    record["eval_protocol"] = build_public_eval_protocol(
        track_id="anchor_track", eval_mode=adapter_mode, seed=seed, episodes=episode_count
    )
    return record
def test_public_benchmark_package_summary_passes_with_clear_gain():
    """All gates (headline, sign-of-life, anchor) pass when every target
    track shows a clear active-adapter gain over the trunk-only baseline."""
    # (track, baseline successes, active successes, intervention, non-base rate)
    track_specs = [
        ("bag_track", 35, 75, 0.30, 0.40),
        ("occlusion_track", 30, 68, 0.24, 0.22),
        ("cloth_track", 28, 60, 0.18, 0.20),
    ]
    payloads = []
    for track, baseline, active, interv, non_base in track_specs:
        payloads.append(_target_record(track, "trunk_only_ft", 17, baseline, intervention=0.0, non_base=0.0))
        payloads.append(_target_record(track, "adapter_noop", 17, baseline, intervention=0.0, non_base=0.0))
        payloads.append(_target_record(track, "adapter_active_ft", 17, active, intervention=interv, non_base=non_base))
    payloads.append(_anchor_record("trunk_only", 17, 96))
    payloads.append(_anchor_record("adapter_noop", 17, 96))
    payloads.append(_anchor_record("adapter_active", 17, 95))

    summary = summarize_public_benchmark_package(payloads, bootstrap_samples=200, bootstrap_seed=0)

    assert summary["headline_pass"]
    assert summary["sign_of_life_pass"]
    assert summary["anchor_pass"]
    assert summary["sign_of_life_track_count"] == 3
    assert summary["tracks"]["bag_track"]["delta_active_vs_trunk"] > 0.0
    assert summary["tracks"]["anchor_track"]["anchor_within_tolerance"]
def test_public_benchmark_package_detects_training_mismatch():
    """Tampering with one record's training batch size must raise a
    'Training fairness mismatch' ValueError from the summarizer.

    Fix: the original indexed ``payloads[8]`` to pick the record to tamper
    with — a magic index silently coupled to list ordering. The record is
    now located explicitly by track and adapter mode, so reordering the
    payload list cannot make the test corrupt the wrong record.
    """
    payloads = [
        _target_record("bag_track", "trunk_only_ft", 17, 35, intervention=0.0, non_base=0.0),
        _target_record("bag_track", "adapter_noop", 17, 35, intervention=0.0, non_base=0.0),
        _target_record("bag_track", "adapter_active_ft", 17, 75, intervention=0.30, non_base=0.40),
        _target_record("occlusion_track", "trunk_only_ft", 17, 30, intervention=0.0, non_base=0.0),
        _target_record("occlusion_track", "adapter_noop", 17, 30, intervention=0.0, non_base=0.0),
        _target_record("occlusion_track", "adapter_active_ft", 17, 68, intervention=0.24, non_base=0.22),
        _target_record("cloth_track", "trunk_only_ft", 17, 28, intervention=0.0, non_base=0.0),
        _target_record("cloth_track", "adapter_noop", 17, 28, intervention=0.0, non_base=0.0),
        _target_record("cloth_track", "adapter_active_ft", 17, 60, intervention=0.18, non_base=0.20),
        _anchor_record("trunk_only", 17, 96),
        _anchor_record("adapter_noop", 17, 96),
        _anchor_record("adapter_active", 17, 95),
    ]
    # Locate the record to corrupt by identity, not by position.
    tampered = next(
        record
        for record in payloads
        if record["track_id"] == "cloth_track" and record["adapter_mode"] == "adapter_active_ft"
    )
    tampered["train_spec"]["batch_size"] = 64
    with pytest.raises(ValueError, match="Training fairness mismatch"):
        summarize_public_benchmark_package(payloads, bootstrap_samples=50, bootstrap_seed=0)