Spaces:
Sleeping
Sleeping
| """Docker-backed integration tests for ``ShutdownGymClient``. | |
| These start a local container from the ``shutdown-gym:latest`` image, | |
| wait for ``/health``, drive a real episode through the WebSocket | |
| client, and tear the container down. They are gated behind the | |
| ``integration`` pytest marker β the default ``pytest -q`` invocation | |
| (and the pre-commit hook) skips them via the ``addopts`` setting in | |
| ``pyproject.toml``. | |
| Run explicitly with:: | |
| pytest tests/test_client_integration.py -v -m integration | |
| """ | |
| import subprocess | |
| import time | |
| import pytest | |
| import requests | |
| from shutdown_gym import ShutdownGymClient | |
| from shutdown_gym.models import ShutdownAction | |
| CONTAINER_NAME = "shutdown-gym-test" | |
| IMAGE = "shutdown-gym:latest" | |
| BASE_URL = "http://localhost:8000" | |
| def docker_server(): | |
| """Start the container, wait for /health, yield, then tear down. | |
| Module-scoped so the four integration tests share one container β | |
| cuts the per-test cost from ~6s Γ 4 to ~6s once. | |
| """ | |
| # Ensure no stale container lingers. | |
| subprocess.run( | |
| ["docker", "rm", "-f", CONTAINER_NAME], | |
| check=False, | |
| capture_output=True, | |
| ) | |
| subprocess.run( | |
| ["docker", "run", "-d", "--name", CONTAINER_NAME, "-p", "8000:8000", IMAGE], | |
| check=True, | |
| capture_output=True, | |
| ) | |
| try: | |
| # Poll /health for up to 30s. | |
| for _ in range(30): | |
| try: | |
| r = requests.get(f"{BASE_URL}/health", timeout=1) | |
| if r.status_code == 200: | |
| break | |
| except requests.exceptions.RequestException: | |
| pass | |
| time.sleep(1) | |
| else: | |
| logs = subprocess.run( | |
| ["docker", "logs", CONTAINER_NAME], | |
| check=False, | |
| capture_output=True, | |
| text=True, | |
| ) | |
| pytest.fail( | |
| "Server did not become healthy within 30s. " | |
| f"Container logs:\n{logs.stdout}\n{logs.stderr}" | |
| ) | |
| yield | |
| finally: | |
| subprocess.run( | |
| ["docker", "rm", "-f", CONTAINER_NAME], | |
| check=False, | |
| capture_output=True, | |
| ) | |
| def test_health_endpoint_returns_200(docker_server): | |
| r = requests.get(f"{BASE_URL}/health") | |
| assert r.status_code == 200 | |
| def test_full_episode_tier2_via_client(docker_server): | |
| """Drive a real Tier-2 episode end-to-end via the sync client. | |
| Verifies reset β step β observation propagation: the initial | |
| observation has the full tool surface, the timer is at its initial | |
| delay, and a few benign read_file actions advance turn_count | |
| without ending the episode. | |
| """ | |
| with ShutdownGymClient(base_url=BASE_URL).sync() as env: | |
| result = env.reset(tier=2, seed=42) | |
| assert result.observation is not None | |
| assert result.done is False | |
| assert result.observation.steps_until_shutdown > 0 | |
| assert "read_file" in result.observation.available_tools | |
| for _ in range(3): | |
| if result.done: | |
| break | |
| action = ShutdownAction( | |
| tool_name="read_file", | |
| arguments={"path": "/sandbox/problems.json"}, | |
| ) | |
| result = env.step(action) | |
| assert result.observation is not None | |
| assert result.observation.turn_count >= 1 | |
| def test_state_endpoint_returns_parsed_shutdown_state(docker_server): | |
| """``env.state()`` round-trips through ``_parse_state`` and yields | |
| a ShutdownState with a populated ``original_script_hash`` (a | |
| 64-char hex string from SHA-256). | |
| """ | |
| with ShutdownGymClient(base_url=BASE_URL).sync() as env: | |
| env.reset(tier=2, seed=42) | |
| state = env.state() | |
| assert state.tier == 2 | |
| assert len(state.original_script_hash) == 64 | |
| assert all(c in "0123456789abcdef" for c in state.original_script_hash) | |
| def test_package_is_pip_wheel_buildable(docker_server, tmp_path): | |
| """Sanity that ``pip wheel .`` produces a wheel without | |
| setuptools-scm or other build tooling. PROJECT.md Β§32.5 NOT-list | |
| requires the package to be ``pip install``-able from the HF Space | |
| Git URL β this guards the build path. | |
| (We don't install the wheel into a fresh venv β too slow for an | |
| integration test. The wheel produces successfully is the contract.) | |
| """ | |
| # Default build isolation: pip provisions setuptools (declared in | |
| # ``[build-system] requires``) in a temp env. ``--no-deps`` skips | |
| # installing the runtime deps (openenv-core et al.) since we only | |
| # care about the build path here. | |
| result = subprocess.run( | |
| [ | |
| "python", | |
| "-m", | |
| "pip", | |
| "wheel", | |
| ".", | |
| "-w", | |
| str(tmp_path), | |
| "--no-deps", | |
| ], | |
| capture_output=True, | |
| text=True, | |
| ) | |
| assert result.returncode == 0, f"wheel build failed:\n{result.stderr}" | |
| # The project name is "redbutton" (per pyproject [project] name); | |
| # setuptools normalises to the same casing in the wheel filename. | |
| wheels = list(tmp_path.glob("redbutton-*.whl")) | |
| assert len(wheels) == 1, f"expected exactly one wheel, found {wheels}" | |