RedButton / tests /test_client_integration.py
Arun-Sanjay's picture
phase-4: ShutdownGymClient with EnvClient hooks + integration tests
d104b04
"""Docker-backed integration tests for ``ShutdownGymClient``.
These start a local container from the ``shutdown-gym:latest`` image,
wait for ``/health``, drive a real episode through the WebSocket
client, and tear the container down. They are gated behind the
``integration`` pytest marker β€” the default ``pytest -q`` invocation
(and the pre-commit hook) skips them via the ``addopts`` setting in
``pyproject.toml``.
Run explicitly with::
pytest tests/test_client_integration.py -v -m integration
"""
import subprocess
import time
import pytest
import requests
from shutdown_gym import ShutdownGymClient
from shutdown_gym.models import ShutdownAction
CONTAINER_NAME = "shutdown-gym-test"
IMAGE = "shutdown-gym:latest"
BASE_URL = "http://localhost:8000"
@pytest.fixture(scope="module")
def docker_server():
"""Start the container, wait for /health, yield, then tear down.
Module-scoped so the four integration tests share one container β€”
cuts the per-test cost from ~6s Γ— 4 to ~6s once.
"""
# Ensure no stale container lingers.
subprocess.run(
["docker", "rm", "-f", CONTAINER_NAME],
check=False,
capture_output=True,
)
subprocess.run(
["docker", "run", "-d", "--name", CONTAINER_NAME, "-p", "8000:8000", IMAGE],
check=True,
capture_output=True,
)
try:
# Poll /health for up to 30s.
for _ in range(30):
try:
r = requests.get(f"{BASE_URL}/health", timeout=1)
if r.status_code == 200:
break
except requests.exceptions.RequestException:
pass
time.sleep(1)
else:
logs = subprocess.run(
["docker", "logs", CONTAINER_NAME],
check=False,
capture_output=True,
text=True,
)
pytest.fail(
"Server did not become healthy within 30s. "
f"Container logs:\n{logs.stdout}\n{logs.stderr}"
)
yield
finally:
subprocess.run(
["docker", "rm", "-f", CONTAINER_NAME],
check=False,
capture_output=True,
)
@pytest.mark.integration
def test_health_endpoint_returns_200(docker_server):
r = requests.get(f"{BASE_URL}/health")
assert r.status_code == 200
@pytest.mark.integration
def test_full_episode_tier2_via_client(docker_server):
"""Drive a real Tier-2 episode end-to-end via the sync client.
Verifies reset β†’ step β†’ observation propagation: the initial
observation has the full tool surface, the timer is at its initial
delay, and a few benign read_file actions advance turn_count
without ending the episode.
"""
with ShutdownGymClient(base_url=BASE_URL).sync() as env:
result = env.reset(tier=2, seed=42)
assert result.observation is not None
assert result.done is False
assert result.observation.steps_until_shutdown > 0
assert "read_file" in result.observation.available_tools
for _ in range(3):
if result.done:
break
action = ShutdownAction(
tool_name="read_file",
arguments={"path": "/sandbox/problems.json"},
)
result = env.step(action)
assert result.observation is not None
assert result.observation.turn_count >= 1
@pytest.mark.integration
def test_state_endpoint_returns_parsed_shutdown_state(docker_server):
"""``env.state()`` round-trips through ``_parse_state`` and yields
a ShutdownState with a populated ``original_script_hash`` (a
64-char hex string from SHA-256).
"""
with ShutdownGymClient(base_url=BASE_URL).sync() as env:
env.reset(tier=2, seed=42)
state = env.state()
assert state.tier == 2
assert len(state.original_script_hash) == 64
assert all(c in "0123456789abcdef" for c in state.original_script_hash)
@pytest.mark.integration
def test_package_is_pip_wheel_buildable(docker_server, tmp_path):
"""Sanity that ``pip wheel .`` produces a wheel without
setuptools-scm or other build tooling. PROJECT.md Β§32.5 NOT-list
requires the package to be ``pip install``-able from the HF Space
Git URL β€” this guards the build path.
(We don't install the wheel into a fresh venv β€” too slow for an
integration test. The wheel produces successfully is the contract.)
"""
# Default build isolation: pip provisions setuptools (declared in
# ``[build-system] requires``) in a temp env. ``--no-deps`` skips
# installing the runtime deps (openenv-core et al.) since we only
# care about the build path here.
result = subprocess.run(
[
"python",
"-m",
"pip",
"wheel",
".",
"-w",
str(tmp_path),
"--no-deps",
],
capture_output=True,
text=True,
)
assert result.returncode == 0, f"wheel build failed:\n{result.stderr}"
# The project name is "redbutton" (per pyproject [project] name);
# setuptools normalises to the same casing in the wheel filename.
wheels = list(tmp_path.glob("redbutton-*.whl"))
assert len(wheels) == 1, f"expected exactly one wheel, found {wheels}"