Spaces:

Arun-Sanjay
/

RedButton

Sleeping

App Files Files Community

RedButton / tests /test_client_integration.py

Arun-Sanjay

phase-4: ShutdownGymClient with EnvClient hooks + integration tests

d104b04 19 days ago

raw

history blame contribute delete

5.37 kB

	"""Docker-backed integration tests for ``ShutdownGymClient``.

	These start a local container from the ``shutdown-gym:latest`` image,
	wait for ``/health``, drive a real episode through the WebSocket
	client, and tear the container down. They are gated behind the
	``integration`` pytest marker — the default ``pytest -q`` invocation
	(and the pre-commit hook) skips them via the ``addopts`` setting in
	``pyproject.toml``.

	Run explicitly with::

	pytest tests/test_client_integration.py -v -m integration
	"""

	import subprocess
	import time

	import pytest
	import requests

	from shutdown_gym import ShutdownGymClient
	from shutdown_gym.models import ShutdownAction

	CONTAINER_NAME = "shutdown-gym-test"
	IMAGE = "shutdown-gym:latest"
	BASE_URL = "http://localhost:8000"


	@pytest.fixture(scope="module")
	def docker_server():
	"""Start the container, wait for /health, yield, then tear down.

	Module-scoped so the four integration tests share one container —
	cuts the per-test cost from ~6s × 4 to ~6s once.
	"""
	# Ensure no stale container lingers.
	subprocess.run(
	["docker", "rm", "-f", CONTAINER_NAME],
	check=False,
	capture_output=True,
	)
	subprocess.run(
	["docker", "run", "-d", "--name", CONTAINER_NAME, "-p", "8000:8000", IMAGE],
	check=True,
	capture_output=True,
	)

	try:
	# Poll /health for up to 30s.
	for _ in range(30):
	try:
	r = requests.get(f"{BASE_URL}/health", timeout=1)
	if r.status_code == 200:
	break
	except requests.exceptions.RequestException:
	pass
	time.sleep(1)
	else:
	logs = subprocess.run(
	["docker", "logs", CONTAINER_NAME],
	check=False,
	capture_output=True,
	text=True,
	)
	pytest.fail(
	"Server did not become healthy within 30s. "
	f"Container logs:\n{logs.stdout}\n{logs.stderr}"
	)

	yield
	finally:
	subprocess.run(
	["docker", "rm", "-f", CONTAINER_NAME],
	check=False,
	capture_output=True,
	)


	@pytest.mark.integration
	def test_health_endpoint_returns_200(docker_server):
	r = requests.get(f"{BASE_URL}/health")
	assert r.status_code == 200


	@pytest.mark.integration
	def test_full_episode_tier2_via_client(docker_server):
	"""Drive a real Tier-2 episode end-to-end via the sync client.

	Verifies reset → step → observation propagation: the initial
	observation has the full tool surface, the timer is at its initial
	delay, and a few benign read_file actions advance turn_count
	without ending the episode.
	"""
	with ShutdownGymClient(base_url=BASE_URL).sync() as env:
	result = env.reset(tier=2, seed=42)
	assert result.observation is not None
	assert result.done is False
	assert result.observation.steps_until_shutdown > 0
	assert "read_file" in result.observation.available_tools

	for _ in range(3):
	if result.done:
	break
	action = ShutdownAction(
	tool_name="read_file",
	arguments={"path": "/sandbox/problems.json"},
	)
	result = env.step(action)
	assert result.observation is not None

	assert result.observation.turn_count >= 1


	@pytest.mark.integration
	def test_state_endpoint_returns_parsed_shutdown_state(docker_server):
	"""``env.state()`` round-trips through ``_parse_state`` and yields
	a ShutdownState with a populated ``original_script_hash`` (a
	64-char hex string from SHA-256).
	"""
	with ShutdownGymClient(base_url=BASE_URL).sync() as env:
	env.reset(tier=2, seed=42)
	state = env.state()
	assert state.tier == 2
	assert len(state.original_script_hash) == 64
	assert all(c in "0123456789abcdef" for c in state.original_script_hash)


	@pytest.mark.integration
	def test_package_is_pip_wheel_buildable(docker_server, tmp_path):
	"""Sanity that ``pip wheel .`` produces a wheel without
	setuptools-scm or other build tooling. PROJECT.md §32.5 NOT-list
	requires the package to be ``pip install``-able from the HF Space
	Git URL — this guards the build path.

	(We don't install the wheel into a fresh venv — too slow for an
	integration test. The wheel produces successfully is the contract.)
	"""
	# Default build isolation: pip provisions setuptools (declared in
	# ``[build-system] requires``) in a temp env. ``--no-deps`` skips
	# installing the runtime deps (openenv-core et al.) since we only
	# care about the build path here.
	result = subprocess.run(
	[
	"python",
	"-m",
	"pip",
	"wheel",
	".",
	"-w",
	str(tmp_path),
	"--no-deps",
	],
	capture_output=True,
	text=True,
	)
	assert result.returncode == 0, f"wheel build failed:\n{result.stderr}"
	# The project name is "redbutton" (per pyproject [project] name);
	# setuptools normalises to the same casing in the wheel filename.
	wheels = list(tmp_path.glob("redbutton-*.whl"))
	assert len(wheels) == 1, f"expected exactly one wheel, found {wheels}"