Spaces:
Sleeping
Sleeping
# Strict mode: stop on the first failing command, treat unset variables as
# errors, and make a pipeline fail when any of its stages fails.
set -o errexit -o nounset -o pipefail

# Run the whole test suite.
echo "[validate] Running pytest"
python -m pytest -q

# Run the grader tests again on their own so they are reported under a
# dedicated banner.
echo "[validate] Running grader determinism/bounds checks"
python -m pytest -q tests/test_graders.py
echo "[validate] Verifying openenv.yaml parses"
# Parse the manifest and confirm every required top-level key is present.
# The heredoc delimiter is quoted so the shell passes the Python text verbatim.
python - <<'PY'
import yaml

REQUIRED_KEYS = [
    "name",
    "version",
    "description",
    "action_space",
    "observation_space",
    "reward_description",
]

with open("openenv.yaml", "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

# safe_load yields None for an empty document and may yield a list or a scalar.
# Reject anything that is not a mapping up front: a membership test on None
# raises TypeError, and on a string it would silently do substring matching.
if not isinstance(data, dict):
    raise SystemExit(
        f"openenv.yaml must contain a top-level mapping, got {type(data).__name__}"
    )

missing = [k for k in REQUIRED_KEYS if k not in data]
if missing:
    raise SystemExit(f"openenv.yaml missing required keys: {missing}")

print("openenv.yaml OK")
PY
echo "[validate] Verifying API endpoints and reset/step/state behavior"
# Exercise the HTTP surface in-process: root, /reset, /step, then /state.
python - <<'PY'
from fastapi.testclient import TestClient
from server.app import app


def require_ok(label, response):
    """Abort the check unless *response* has status 200; return it otherwise."""
    if response.status_code != 200:
        raise SystemExit(f"{label} failed with status {response.status_code}")
    return response


client = TestClient(app)

require_ok("GET /", client.get("/"))

# /reset must hand back a session id that the later calls are keyed on.
reset_body = require_ok(
    "POST /reset",
    client.post("/reset", json={"task_id": "task_easy_1"}),
).json()
session_id = reset_body.get("session_id")
if not session_id:
    raise SystemExit("/reset response missing session_id")

step_request = {
    "session_id": session_id,
    "action": {"action_type": "check_policy", "parameters": {}},
}
require_ok("POST /step", client.post("/step", json=step_request))

require_ok("GET /state", client.get(f"/state?session_id={session_id}"))

print("API endpoint checks OK")
PY
echo "[validate] Verifying task difficulty progression and reward ranges"
# Confirm all three difficulty tiers exist, then replay a fixed action script
# per task and check that the last reported reward lies in [0.0, 1.0].
python - <<'PY'
from env.tasks import TASKS
from env.environment import SupportTicketEnv
from env.models import Action


def act(action_type, **parameters):
    """Build an Action from its type and keyword parameters."""
    return Action(action_type=action_type, parameters=parameters)


def final_score_for(task_id, actions):
    """Replay *actions* on a fresh environment and return the last reward seen.

    The score starts at 0.0 and is overwritten by info["current_reward"]
    whenever a step reports one; the replay stops early once a step is done.
    """
    env = SupportTicketEnv(task_id=task_id)
    env.reset()
    score = 0.0
    for action in actions:
        _, _, done, info = env.step(action)
        score = info.get("current_reward", score)
        if done:
            break
    return score


# Difficulty coverage
seen_difficulties = set()
for spec in TASKS.values():
    seen_difficulties.add(spec["difficulty"])
absent = {"easy", "medium", "hard"} - seen_difficulties
if absent:
    raise SystemExit(f"Missing expected difficulties: {absent}")

# Reward range check across canonical task runs
canonical = {
    "task_easy_1": [
        act("check_policy"),
        act("issue_refund", amount="full"),
        act("close_ticket", resolution="refunded"),
    ],
    "task_medium_1": [
        act("check_policy"),
        act("reply_to_customer", message="Policy explained - no refund"),
        act("close_ticket", resolution="policy_explained"),
    ],
    "task_hard_1": [
        act("fetch_user_data", user_id="USR-C3"),
        act("reply_to_customer", message="Escalating to billing tier 2."),
        act("escalate", reason="billing_tier2"),
    ],
}

for task_id, actions in canonical.items():
    final_score = final_score_for(task_id, actions)
    # Written as a negated chained comparison so a NaN score is rejected too.
    if not (0.0 <= final_score <= 1.0):
        raise SystemExit(f"Score out of range for {task_id}: {final_score}")

print("Task checks OK")
PY
echo "[validate] Running baseline evaluation harness"
python evaluate.py

echo "[validate] Checking inference script smoke-run and timing"
# Supply defaults for anything the caller has not already exported.
export API_BASE_URL="${API_BASE_URL:-https://api.openai.com/v1}"
export MODEL_NAME="${MODEL_NAME:-gpt-4o}"
export HF_TOKEN="${HF_TOKEN:-dummy-key}"

# Where the captured inference output goes. Overridable so concurrent runs do
# not clobber each other; the default is the historical fixed path.
inference_log="${INFERENCE_LOG:-/tmp/inference_validation.log}"

# Use the shell's own `time` keyword rather than /usr/bin/time: the external
# binary is missing on many minimal images, and because the exit status is
# ignored below, its absence used to surface only as a misleading
# "Missing [START]" failure. Grouping with { } sends the timing report into
# the log together with the script's stdout and stderr.
# The exit status is deliberately ignored; only the log markers are checked
# (presumably because the placeholder token cannot complete real API calls).
{ time -p python inference.py; } > "${inference_log}" 2>&1 || true

# The log must contain each lifecycle marker at least once.
for marker in START STEP END; do
  if ! grep -qF "[${marker}]" "${inference_log}"; then
    echo "Missing [${marker}] in inference output"
    exit 1
  fi
done

echo "[validate] All non-docker validation checks completed"