shinka-backup / scripts /dev /smoke_test_frontier_cs.sh
JustinTX's picture
Add files using upload-large-folder tool
3f6526a verified
#!/bin/bash
# Smoke test for the Frontier-CS pipeline: runs a few problems end to end.
#
# The eval service does not need to be started beforehand: this script probes
# for it and launches it in the background when it is not reachable. To run
# the service by hand instead, use either of:
#   bash scripts/dev/start_eval_server.sh
#   OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 uv run eval_agent/ev2_service_standalone.py --host "0.0.0.0" --port 8755

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from two directories above this script's location so the relative
# paths used below (.venv/, eval_agent/, tasks/) resolve.
script_dir="$(dirname "$0")"
cd "${script_dir}/../.."

# Interpreter used to launch both the eval service and the experiments.
PYTHON=".venv/bin/python"
# Forwarded to run_experiment.py as --num-generations.
GENS=10
# Forwarded to run_experiment.py as --max-parallel-jobs.
PARALLEL=2
# ============================================================================
# Start eval service in the background (if not already running)
# ============================================================================
EVAL_PORT=8755
EVAL_URL="http://localhost:${EVAL_PORT}"
# Number of one-second polling rounds to wait for a freshly started service.
EVAL_STARTUP_ATTEMPTS=30

# Returns 0 when the eval service's status endpoint answers, non-zero
# otherwise. --max-time bounds each probe so a stalled connection cannot
# block the script indefinitely.
eval_service_ready() {
  curl -s --max-time 5 "${EVAL_URL}/api/v1/status" > /dev/null 2>&1
}

# EXIT handler: stops the eval service that this script launched.
# kill errors are ignored on purpose: the process may already be gone.
stop_eval_service() {
  echo 'Stopping eval service...'
  kill "${EVAL_PID}" 2>/dev/null || true
}

if eval_service_ready; then
  # A service we did not start is left running when the script exits.
  echo "Eval service already running at ${EVAL_URL}"
else
  echo "Starting eval service on port ${EVAL_PORT}..."
  OPENHANDS_LOG_COMPLETIONS=1 ENABLE_FULL_TRAJECTORY_LOG=1 \
    "${PYTHON}" eval_agent/ev2_service_standalone.py \
    --host "0.0.0.0" --port "${EVAL_PORT}" &
  EVAL_PID=$!

  # Register cleanup right away, so an interrupt or error during the
  # readiness wait below does not leave the background service running.
  trap stop_eval_service EXIT

  # Poll once per second until the service answers, the process dies, or the
  # attempt budget is exhausted.
  eval_ready=0
  for (( attempt = 1; attempt <= EVAL_STARTUP_ATTEMPTS; attempt++ )); do
    if eval_service_ready; then
      eval_ready=1
      break
    fi
    # Fail fast: no point polling a process that has already exited.
    if ! kill -0 "${EVAL_PID}" 2>/dev/null; then
      break
    fi
    sleep 1
  done

  if (( eval_ready )); then
    echo "Eval service ready (pid=${EVAL_PID})"
  else
    echo "ERROR: Eval service failed to start" >&2
    # The EXIT trap takes care of killing the process if it is still alive.
    exit 1
  fi
fi
# ============================================================================
# Run the experiment for each smoke-test problem
# ============================================================================
# Problem ids handed to run_experiment.py, one experiment per id.
PROBLEM_IDS=(0 1)

echo "========================================"
echo "Frontier-CS Smoke Test"
echo "========================================"
echo ""

# The loop variable is named problem_id (not PID) so it cannot be mistaken
# for a process id. The experiments run sequentially; because the script sets
# errexit at the top, the first failing experiment aborts the run.
for problem_id in "${PROBLEM_IDS[@]}"; do
  echo "----------------------------------------"
  echo "Problem ${problem_id} (${GENS} generations)"
  echo "----------------------------------------"
  "${PYTHON}" tasks/frontier_cs_entry/run_experiment.py \
    --experiment-name "smoke_p${problem_id}" \
    --problem-id "${problem_id}" \
    --seed-model gemini3pro \
    --num-generations "${GENS}" \
    --max-parallel-jobs "${PARALLEL}" \
    --use-eval-service \
    --eval-service-url "${EVAL_URL}" \
    --eval-trigger-mode periodic \
    --eval-trigger-interval 5
  echo ""
done

echo "========================================"
echo "Smoke test complete"
echo "========================================"