#!/bin/bash
# Source: Hanrui/sglang, scripts/ci/amd/amd_ci_exec.sh
# (added with the upload-large-folder tool; revision 61ba51e, verified)
set -euo pipefail
# Derive the GPU family (e.g. "mi35x") from this runner's hostname.
# Runner hostnames are expected to look like:
#   linux-mi35x-gpu-1-xxxxx-runner-zzzzz
# GPU_FAMILY stays empty when the hostname does not follow that scheme.
HOSTNAME_VALUE="$(hostname)"
gpu_host_regex='^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+'
if ! [[ "${HOSTNAME_VALUE}" =~ ${gpu_host_regex} ]]; then
  GPU_FAMILY=""
  echo "Warning: could not parse GPU family from '${HOSTNAME_VALUE}'"
else
  # First capture group is the "mi<digits><letters>" family token.
  GPU_FAMILY="${BASH_REMATCH[1]}"
  echo "Detected GPU family from hostname: ${GPU_FAMILY}"
fi
# Default working directory used for `docker exec -w`; may be overridden
# later by the -w/--workdir command-line option.
WORKDIR="/sglang-checkout/test/srt"

# Environment variables forwarded into the container, keyed by name.
# Entries given with -e on the command line are added to (or replace) these.
declare -A ENV_MAP=()
ENV_MAP[SGLANG_IS_IN_CI_AMD]=1
ENV_MAP[SGLANG_IS_IN_CI]=1
ENV_MAP[SGLANG_USE_AITER]=1

# GPU_ARCHS is only set for the mi35x family; other families leave it unset.
case "${GPU_FAMILY}" in
  mi35x)
    ENV_MAP[GPU_ARCHS]="gfx950"
    ;;
esac
#######################################
# Store one "KEY=VALUE" spec (the operand of -e) in ENV_MAP.
# The spec is split at the FIRST '=' using parameter expansion, so the
# value is kept verbatim: embedded '=', a trailing '=' (e.g. base64
# padding) and newlines all survive. (`IFS== read` would drop a single
# trailing '=' and everything after the first newline.)
# A spec without any '=' is stored with an empty value.
# Globals:   ENV_MAP (written; must already be an associative array)
# Arguments: $1 - the KEY=VALUE spec
# Returns:   0 on success, 2 if the key part is empty
#######################################
set_env_from_spec() {
  local spec="$1"
  local name="${spec%%=*}"
  local value=""
  if [[ "${spec}" == *=* ]]; then
    value="${spec#*=}"
  fi
  if [[ -z "${name}" ]]; then
    echo "Error: -e expects KEY=VALUE, got '${spec}'" >&2
    return 2
  fi
  ENV_MAP["${name}"]="${value}"
}

# Parse leading options: -w/--workdir DIR and -e ENV=VAL (repeatable).
# Parsing stops at "--" (which is consumed) or at the first argument that
# is not one of these options; whatever remains in "$@" is the command to
# run inside the container.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -w|--workdir)
      # Fail with a clear message instead of an "unbound variable" error.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a directory argument" >&2
        exit 2
      fi
      WORKDIR="$2"
      shift 2
      ;;
    -e)
      if [[ $# -lt 2 ]]; then
        echo "Error: -e requires a KEY=VALUE argument" >&2
        exit 2
      fi
      set_env_from_spec "$2" || exit 2
      shift 2
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done
# Flatten ENV_MAP into the argument list handed to `docker exec`:
# each entry becomes the pair  -e NAME=VALUE.
ENV_ARGS=()
for env_name in "${!ENV_MAP[@]}"; do
  ENV_ARGS+=(-e "${env_name}=${ENV_MAP[${env_name}]}")
done
# Run the requested command in the ci_sglang container, retrying once in
# HuggingFace offline mode.
# When HF model downloads fail due to network timeouts or rate limits,
# a second run with HF_HUB_OFFLINE=1 uses cached models from previous
# downloads instead of hitting the network again.

# docker exec with the shared working directory and environment.
# Arguments: any extra docker options, then the container name and command.
exec_in_ci() {
  docker exec \
    -w "$WORKDIR" \
    "${ENV_ARGS[@]}" \
    "$@"
}

# First attempt: normal mode (HF downloads allowed).
# `|| FIRST_EXIT_CODE=$?` captures the status without tripping `set -e`.
FIRST_EXIT_CODE=0
exec_in_ci ci_sglang "$@" || FIRST_EXIT_CODE=$?
if (( FIRST_EXIT_CODE == 0 )); then
  exit 0
fi
echo "First attempt failed with exit code $FIRST_EXIT_CODE"

# Some exit codes are test failures that offline mode cannot fix, so they
# are returned immediately without a retry:
#   1   - test assertion failures (accuracy below threshold)
#   137 - (128+9) process killed by OOM
#   255 - test suite completed with test errors
# Every other code (e.g. network timeouts, HF download failures) is retried.
case "$FIRST_EXIT_CODE" in
  1|137|255)
    echo "Exit code $FIRST_EXIT_CODE indicates test failure (not network issue), not retrying"
    exit "$FIRST_EXIT_CODE"
    ;;
esac

echo "Retrying with HF_HUB_OFFLINE=1 (offline mode to use cached models)..."
# Second attempt: force HF offline mode to avoid network timeouts.
# This is the last command, so its status becomes the script's exit status.
exec_in_ci -e HF_HUB_OFFLINE=1 ci_sglang "$@"