#!/usr/bin/env bash
#
# FRANKENSTALLM 3B — Optimization Migration v2
#
# Steps performed, in order:
#   1. Validate the modified Python, shell and YAML files.
#   2. Stop the running training job (SIGTERM, SIGKILL after 120s).
#   3. Back up and migrate the latest checkpoint (QKV fusion).
#   4. Run a 5-step validation test that resumes from that checkpoint.
#   5. Remove the test artifacts.
#   6. Relaunch training via scripts/launch_3b_pretrain.sh.
#
# Usage: <this script> [--test-only] [--skip-stop]
#   --test-only  run step 1 only, then exit without touching training
#   --skip-stop  skip step 2 (do not stop the running training job)

# Treat references to unset variables as errors.
set -u

# Every path used below is relative to the parent of this script's directory
# (presumably the project root). If we cannot get there, stop immediately:
# continuing would run cp/rm/launch commands from an arbitrary directory.
cd -- "$(dirname -- "$0")/.." || {
  printf '[ERROR] Cannot change to the parent directory of %s\n' "$0" >&2
  exit 1
}

# Run identity and the files derived from it.
RUN_NAME="korean_3b_fp8_run1"
CKPT_DIR="checkpoints/${RUN_NAME}"
PID_FILE="${CKPT_DIR}/train.pid"   # read in step 2 to find the trainer PID
LOG_FILE="${CKPT_DIR}/train.log"   # shown in the final "monitor with" hint
# Command-line switches (both default to off).
TEST_ONLY=false   # --test-only: exit right after file validation
SKIP_STOP=false   # --skip-stop: do not stop the running training job

#######################################
# Parse the script's command-line flags.
# Globals:   TEST_ONLY, SKIP_STOP (written)
# Arguments: the script's positional parameters
# Outputs:   usage message on stderr for an unrecognised argument
# Returns:   0 on success; exits with status 2 on an unrecognised argument
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --test-only) TEST_ONLY=true ;;
      --skip-stop) SKIP_STOP=true ;;
      *)
        # A misspelled flag (e.g. --testonly) must not fall through to the
        # destructive steps, so reject anything we do not recognise.
        printf '[ERROR] Unknown option: %s\n' "$arg" >&2
        printf 'Usage: %s [--test-only] [--skip-stop]\n' "${0##*/}" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Startup banner: title plus the current date/time between two rules.
banner_rule="=================================================================="
printf '%s\n' \
  "$banner_rule" \
  " FRANKENSTALLM 3B — Optimization Migration v2" \
  " $(date)" \
  "$banner_rule"
# ---------------------------------------------------------------------------
# [1/6] Validate every modified file before anything is stopped or changed.
# ---------------------------------------------------------------------------
echo ""
echo "[1/6] Validating modified files..."
ERRORS=0

#######################################
# Record one validation failure.
# Globals:   ERRORS (incremented)
# Arguments: $1 - summary printed after the failure marker
#            $2 - captured tool output (may be empty)
# Outputs:   marker line on stdout; last lines of the tool output on stderr
#######################################
validation_failed() {
  local summary=$1 detail=$2
  echo " ✗ ${summary}"
  if [[ -n "$detail" ]]; then
    # Show why the check failed instead of discarding the diagnostics.
    printf '%s\n' "$detail" | tail -n 5 | sed 's/^/     /' >&2
  fi
  ERRORS=$((ERRORS + 1))
}

# Python sources are only parsed (ast.parse), never imported or executed.
# The path is passed as an argument rather than spliced into the Python
# source, so quoting in file names cannot alter the check.
py_syntax_check='
import ast, sys
with open(sys.argv[1], "rb") as fh:
    ast.parse(fh.read(), filename=sys.argv[1])
'
for pyfile in model/attention.py train/pretrain.py data/dataset.py scripts/migrate_qkv_checkpoint.py; do
  if err=$(python3 -c "$py_syntax_check" "$pyfile" 2>&1); then
    echo " ✓ $pyfile — syntax OK"
  else
    validation_failed "$pyfile — SYNTAX ERROR" "$err"
  fi
done

# Launch script: bash -n parses the file without running it.
if err=$(bash -n scripts/launch_3b_pretrain.sh 2>&1); then
  echo " ✓ scripts/launch_3b_pretrain.sh — syntax OK"
else
  validation_failed "scripts/launch_3b_pretrain.sh — SYNTAX ERROR" "$err"
fi

# Training config: must load as YAML and carry train.batch_size == 5.
yaml_check='
import yaml
with open("configs/korean_3b_fp8.yaml") as f:
    cfg = yaml.safe_load(f)
batch_size = cfg["train"]["batch_size"]
assert batch_size == 5, f"batch_size should be 5, got {batch_size}"
'
if err=$(python3 -c "$yaml_check" 2>&1); then
  echo " ✓ configs/korean_3b_fp8.yaml — valid, batch_size=5"
else
  validation_failed "configs/korean_3b_fp8.yaml — INVALID" "$err"
fi

if (( ERRORS > 0 )); then
  echo ""
  echo "[ERROR] $ERRORS file(s) failed validation. Aborting."
  exit 1
fi
echo " All files validated successfully."

# --test-only stops here, before training is touched.
if [[ "$TEST_ONLY" == true ]]; then
  echo ""
  echo "[INFO] --test-only mode. Exiting without restart."
  exit 0
fi
# ---------------------------------------------------------------------------
# [2/6] Stop the running training job (skipped with --skip-stop).
# ---------------------------------------------------------------------------

#######################################
# Terminate the process whose PID is stored in PID_FILE.
# Sends SIGTERM, waits up to 120s for the process to exit, then falls back
# to SIGKILL. Per the step banner, SIGTERM is expected to trigger an
# emergency checkpoint in the trainer (not verifiable from this script).
# Globals:   PID_FILE (read)
# Outputs:   progress messages on stdout, warnings on stderr
#######################################
stop_training_process() {
  local pid i

  if [[ ! -f "${PID_FILE}" ]]; then
    echo " No PID file found."
    return 0
  fi

  pid=$(cat "${PID_FILE}")
  # kill treats 0 and negative numbers as process-group targets, so a
  # corrupt PID file could signal unrelated processes (or this script).
  # Accept only a plain positive integer.
  if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
    echo " [WARN] '${pid}' in ${PID_FILE} is not a valid PID; not sending any signal." >&2
    return 0
  fi

  if ! kill -0 "$pid" 2>/dev/null; then
    echo " Process $pid not running."
    return 0
  fi

  echo " Sending SIGTERM to PID $pid..."
  kill "$pid"
  echo " Waiting for graceful shutdown (up to 120s)..."
  for ((i = 1; i <= 120; i++)); do
    if ! kill -0 "$pid" 2>/dev/null; then
      echo " Process stopped after ${i}s"
      break
    fi
    sleep 1
  done

  if kill -0 "$pid" 2>/dev/null; then
    echo " [WARN] Process still running after 120s. Force killing..."
    kill -9 "$pid" 2>/dev/null || true
    sleep 2
  fi
}

#######################################
# Wait up to 30s until no process whose command line matches
# "pretrain.py.*korean_3b" remains. Warns, but does not abort, on timeout.
# Outputs:   progress messages on stdout, timeout warning on stderr
#######################################
wait_for_gpu_processes() {
  local i
  echo " Waiting for GPU processes to terminate..."
  for ((i = 1; i <= 30; i++)); do
    if ! pgrep -f "pretrain.py.*korean_3b" >/dev/null 2>&1; then
      echo " All GPU processes cleared."
      return 0
    fi
    sleep 1
  done
  # Previously this timed out silently; make the situation visible.
  echo " [WARN] pretrain.py processes still present after 30s; continuing anyway." >&2
}

if [[ "${SKIP_STOP}" != true ]]; then
  echo ""
  echo "[2/6] Stopping current training (SIGTERM → emergency checkpoint)..."
  stop_training_process
  wait_for_gpu_processes
fi
# ---------------------------------------------------------------------------
# [3/6] Back up and migrate the latest checkpoint (QKV fusion).
# ---------------------------------------------------------------------------
echo ""
echo "[3/6] Migrating latest checkpoint (QKV fusion)..."

# Pick the highest-versioned checkpoint-* entry. The shell glob replaces
# parsing `ls` output; sort -V orders checkpoint-900 before checkpoint-1000.
shopt -s nullglob
ckpt_candidates=("${CKPT_DIR}"/checkpoint-*)
shopt -u nullglob
LATEST_CKPT=""
if (( ${#ckpt_candidates[@]} > 0 )); then
  LATEST_CKPT=$(printf '%s\n' "${ckpt_candidates[@]}" | sort -V | tail -n 1)
fi
if [[ -z "$LATEST_CKPT" ]]; then
  echo " [ERROR] No checkpoint found!"
  exit 1
fi
echo " Latest checkpoint: $LATEST_CKPT"

# Keep a copy of the weights so a failed migration or validation test can be
# rolled back. Never start the migration without a good backup.
# NOTE(review): re-running this script overwrites an existing backup with
# whatever model.pt currently holds — confirm that is acceptable.
if ! cp -- "${LATEST_CKPT}/model.pt" "${LATEST_CKPT}/model.pt.backup_pre_qkv"; then
  echo " [ERROR] Could not back up ${LATEST_CKPT}/model.pt. Aborting before migration." >&2
  exit 1
fi
echo " Backup created: model.pt.backup_pre_qkv"

# Run the migration (presumably rewrites model.pt in place — the rollback
# path assumes so). On failure, restore the backup instead of carrying on
# with a possibly half-written checkpoint.
if ! python3 scripts/migrate_qkv_checkpoint.py "$LATEST_CKPT"; then
  echo " [ERROR] QKV fusion migration failed. Restoring original checkpoint..." >&2
  if cp -- "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"; then
    echo " Original checkpoint restored. Aborting." >&2
  else
    echo " [ERROR] Restore failed; original weights remain in ${LATEST_CKPT}/model.pt.backup_pre_qkv" >&2
  fi
  exit 1
fi
echo " QKV fusion migration complete."
# ---------------------------------------------------------------------------
# [4/6] Run a 5-step training test that resumes from the migrated checkpoint.
# ---------------------------------------------------------------------------
echo ""
echo "[4/6] Running 5-step validation test..."

# Scratch checkpoint directory used only by the validation test.
TEST_CKPT_DIR=/tmp/frankenstallm_test

# Only the last 10 lines of output are shown; the run is capped at 120s.
timeout 120 python3 train/pretrain.py \
  --config configs/korean_3b_fp8.yaml \
  --train_data data/3b_train.bin \
  --checkpoint_dir "$TEST_CKPT_DIR" \
  --max_steps 5 \
  --batch_size 5 \
  --resume "$LATEST_CKPT" \
  2>&1 | tail -10
# $? after a pipeline is the status of its LAST stage (tail), which would
# hide a failed or timed-out test. Read the first stage's status instead.
# This must be the very next command after the pipeline.
TEST_EXIT=${PIPESTATUS[0]}

if [[ $TEST_EXIT -eq 0 ]]; then
  echo " ✓ 5-step test passed!"
else
  echo " ✗ 5-step test FAILED (exit code $TEST_EXIT)"
  if [[ $TEST_EXIT -eq 124 ]]; then
    # GNU timeout reports 124 when it had to stop the command.
    echo " [WARN] Test did not finish within 120s (timeout)."
  fi
  echo " [WARN] Restoring original checkpoint..."
  if cp -- "${LATEST_CKPT}/model.pt.backup_pre_qkv" "${LATEST_CKPT}/model.pt"; then
    echo " Original checkpoint restored. Aborting."
  else
    echo " [ERROR] Restore failed; original weights remain in ${LATEST_CKPT}/model.pt.backup_pre_qkv" >&2
  fi
  # Do not leave test checkpoints behind on the failure path either.
  rm -rf -- "$TEST_CKPT_DIR"
  exit 1
fi

# ---------------------------------------------------------------------------
# [5/6] Remove the scratch checkpoint directory written by the test.
# ---------------------------------------------------------------------------
echo ""
echo "[5/6] Cleaning up test artifacts..."
rm -rf -- "$TEST_CKPT_DIR"
# ---------------------------------------------------------------------------
# [6/6] Relaunch training with the optimizations in place.
# ---------------------------------------------------------------------------
echo ""
echo "[6/6] Launching optimized training..."
echo ""
# Summary of what this migration changes (informational only).
cat <<'EOF'
 Changes applied:
 • QKV Projection Fusion (single GEMM)
 • NUMA CPU Affinity (cores 0-35→GPU0-3, 36-71→GPU4-7)
 • Batch size: 4 → 5
 • NCCL: NVLS,Ring algorithm, 256MB buffers
 • DDP: bucket_cap_mb 400 → 800
 • DataLoader: 4→6 workers, prefetch 3→4
 • MADV_RANDOM + WILLNEED for dataset mmap
 • numactl --interleave=all on torchrun
EOF
echo ""

# Report success only if the launcher itself succeeded; otherwise propagate
# its exit status so callers and operators see the failure.
bash scripts/launch_3b_pretrain.sh
LAUNCH_EXIT=$?
if [[ $LAUNCH_EXIT -ne 0 ]]; then
  echo "[ERROR] scripts/launch_3b_pretrain.sh failed (exit code $LAUNCH_EXIT). Training was NOT restarted." >&2
  exit "$LAUNCH_EXIT"
fi

echo ""
echo "=================================================================="
echo " Migration complete! Monitor with:"
echo " tail -f ${LOG_FILE}"
echo "=================================================================="