| #!/usr/bin/env bash |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| set -euo pipefail |
|
|
| |
| LOG_FILE="${1:-checkpoints/korean_1b_sft/train.log}" |
| CHECK_INTERVAL=30 |
| ZERO_LOSS_THRESHOLD=3 |
| GNORM_WARN=10.0 |
| GNORM_CRITICAL=50.0 |
| LOSS_SPIKE_FACTOR=3.0 |
| STALL_TIMEOUT=300 |
| DISK_WARN_PCT=80 |
| GPU_UTIL_WARN=50 |
| CHECK_ONCE=false |
|
|
| if [[ "${1:-}" == "--check-once" ]]; then |
| CHECK_ONCE=true |
| LOG_FILE="${2:-checkpoints/korean_1b_sft/train.log}" |
| fi |
|
|
| |
| RED='\033[0;31m' |
| YELLOW='\033[1;33m' |
| GREEN='\033[0;32m' |
| CYAN='\033[0;36m' |
| NC='\033[0m' |
|
|
| |
| timestamp() { date '+%Y-%m-%d %H:%M:%S'; } |
|
|
| alert() { |
| local level="$1" msg="$2" |
| case "$level" in |
| CRITICAL) echo -e "${RED}๐ด [$(timestamp)] [CRITICAL] ${msg}${NC}" ;; |
| WARNING) echo -e "${YELLOW}๐ [$(timestamp)] [WARNING] ${msg}${NC}" ;; |
| INFO) echo -e "${CYAN}๐ก [$(timestamp)] [INFO] ${msg}${NC}" ;; |
| OK) echo -e "${GREEN}โ
[$(timestamp)] [OK] ${msg}${NC}" ;; |
| esac |
| } |
|
|
| |
| parse_metrics() { |
| |
| local n="${1:-20}" |
| if [[ ! -f "$LOG_FILE" ]]; then |
| echo "" |
| return |
| fi |
| tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true |
| } |
|
|
| extract_field() { |
| |
| echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1 |
| } |
|
|
| |
|
|
| check_loss_zero() { |
| local lines |
| lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD") |
| if [[ -z "$lines" ]]; then return; fi |
|
|
| local zero_count=0 |
| while IFS= read -r line; do |
| local loss |
| loss=$(extract_field "$line" "loss") |
| if [[ -n "$loss" ]]; then |
| |
| if (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then |
| ((zero_count++)) |
| fi |
| fi |
| done <<< "$lines" |
|
|
| if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then |
| alert CRITICAL "Loss๊ฐ ${zero_count}ํ ์ฐ์ ~0! Labels ๋ฒ๊ทธ ๊ฐ๋ฅ์ฑ. ์ฆ์ ํ์ต ์ค๋จ!" |
| return 1 |
| fi |
| return 0 |
| } |
|
|
| check_loss_spike() { |
| local lines |
| lines=$(parse_metrics 20) |
| if [[ -z "$lines" ]]; then return 0; fi |
|
|
| local losses=() |
| while IFS= read -r line; do |
| local loss |
| loss=$(extract_field "$line" "loss") |
| [[ -n "$loss" ]] && losses+=("$loss") |
| done <<< "$lines" |
|
|
| local count=${#losses[@]} |
| if [[ $count -lt 5 ]]; then return 0; fi |
|
|
| |
| local last_loss="${losses[$((count-1))]}" |
| local sum=0 |
| for ((i=0; i<count-1; i++)); do |
| sum=$(echo "$sum + ${losses[$i]}" | bc -l 2>/dev/null || echo "$sum") |
| done |
| local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0") |
|
|
| if [[ "$avg" != "0" ]]; then |
| local ratio=$(echo "$last_loss / $avg" | bc -l 2>/dev/null || echo "1") |
| if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then |
| alert WARNING "Loss spike ๊ฐ์ง! ํ์ฌ=${last_loss}, ํ๊ท =${avg}, ๋น์จ=${ratio}x" |
| fi |
| fi |
| return 0 |
| } |
|
|
| check_gnorm() { |
| local lines |
| lines=$(parse_metrics 5) |
| if [[ -z "$lines" ]]; then return 0; fi |
|
|
| local last_line |
| last_line=$(echo "$lines" | tail -1) |
| local gnorm |
| gnorm=$(extract_field "$last_line" "gnorm") |
|
|
| if [[ -z "$gnorm" ]]; then return 0; fi |
|
|
| if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then |
| alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! ๋ฐ์ฐ ์ง์ . ํ์ต ์ค๋จ ๊ณ ๋ ค." |
| elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then |
| alert WARNING "GNorm=${gnorm} > ${GNORM_WARN}. ๋ถ์์ ์งํ." |
| fi |
| return 0 |
| } |
|
|
| check_stall() { |
| if [[ ! -f "$LOG_FILE" ]]; then |
| alert INFO "๋ก๊ทธ ํ์ผ ์์: ${LOG_FILE}" |
| return 0 |
| fi |
|
|
| local last_modified |
| last_modified=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0) |
| local now |
| now=$(date +%s) |
| local diff=$((now - last_modified)) |
|
|
| if [[ $diff -gt $STALL_TIMEOUT ]]; then |
| alert CRITICAL "๋ก๊ทธ๊ฐ ${diff}์ด ($(( diff/60 ))๋ถ) ๋์ ์
๋ฐ์ดํธ ์์! Hang ๊ฐ๋ฅ์ฑ." |
| fi |
| return 0 |
| } |
|
|
| check_disk() { |
| local usage |
| usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%') |
| if [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then |
| alert WARNING "๋์คํฌ ์ฌ์ฉ๋ฅ ${usage}% > ${DISK_WARN_PCT}%. ์ฒดํฌํฌ์ธํธ ์ ๋ฆฌ ํ์." |
| fi |
| return 0 |
| } |
|
|
| check_gpu() { |
| if ! command -v nvidia-smi &>/dev/null; then return 0; fi |
|
|
| local low_util=0 |
| local total_gpus=0 |
| while IFS= read -r util; do |
| ((total_gpus++)) |
| if [[ "$util" -lt "$GPU_UTIL_WARN" ]]; then |
| ((low_util++)) |
| fi |
| done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null) |
|
|
| if [[ $total_gpus -gt 0 && $low_util -gt 0 ]]; then |
| alert INFO "${low_util}/${total_gpus} GPU utilization < ${GPU_UTIL_WARN}%. ๋ฐ์ดํฐ ๋ก๋ฉ ๋ณ๋ชฉ?" |
| fi |
| return 0 |
| } |
|
|
| |
| print_status() { |
| local lines |
| lines=$(parse_metrics 1) |
| if [[ -n "$lines" ]]; then |
| echo -e "${GREEN}์ต๊ทผ ๋ก๊ทธ:${NC} $lines" |
| fi |
|
|
| if command -v nvidia-smi &>/dev/null; then |
| echo -e "${CYAN}GPU ๋ฉ๋ชจ๋ฆฌ:${NC}" |
| nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu \ |
| --format=csv,noheader 2>/dev/null | head -8 |
| fi |
|
|
| local disk |
| disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print "์ฌ์ฉ: "$3"/"$2" ("$5")"}') |
| echo -e "${CYAN}๋์คํฌ:${NC} ${disk}" |
| } |
|
|
| |
| echo "==================================================================" |
| echo " SFT Training Monitor" |
| echo " Log file: ${LOG_FILE}" |
| echo " Check interval: ${CHECK_INTERVAL}s" |
| echo " Press Ctrl+C to stop" |
| echo "==================================================================" |
|
|
| run_all_checks() { |
| check_loss_zero || true |
| check_loss_spike || true |
| check_gnorm || true |
| check_stall || true |
| check_disk || true |
| check_gpu || true |
| echo "---" |
| print_status |
| echo "" |
| } |
|
|
| if $CHECK_ONCE; then |
| run_all_checks |
| exit 0 |
| fi |
|
|
| while true; do |
| run_all_checks |
| sleep "$CHECK_INTERVAL" |
| done |
|
|