#!/usr/bin/env bash
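#
# monitor_training.sh
#
# Periodically appends a status snapshot (running processes, GPU memory,
# checkpoints, prediction counts, recent log lines) to a monitor log.
# If the latest training log shows a CUDA out-of-memory error and nothing
# is running, it restarts the full pipeline once with max_seq_length=1024.
#
# Configuration (override via environment):
#   ROOT              project root (default: /home/ubuntu/Documents/MWave)
#   INTERVAL_SECONDS  seconds between snapshots (default: 1800 = 30 min)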
set -euo pipefail
ROOT="${ROOT:-/home/ubuntu/Documents/MWave}"
INTERVAL_SECONDS="${INTERVAL_SECONDS:-1800}"
MONITOR_LOG="$ROOT/outputs/logs/training_monitor.log"
STATE_FILE="$ROOT/outputs/logs/training_monitor.state"
mkdir -p "$ROOT/outputs/logs"
touch "$STATE_FILE"
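# Print the path of the newest train_resume_*.log, or nothing if none exists.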
latest_train_log() {
  ls -t "$ROOT"/outputs/logs/train_resume_*.log 2>/dev/null | head -n 1 || true
}
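# List training/evaluation processes still running, excluding this monitor.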
active_pipeline() {
  pgrep -af 'scripts/train_qlora.py|scripts/evaluate.py|train_resume_2048|train_resume_1024' | grep -v monitor_training || true
}
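# Show the three most recent checkpoint-* directories in the LoRA output dir.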
checkpoint_summary() {
  find "$ROOT/outputs/qwen35_9b_lora" -maxdepth 1 -type d -name 'checkpoint-*' 2>/dev/null | sort -V | tail -n 3 | xargs -r -n 1 basename
}
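# Report line counts for the finetuned prediction files (0 if a file is missing).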
prediction_counts() {
  for f in \
    "$ROOT/outputs/predictions/finetuned_struct_predictions.jsonl" \
    "$ROOT/outputs/predictions/finetuned_qa_predictions.jsonl"; do
    if [[ -f "$f" ]]; then
      wc -l "$f"
    else
      echo "0 $f"
    fi
  done
}
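# If the latest training log shows a CUDA OOM, the pipeline is idle, and no
# final metrics exist yet, wipe the partial output and relaunch the whole
# train/evaluate/visualize/report chain once at max_seq_length=1024.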
restart_1024_if_needed() {
  local log="$1"
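  # Attempt the fallback at most once per monitor run.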
  if grep -q 'fallback_1024_started=1' "$STATE_FILE"; then
    return 0
  fi
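  # Training or evaluation is still running; leave it alone.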
  if [[ -n "$(active_pipeline)" ]]; then
    return 0
  fi
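  # Final metrics already exist, so the pipeline completed successfully.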
  if [[ -f "$ROOT/outputs/metrics/finetuned_struct_metrics.json" ]]; then
    return 0
  fi
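  # Relaunch only when the latest log actually contains a CUDA OOM error.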
  if [[ -n "$log" ]] && grep -q 'torch\.OutOfMemoryError' "$log"; then
    echo "fallback_1024_started=1" >> "$STATE_FILE"
    local fallback_log="$ROOT/outputs/logs/train_resume_1024_$(date +%Y%m%d_%H%M%S).log"
    echo "[$(date '+%F %T %Z')] OOM detected. Restarting with max_seq_length=1024. log=$fallback_log" >> "$MONITOR_LOG"
    rm -rf "$ROOT/outputs/qwen35_9b_lora"
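    # Launch the full pipeline in a background subshell, logging to $fallback_log.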
    (
      cd "$ROOT"
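      # Clear any proxy and let the CUDA caching allocator use expandable
      # segments, which reduces fragmentation on the retry.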
      export ALL_PROXY=
      export all_proxy=
      export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
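      # Train, then evaluate both tasks, render figures, and build the report.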
      python3 scripts/train_qlora.py \
        --model-name Qwen/Qwen3.5-9B \
        --train-file data/processed/train_mixed.jsonl \
        --val-file data/processed/val_mixed.jsonl \
        --output-dir outputs/qwen35_9b_lora \
        --max-seq-length 1024 \
      && python3 scripts/evaluate.py \
        --model-name Qwen/Qwen3.5-9B \
        --adapter-dir outputs/qwen35_9b_lora \
        --input-file data/processed/val_struct.jsonl \
        --task-type struct \
        --output-dir outputs \
        --run-name finetuned \
      && python3 scripts/evaluate.py \
        --model-name Qwen/Qwen3.5-9B \
        --adapter-dir outputs/qwen35_9b_lora \
        --input-file data/processed/val_qa.jsonl \
        --task-type qa \
        --output-dir outputs \
        --run-name finetuned \
        --max-new-tokens 512 \
      && python3 scripts/visualize_results.py \
        --metrics outputs/metrics/base_struct_metrics.json outputs/metrics/base_qa_metrics.json outputs/metrics/finetuned_struct_metrics.json outputs/metrics/finetuned_qa_metrics.json \
        --predictions outputs/predictions/base_struct_predictions.jsonl outputs/predictions/finetuned_struct_predictions.jsonl \
        --out-dir outputs/figures \
      && python3 scripts/build_report.py
    ) > "$fallback_log" 2>&1 &
  fi
}
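# Append one status snapshot to the monitor log, then run the OOM fallback check.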
check_once() {
  local log
  log="$(latest_train_log)"
  {
    echo "===== $(date '+%F %T %Z') ====="
    echo "latest_train_log=${log:-none}"
    echo "-- active processes --"
    active_pipeline || true
    echo "-- gpu --"
    nvidia-smi --query-gpu=memory.used,memory.free,utilization.gpu --format=csv,noheader || true
    nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv,noheader || true
    echo "-- checkpoints --"
    checkpoint_summary || true
    echo "-- finetuned prediction counts --"
    prediction_counts || true
    if [[ -n "$log" ]]; then
      echo "-- recent progress --"
      grep -aoE '[0-9]+/6283|generated [0-9]+/4030|torch\.OutOfMemoryError|Traceback|train_runtime|eval_loss' "$log" | tail -n 20 || true
    fi
  } >> "$MONITOR_LOG"
  restart_1024_if_needed "$log"
}
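# Main loop: snapshot every INTERVAL_SECONDS until the monitor is killed.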
while true; do
  check_once
  sleep "$INTERVAL_SECONDS"
done