# frankenstallm / source /scripts /run_eval_full.sh
# pathcosmos's picture
# Upload folder using huggingface_hub (#17)
# 48ecd01
#!/usr/bin/env bash
# ============================================================
# run_eval_full.sh — Full Korean benchmark evaluation (target: 1.5-3 hours)
#
# Usage:
#   bash scripts/run_eval_full.sh [CHECKPOINT_DIR] [OUTPUT_DIR]
#
# Example:
#   bash scripts/run_eval_full.sh \
#     checkpoints/korean_1b_sft/checkpoint-0005000 \
#     eval/outputs/full_5000
#
# Tasks:
#   - KoBEST (5): boolq, copa, hellaswag, sentineg, wic
#   - HAE-RAE Bench (5): general_knowledge, history, loan_word, rare_word, standard_nomenclature
#   - Global MMLU Korean: 57 domains
#   - PAWS-Ko: paraphrase detection
#   - KorMedMCQA: Korean medical MCQ (optional)
#
# Total expected samples: ~15,000
# For a 1B model on 8×B200: about 1.5-3 hours
# ============================================================
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Directory holding this script, and its parent directory (the project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# ─── Argument handling ────────────────────────────────────
# $1: checkpoint path (optional), $2: output directory (optional).
# The default output directory is stamped with the start time.
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
OUTPUT_DIR="${2:-eval/outputs/full_${TIMESTAMP}}"

# Paths that are not absolute are resolved against the project root.
case "$CHECKPOINT" in
  /*) ;;
  *) CHECKPOINT="$PROJECT_DIR/$CHECKPOINT" ;;
esac
case "$OUTPUT_DIR" in
  /*) ;;
  *) OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR" ;;
esac
# ─── Settings ─────────────────────────────────────────────
# Where the HF-format copy of the checkpoint lives, and the tokenizer file
# handed to the conversion script.
HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"

# GPU setup: single GPU or tensor parallel.
# lm-eval's hf backend uses a single GPU by default.
# Multi GPU: --model_args "pretrained=...,parallelize=True" (automatic device_map)
USE_MULTI_GPU="${USE_MULTI_GPU:-0}"
if [[ "$USE_MULTI_GPU" == "1" ]]; then
  # Extra --model_args suffix for multi-GPU mode.
  MODEL_EXTRA_ARGS=",parallelize=True"
  echo "▶ 멀티 GPU 모드 활성화 (device_map=auto)"
else
  MODEL_EXTRA_ARGS=""
  # Single-GPU mode: default to device 0 unless the caller already chose one.
  CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
fi

# Both values can be overridden from the environment.
BATCH_SIZE="${BATCH_SIZE:-auto}"
NUM_FEWSHOT="${NUM_FEWSHOT:-0}"
# ─── νƒœμŠ€ν¬ μ •μ˜ ─────────────────────────────────────────
# Core Korean tasks (항상 μ‹€ν–‰)
TASKS_CORE="kobest,haerae,paws_ko"
# Extended tasks (μ‹œκ°„ μžˆμ„ λ•Œ)
TASKS_EXTENDED="global_mmlu_ko"
# 선택적 νƒœμŠ€ν¬
TASKS_OPTIONAL="kormedmcqa" # ν•œκ΅­μ–΄ μ˜ν•™ MCQ
# 전체 μ‹€ν–‰ νƒœμŠ€ν¬
TASKS="${TASKS_CORE},${TASKS_EXTENDED}"
# ─── μ˜μ‘΄μ„± 확인 ─────────────────────────────────────────
check_dep() {
python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2"; exit 1; }
}
# Verify the required Python packages; each entry is "<module>:<pip package>".
for dep_spec in lm_eval:lm-eval transformers:transformers safetensors:safetensors; do
  check_dep "${dep_spec%%:*}" "${dep_spec#*:}"
done

# Print the run configuration banner, one argument per output line.
printf '%s\n' \
  "==================================================" \
  " Ko-LLM Full Benchmark Evaluation" \
  "==================================================" \
  " Checkpoint : $CHECKPOINT" \
  " HF output : $HF_MODEL_DIR" \
  " Tasks : $TASKS" \
  " Few-shot : $NUM_FEWSHOT" \
  " Batch size : $BATCH_SIZE" \
  " Output : $OUTPUT_DIR" \
  " Multi-GPU : $USE_MULTI_GPU" \
  " Start time : $(date)" \
  "=================================================="

# All later steps append their output to this log inside the output directory.
LOG_FILE="$OUTPUT_DIR/eval_full.log"
mkdir -p "$OUTPUT_DIR"
# ─── Step 1: HF format conversion ─────────────────────────
# Convert the custom checkpoint to HF format unless a converted model
# (detected by its config.json) is already present.
echo ""
echo "▶ [1/3] 커스텀 체크포인트 → HF 포맷 변환..."
if [[ ! -f "$HF_MODEL_DIR/config.json" ]]; then
  # stdout and stderr of the converter are shown and appended to the log.
  python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
    --checkpoint "$CHECKPOINT" \
    --output "$HF_MODEL_DIR" \
    --tokenizer "$TOKENIZER" \
    2>&1 | tee -a "$LOG_FILE"
  echo "✅ HF 변환 완료: $HF_MODEL_DIR"
else
  echo " ↳ HF 모델 이미 존재, 변환 스킵: $HF_MODEL_DIR"
fi
# ─── Step 2: full evaluation ──────────────────────────────
echo ""
echo "▶ [2/3] lm-eval 전체 평가 시작..."
echo " ↳ 로그: $LOG_FILE"
START_TIME=$(date +%s)

# Build the lm-eval command line once. MODEL_EXTRA_ARGS is the suffix chosen
# in the GPU settings (",parallelize=True" in multi-GPU mode, empty otherwise),
# so both modes share a single argument list.
lm_eval_cmd=(
  python3 -m lm_eval
  --model hf
  --model_args "pretrained=$HF_MODEL_DIR,dtype=float16${MODEL_EXTRA_ARGS}"
  --tasks "$TASKS"
  --num_fewshot "$NUM_FEWSHOT"
  --batch_size "$BATCH_SIZE"
  --output_path "$OUTPUT_DIR"
  --log_samples
  --verbosity INFO
)

if [[ "$USE_MULTI_GPU" == "1" ]]; then
  "${lm_eval_cmd[@]}" 2>&1 | tee -a "$LOG_FILE"
else
  # Single-GPU mode: pass the selected device to this one command only.
  CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES" "${lm_eval_cmd[@]}" \
    2>&1 | tee -a "$LOG_FILE"
fi

# Wall-clock duration of the evaluation, reported as minutes and seconds.
END_TIME=$(date +%s)
ELAPSED=$(( END_TIME - START_TIME ))
echo ""
echo "✅ 평가 완료! 소요: $((ELAPSED/60))분 $((ELAPSED%60))초"
# ─── Step 3: generate the results summary report ──────────
#######################################
# Collect lm-eval result JSON files and write a Markdown summary.
# Arguments:
#   $1 - output directory that is searched recursively for *.json files
#        (files whose name contains "samples_" are ignored)
#   $2 - checkpoint path, recorded in the report header
# Outputs:
#   Writes <output dir>/SUMMARY.md (UTF-8) and prints the report to stdout;
#   warnings about unreadable files go to stderr.
# Returns:
#   0 on success, non-zero if the output directory does not exist or the
#   Python interpreter fails.
#######################################
generate_report() {
  local report_dir="$1"
  local report_checkpoint="$2"
  # PYTHONIOENCODING keeps the Korean/emoji output printable in non-UTF-8 locales.
  PYTHONIOENCODING=utf-8 python3 - "$report_dir" "$report_checkpoint" <<'PYEOF'
import glob
import json
import os
import sys
from datetime import datetime

TABLE_HEADER = ["| Task | Metric | Score |", "|------|--------|-------|"]
GROUP_PREFIXES = ("kobest_", "haerae", "global_mmlu_ko")


def metric_rows(metrics):
    """Yield (metric, value) pairs for numeric metrics, skipping stderr entries."""
    if not isinstance(metrics, dict):
        return
    for key, val in metrics.items():
        if "stderr" not in key and isinstance(val, (int, float)):
            yield key, val


def load_results(output_dir):
    """Merge the "results" mapping of every result JSON file under output_dir."""
    merged = {}
    # Escape the directory so characters such as "[" are not treated as a pattern.
    pattern = os.path.join(glob.escape(output_dir), "**", "*.json")
    for path in sorted(glob.glob(pattern, recursive=True)):
        if "samples_" in os.path.basename(path):
            continue
        try:
            with open(path, encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as exc:
            # Best effort: report the file and keep going instead of hiding it.
            print(f"warning: skipping unreadable file {path}: {exc}", file=sys.stderr)
            continue
        results = data.get("results") if isinstance(data, dict) else None
        if isinstance(results, dict) and results:
            merged.update(results)
    return merged


def table_section(title, all_results, tasks):
    """Return the Markdown table lines for one benchmark group."""
    lines = [title, *TABLE_HEADER]
    for task in sorted(tasks):
        for key, val in metric_rows(all_results[task]):
            lines.append(f"| {task} | {key} | {val:.4f} |")
    return lines


def build_report(all_results, checkpoint):
    """Return the report as a list of lines."""
    generated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        "# Ko-LLM Full Eval Report",
        f"Generated: {generated}",
        f"Checkpoint: {checkpoint}",
        "",
    ]

    # KoBEST summary
    kobest_tasks = [k for k in all_results if k.startswith("kobest_")]
    if kobest_tasks:
        lines += table_section("## KoBEST", all_results, kobest_tasks)

    # HAE-RAE summary
    haerae_tasks = [k for k in all_results if k.startswith("haerae")]
    if haerae_tasks:
        lines += table_section("\n## HAE-RAE Bench", all_results, haerae_tasks)

    # MMLU Ko summary (top level only: nothing with an extra "_" suffix)
    mmlu_top = {
        k: v
        for k, v in all_results.items()
        if k.startswith("global_mmlu_ko")
        and "_" not in k.replace("global_mmlu_ko", "")
    }
    if mmlu_top:
        lines.append("\n## Global MMLU (Korean)")
        for task, metrics in mmlu_top.items():
            for key, val in metric_rows(metrics):
                lines.append(f"- {task} {key}: {val:.4f}")

    # Everything that belongs to none of the groups above
    other_tasks = [k for k in all_results if not k.startswith(GROUP_PREFIXES)]
    if other_tasks:
        lines.append("\n## 기타 태스크")
        for task in sorted(other_tasks):
            for key, val in metric_rows(all_results[task]):
                lines.append(f"- {task} | {key}: {val:.4f}")
    return lines


def main(argv):
    output_dir = argv[1]
    checkpoint = argv[2] if len(argv) > 2 else "unknown"
    if not os.path.isdir(output_dir):
        print(f"error: output directory not found: {output_dir!r}", file=sys.stderr)
        return 1

    report_text = "\n".join(build_report(load_results(output_dir), checkpoint))
    report_path = os.path.join(output_dir, "SUMMARY.md")
    # Explicit UTF-8: the report contains Korean text.
    with open(report_path, "w", encoding="utf-8") as fh:
        fh.write(report_text + "\n")
    print(report_text)
    print(f"\n📄 리포트 저장: {report_path}")
    return 0


sys.exit(main(sys.argv))
PYEOF
}

echo ""
echo "▶ [3/3] 결과 리포트 생성..."
generate_report "$OUTPUT_DIR" "$CHECKPOINT"
# Closing banner: where the results, the summary report and the log ended up,
# plus the completion time.
echo ""
echo "=================================================="
echo "✅ 전체 평가 완료!"
echo " 결과 디렉토리: $OUTPUT_DIR"
echo " 요약 리포트 : $OUTPUT_DIR/SUMMARY.md"
echo " 전체 로그 : $LOG_FILE"
echo " 완료 시각 : $(date)"
echo "=================================================="