#!/usr/bin/env bash
# Full Korean LLM benchmark evaluation:
#   1. convert a training checkpoint to HuggingFace format,
#   2. run the lm-eval harness on Korean task suites,
#   3. aggregate results into a markdown summary report.
#
# Usage: run_full_eval.sh [CHECKPOINT] [OUTPUT_DIR]

set -euo pipefail

# Resolve the project root from this script's own location.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Positional arguments with defaults; the output directory is timestamped
# so repeated runs never clobber each other.
CHECKPOINT="${1:-checkpoints/korean_1b_sft/checkpoint-0005000}"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
OUTPUT_DIR="${2:-eval/outputs/full_${TIMESTAMP}}"
| |
|
# Relative paths are interpreted against the project root.
if [[ "$CHECKPOINT" != /* ]]; then
    CHECKPOINT="$PROJECT_DIR/$CHECKPOINT"
fi
if [[ "$OUTPUT_DIR" != /* ]]; then
    OUTPUT_DIR="$PROJECT_DIR/$OUTPUT_DIR"
fi

# Derived locations: converted-HF model output and the SentencePiece tokenizer.
HF_MODEL_DIR="$PROJECT_DIR/outputs/hf_$(basename "$CHECKPOINT")"
TOKENIZER="$PROJECT_DIR/tokenizer/korean_sp/tokenizer.json"
| |
|
| | |
| | |
| | |
# Multi-GPU: when USE_MULTI_GPU=1, lm-eval shards the model across GPUs
# (parallelize=True -> device_map=auto); otherwise pin to a single device
# (default GPU 0). MODEL_EXTRA_ARGS is appended to --model_args below.
# NOTE(review): original echo string was mojibake; reconstructed from the
# UTF-8 byte pattern — confirm wording against the project's other scripts.
USE_MULTI_GPU="${USE_MULTI_GPU:-0}"
if [ "$USE_MULTI_GPU" = "1" ]; then
    MODEL_EXTRA_ARGS=",parallelize=True"
    echo "▶ 멀티 GPU 모드 활성화 (device_map=auto)"
else
    MODEL_EXTRA_ARGS=""
    CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
fi

# Evaluation knobs, overridable from the environment.
BATCH_SIZE="${BATCH_SIZE:-auto}"
NUM_FEWSHOT="${NUM_FEWSHOT:-0}"
| |
|
| | |
| | |
# Task suites: core Korean benchmarks always run; extended adds Global MMLU.
TASKS_CORE="kobest,haerae,paws_ko"
TASKS_EXTENDED="global_mmlu_ko"
# Optional domain benchmark; defined for manual use, not part of the default run.
TASKS_OPTIONAL="kormedmcqa"

TASKS="${TASKS_CORE},${TASKS_EXTENDED}"
| |
|
| | |
# check_dep MODULE PACKAGE — abort with an install hint when the Python
# module cannot be imported. Fix: the diagnostic now goes to stderr so it
# survives stdout redirection/tee pipelines.
check_dep() {
    python3 -c "import $1" 2>/dev/null || { echo "❌ $1 not found. pip install $2" >&2; exit 1; }
}
check_dep lm_eval lm-eval
check_dep transformers transformers
check_dep safetensors safetensors
| |
|
# Print the run configuration banner (single heredoc instead of echo chain).
cat <<BANNER
==================================================
 Ko-LLM Full Benchmark Evaluation
==================================================
 Checkpoint : $CHECKPOINT
 HF output : $HF_MODEL_DIR
 Tasks : $TASKS
 Few-shot : $NUM_FEWSHOT
 Batch size : $BATCH_SIZE
 Output : $OUTPUT_DIR
 Multi-GPU : $USE_MULTI_GPU
 Start time : $(date)
==================================================
BANNER

mkdir -p "$OUTPUT_DIR"
LOG_FILE="$OUTPUT_DIR/eval_full.log"
| |
|
| | |
# Step 1: convert the training checkpoint to HuggingFace format. Skipped
# when a converted model (config.json) already exists at the target path.
# With `set -euo pipefail`, a conversion failure aborts despite the tee pipe.
# NOTE(review): Korean strings reconstructed from mojibake — verify wording.
echo ""
echo "▶ [1/3] 커스텀 체크포인트 → HF 포맷 변환..."

if [ ! -f "$HF_MODEL_DIR/config.json" ]; then
    python3 "$PROJECT_DIR/scripts/convert_to_hf.py" \
        --checkpoint "$CHECKPOINT" \
        --output "$HF_MODEL_DIR" \
        --tokenizer "$TOKENIZER" \
        2>&1 | tee -a "$LOG_FILE"
    echo "✅ HF 변환 완료: $HF_MODEL_DIR"
else
    echo "  ↳ HF 모델 이미 존재, 변환 스킵: $HF_MODEL_DIR"
fi
| |
|
| | |
# Step 2: run the lm-eval harness.
# Fix: the original duplicated the entire invocation across two branches and
# left MODEL_EXTRA_ARGS (set in the GPU config section) dead by hardcoding
# parallelize=True. Build --model_args once from MODEL_EXTRA_ARGS instead;
# single-GPU runs export the pinned CUDA_VISIBLE_DEVICES for the child process.
echo ""
echo "▶ [2/3] lm-eval 전체 평가 시작..."
echo "  ↳ 로그: $LOG_FILE"
START_TIME=$(date +%s)

MODEL_ARGS="pretrained=$HF_MODEL_DIR,dtype=float16${MODEL_EXTRA_ARGS}"
if [ "$USE_MULTI_GPU" != "1" ]; then
    # Set with a default in the config section above; export so python sees it.
    export CUDA_VISIBLE_DEVICES
fi

python3 -m lm_eval \
    --model hf \
    --model_args "$MODEL_ARGS" \
    --tasks "$TASKS" \
    --num_fewshot "$NUM_FEWSHOT" \
    --batch_size "$BATCH_SIZE" \
    --output_path "$OUTPUT_DIR" \
    --log_samples \
    --verbosity INFO \
    2>&1 | tee -a "$LOG_FILE"
| |
|
# Report wall-clock duration of the evaluation step.
# NOTE(review): success string reconstructed from mojibake (✅ + split line).
END_TIME=$(date +%s)
ELAPSED=$(( END_TIME - START_TIME ))
echo ""
echo "✅ 평가 완료! 소요: $((ELAPSED/60))분 $((ELAPSED%60))초"
| |
|
| | |
# Step 3: aggregate every lm-eval results JSON under OUTPUT_DIR into a
# grouped markdown report (SUMMARY.md), also echoed to stdout.
echo ""
echo "▶ [3/3] 결과 리포트 생성..."

python3 - "$OUTPUT_DIR" "$CHECKPOINT" <<'PYEOF'
"""Collect lm-eval result JSONs and render a grouped markdown summary."""
import glob
import json
import os
import sys
from datetime import datetime

output_dir = sys.argv[1]
checkpoint = sys.argv[2] if len(sys.argv) > 2 else "unknown"

# Result files only; per-sample dumps (samples_*.json) are skipped.
results_files = sorted(glob.glob(f"{output_dir}/**/*.json", recursive=True))
results_files = [f for f in results_files if "samples_" not in os.path.basename(f)]

# Merge the "results" mapping of every readable result file. Best-effort:
# unreadable or non-result JSON files are silently skipped (narrowed from
# the original bare `except Exception: pass`).
all_results = {}
for rf in results_files:
    try:
        with open(rf) as f:
            data = json.load(f)
    except (OSError, ValueError):
        continue
    if isinstance(data, dict):
        results = data.get("results", {})
        if results:
            all_results.update(results)


def scalar_metrics(metrics):
    """Yield (name, value) for numeric metrics, excluding *_stderr entries."""
    for key, val in metrics.items():
        if "stderr" not in key and isinstance(val, (int, float)):
            yield key, val


report_lines = [
    "# Ko-LLM Full Eval Report",
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Checkpoint: {checkpoint}",
    "",
]


def append_table(title, tasks):
    """Append one markdown table section for the given task names."""
    report_lines.append(title)
    report_lines.append("| Task | Metric | Score |")
    report_lines.append("|------|--------|-------|")
    for task in sorted(tasks):
        for key, val in scalar_metrics(all_results[task]):
            report_lines.append(f"| {task} | {key} | {val:.4f} |")


kobest_tasks = [k for k in all_results if k.startswith("kobest_")]
if kobest_tasks:
    append_table("## KoBEST", kobest_tasks)

haerae_tasks = [k for k in all_results if k.startswith("haerae")]
if haerae_tasks:
    append_table("\n## HAE-RAE Bench", haerae_tasks)

# Only the aggregate global_mmlu_ko entry; "_"-suffixed subtasks are excluded.
mmlu_top = {k: v for k, v in all_results.items()
            if k.startswith("global_mmlu_ko") and "_" not in k.replace("global_mmlu_ko", "")}
if mmlu_top:
    report_lines.append("\n## Global MMLU (Korean)")
    for task, metrics in mmlu_top.items():
        for key, val in scalar_metrics(metrics):
            report_lines.append(f"- {task} {key}: {val:.4f}")

# Everything not covered by a named section above.
other_tasks = [k for k in all_results
               if not k.startswith("kobest_")
               and not k.startswith("haerae")
               and not k.startswith("global_mmlu_ko")]
if other_tasks:
    report_lines.append("\n## 기타 태스크")
    for task in sorted(other_tasks):
        for key, val in scalar_metrics(all_results[task]):
            report_lines.append(f"- {task} | {key}: {val:.4f}")

report_path = os.path.join(output_dir, "SUMMARY.md")
with open(report_path, "w") as f:
    f.write("\n".join(report_lines))

print("\n".join(report_lines))
print(f"\n📄 리포트 저장: {report_path}")
PYEOF
| |
|
# Final summary of produced artifacts.
# NOTE(review): Korean labels reconstructed from mojibake; original column
# alignment was lost in extraction — confirm exact spacing if it matters.
echo ""
echo "=================================================="
echo "✅ 전체 평가 완료!"
echo " 결과 디렉토리: $OUTPUT_DIR"
echo " 요약 리포트 : $OUTPUT_DIR/SUMMARY.md"
echo " 전체 로그 : $LOG_FILE"
echo " 완료 시간 : $(date)"
echo "=================================================="
| |
|