| """ |
| Markdown report generator for FRANKENSTALLM 3B evaluation pipeline. |
| |
| Generates comprehensive evaluation reports with sections for: |
| - Perplexity metrics across datasets |
| - Calibration statistics |
| - Token NLL distribution |
| - Generation quality samples |
| - Repetition parameter search results |
| - Standard benchmark results (lm-eval) — Korean + English |
| - 0-shot vs 5-shot comparison |
| - Comparison with reference models |
| """ |
|
|
| from datetime import datetime |
| from pathlib import Path |
| from typing import Dict, List, Optional, Any, Tuple |
| import json |
| import logging |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| def _fmt_seconds(seconds: float) -> str: |
| """Format seconds into a human-readable duration string.""" |
| m, s = divmod(int(seconds), 60) |
| h, m = divmod(m, 60) |
| if h: |
| return f"{h}h {m}m {s}s" |
| if m: |
| return f"{m}m {s}s" |
| return f"{s}s" |
|
|
|
|
| |
| |
| |
|
|
| def _normalize_phase1_results(raw: dict) -> dict: |
| """Convert GPU-labelled phase1_results into logical sections. |
| |
| Returns dict with keys: perplexity, calibration, token_nll, generation, repetition. |
| """ |
| normalized: Dict[str, Any] = { |
| "perplexity": {}, |
| "calibration": {}, |
| "token_nll": {}, |
| "generation": {}, |
| "repetition": {}, |
| } |
|
|
| for label, data in raw.items(): |
| if not isinstance(data, (dict, list)): |
| continue |
|
|
| if "PPL" in label: |
| |
| if isinstance(data, dict) and "ppl" in data: |
| name = data.get("name", label) |
| normalized["perplexity"][name] = data |
| elif isinstance(data, list): |
| for item in data: |
| if isinstance(item, dict) and "ppl" in item: |
| name = item.get("name", f"unknown_{len(normalized['perplexity'])}") |
| normalized["perplexity"][name] = item |
| elif isinstance(data, dict) and "error" in data: |
| |
| pass |
| elif "Calibration" in label: |
| if isinstance(data, dict): |
| if "calibration" in data: |
| normalized["calibration"] = data["calibration"] |
| if "token_nll" in data: |
| normalized["token_nll"] = data["token_nll"] |
| elif "Generation" in label: |
| if isinstance(data, dict): |
| normalized["generation"] = data |
| elif "Repetition" in label: |
| if isinstance(data, dict): |
| normalized["repetition"] = data |
|
|
| return normalized |
|
|
|
|
| def _normalize_phase2_results(raw: dict) -> Tuple[Dict[str, Any], Dict[str, Any]]: |
| """Convert GPU-labelled phase2_results into flat task dicts for 0-shot and 5-shot. |
| |
| Returns (zero_shot_metrics, five_shot_metrics) where each is: |
| {"kobest_boolq": {"acc,none": 0.50, ...}, "haerae": {...}, ...} |
| """ |
| zero_shot: Dict[str, Any] = {} |
| five_shot: Dict[str, Any] = {} |
|
|
| for label, data in raw.items(): |
| if label == "5shot": |
| |
| if isinstance(data, dict): |
| for sub_label, sub_data in data.items(): |
| if isinstance(sub_data, dict) and "per_task_metrics" in sub_data: |
| for task_name, metrics in sub_data["per_task_metrics"].items(): |
| five_shot[task_name] = metrics |
| continue |
|
|
| if isinstance(data, dict) and "per_task_metrics" in data: |
| for task_name, metrics in data["per_task_metrics"].items(): |
| zero_shot[task_name] = metrics |
|
|
| return zero_shot, five_shot |
|
|
|
|
| def _get_acc(metrics: dict, prefer_norm: bool = False) -> Optional[float]: |
| """Extract accuracy from lm-eval metrics dict.""" |
| if prefer_norm and "acc_norm,none" in metrics: |
| val = metrics["acc_norm,none"] |
| if isinstance(val, (int, float)): |
| return float(val) |
| if "acc,none" in metrics: |
| val = metrics["acc,none"] |
| if isinstance(val, (int, float)): |
| return float(val) |
| return None |
|
|
|
|
| def _fmt_pct(val: Optional[float]) -> str: |
| """Format as percentage string or N/A.""" |
| if val is None: |
| return "N/A" |
| return f"{val * 100:.2f}%" |
|
|
|
|
| def _fmt_f(val, decimals: int = 4) -> str: |
| """Format float or return N/A.""" |
| if isinstance(val, (int, float)): |
| return f"{val:.{decimals}f}" |
| return str(val) if val is not None else "N/A" |
|
|
|
|
| |
| |
| |
|
|
def generate_report(
    phase1_results: dict,
    phase2_results: dict,
    generation_samples: list,
    output_dir: Path,
    checkpoint_name: str = "checkpoint-0057000",
    total_elapsed_sec: float = 0.0,
) -> str:
    """Generate the full markdown evaluation report.

    Normalizes the GPU-labelled result dicts from full_eval_pipeline.py,
    writes one markdown file per section under <output_dir>/reports/,
    then writes and returns the concatenated full report.
    """
    output_dir = Path(output_dir)
    reports_dir = output_dir / "reports"
    for d in (output_dir, reports_dir):
        d.mkdir(parents=True, exist_ok=True)

    p1 = _normalize_phase1_results(phase1_results)
    zero_shot, five_shot = _normalize_phase2_results(phase2_results)
    eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Build every section once, keyed by its output filename, in the
    # order they appear in the combined report.
    sections = [
        ("00_executive_summary.md",
         _generate_executive_summary(p1, zero_shot, five_shot, checkpoint_name,
                                     eval_datetime, total_elapsed_sec)),
        ("01_perplexity_report.md", _generate_perplexity_report(p1["perplexity"])),
        ("02_calibration_report.md",
         _generate_calibration_report(p1["calibration"], p1["token_nll"])),
        ("03_generation_quality.md",
         _generate_generation_report(p1["generation"], generation_samples)),
        ("04_benchmark_report.md",
         _generate_benchmark_report(zero_shot, five_shot, p1["repetition"])),
    ]
    for fname, content in sections:
        (reports_dir / fname).write_text(content, encoding="utf-8")

    # Single combined report with horizontal-rule separators.
    full_report = "\n\n---\n\n".join(content for _, content in sections)
    (output_dir / "full_eval_report.md").write_text(full_report, encoding="utf-8")

    return full_report
|
|
|
|
| |
| |
| |
|
|
def _generate_executive_summary(
    p1: dict,
    zero_shot: dict,
    five_shot: dict,
    checkpoint_name: str,
    eval_datetime: str,
    total_elapsed_sec: float,
) -> str:
    """Build the executive-summary markdown section (Korean headings).

    NOTE(review): several literals below contain mojibake from an earlier
    encoding mishap; they are kept byte-for-byte to avoid changing output.
    """
    lines = [
        "# FRANKENSTALLM 3B ์ขํฉ ํ๊ฐ ๋ฆฌํฌํธ\n",
        f"- **๋ชจ๋ธ**: FRANKENSTALLM 3B",
        f"- **์ฒดํฌํฌ์ธํธ**: {checkpoint_name}",
        f"- **ํ๊ฐ ์ผ์**: {eval_datetime}",
        f"- **์ด ์์ ์๊ฐ**: {total_elapsed_sec:.1f}์ด\n",
        "## Executive Summary\n",
    ]

    # Headline perplexity: first match wins ("3b" preferred over "3b_val").
    main_ppl = "N/A"
    ppl_data = p1.get("perplexity", {})
    for name in ["3b", "3b_val"]:
        if name in ppl_data and isinstance(ppl_data[name], dict):
            main_ppl = _fmt_f(ppl_data[name].get("ppl"))
            break

    # KoBEST: average accuracy over whichever of the five tasks ran.
    kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_accs = []
    for t in kobest_tasks:
        if t in zero_shot:
            a = _get_acc(zero_shot[t])
            if a is not None:
                kobest_accs.append(a)
    kobest_avg = _fmt_pct(sum(kobest_accs) / len(kobest_accs)) if kobest_accs else "N/A"

    # MMLU-KO: prefer the lm-eval group aggregate; otherwise average the
    # per-subject sub-task accuracies ourselves.
    mmlu_ko_avg = "N/A"
    mmlu_ko_count = 0
    if "global_mmlu_ko" in zero_shot:
        a = _get_acc(zero_shot["global_mmlu_ko"])
        if a is not None:
            mmlu_ko_avg = _fmt_pct(a)

        mmlu_ko_count = sum(
            1 for t in zero_shot
            if t.startswith("global_mmlu_ko_") and _get_acc(zero_shot[t]) is not None
        )
        # Group aggregate alone (no sub-tasks reported) still counts as one.
        if mmlu_ko_count == 0:
            mmlu_ko_count = 1
    else:
        mmlu_ko_accs = []
        for t, m in zero_shot.items():
            if t.startswith("global_mmlu_ko_"):
                a = _get_acc(m)
                if a is not None:
                    mmlu_ko_accs.append(a)
        if mmlu_ko_accs:
            mmlu_ko_avg = _fmt_pct(sum(mmlu_ko_accs) / len(mmlu_ko_accs))
            mmlu_ko_count = len(mmlu_ko_accs)

    # MMLU-EN: average sub-task accuracies, excluding group aggregates to
    # avoid double counting; fall back to the aggregates if no sub-tasks ran.
    _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"}
    mmlu_en_accs = []
    for t, m in zero_shot.items():
        if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS:
            a = _get_acc(m)
            if a is not None:
                mmlu_en_accs.append(a)
    if not mmlu_en_accs:
        for t in _MMLU_EN_GROUPS:
            if t in zero_shot:
                a = _get_acc(zero_shot[t])
                if a is not None:
                    mmlu_en_accs.append(a)
    mmlu_en_avg = _fmt_pct(sum(mmlu_en_accs) / len(mmlu_en_accs)) if mmlu_en_accs else "N/A"

    # HAE-RAE overall accuracy.
    haerae_acc = "N/A"
    if "haerae" in zero_shot:
        a = _get_acc(zero_shot["haerae"])
        if a is not None:
            haerae_acc = _fmt_pct(a)

    # Core English benchmarks; hellaswag / arc_challenge prefer acc_norm.
    en_benchmarks = {}
    for t in ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]:
        if t in zero_shot:
            a = _get_acc(zero_shot[t], prefer_norm=(t in ["hellaswag", "arc_challenge"]))
            if a is not None:
                en_benchmarks[t] = a

    # Calibration headline metric.
    top1 = _fmt_f(p1.get("calibration", {}).get("top1_accuracy"))

    lines.append("| ๋ฉํธ๋ฆญ | ๊ฐ |")
    lines.append("|--------|-----|")
    lines.append(f"| ์ฃผ์ PPL (3b_val) | {main_ppl} |")
    lines.append(f"| MMLU-KO ํ๊ท ({mmlu_ko_count}๊ณผ๋ชฉ) | {mmlu_ko_avg} |")
    lines.append(f"| MMLU-EN ํ๊ท | {mmlu_en_avg} |")
    lines.append(f"| KoBEST ํ๊ท ({len(kobest_accs)}ํ์คํฌ) | {kobest_avg} |")
    lines.append(f"| HAE-RAE | {haerae_acc} |")
    for t, a in en_benchmarks.items():
        lines.append(f"| {t} (0-shot) | {_fmt_pct(a)} |")
    lines.append(f"| Top-1 ์ ํ๋ (Calibration) | {top1} |")
    lines.append("")

    # Static ballpark numbers for similarly-sized open models.
    lines.append("## ์ฐธ๊ณ ๋ชจ๋ธ ๋น๊ต\n")
    lines.append("| ๋ชจ๋ธ | ํ๋ผ๋ฏธํฐ | MMLU-KO | MMLU-EN | KoBEST ํ๊ท | PPL |")
    lines.append("|------|---------|---------|---------|------------|-----|")
    lines.append(f"| **FRANKENSTALLM 3B** | 3B | {mmlu_ko_avg} | {mmlu_en_avg} | {kobest_avg} | {main_ppl} |")
    lines.append("| Llama-3.2-3B | 3B | ~42% | ~58% | ~55% | โ |")
    lines.append("| Qwen2.5-3B | 3B | ~48% | ~65% | ~60% | โ |")
    lines.append("| EXAONE-3.5-2.4B | 2.4B | ~35% | ~50% | ~50% | โ |")
    lines.append("")

    return "\n".join(lines)
|
|
|
|
def _generate_perplexity_report(ppl_data: dict) -> str:
    """Build the perplexity markdown section.

    Renders one table row per dataset, sorted by PPL descending so the
    hardest datasets come first; rows with a non-numeric PPL sort last.

    Args:
        ppl_data: Mapping of dataset name -> metrics dict containing at
            least "ppl"; optional keys: bits_per_token, n_tokens,
            n_eval_tokens, elapsed_sec.
    """

    def _fmt_count(v) -> str:
        # Thousands separator for numeric token counts, str() otherwise.
        # (Previously both counts fell back to unformatted output together
        # if EITHER of them was non-numeric.)
        return f"{v:,}" if isinstance(v, (int, float)) else str(v)

    lines = ["# Perplexity ํ๊ฐ\n"]

    if not ppl_data:
        lines.append("๋ฐ์ดํฐ ์์\n")
        return "\n".join(lines)

    rows = []
    for name, metrics in ppl_data.items():
        if isinstance(metrics, dict) and "ppl" in metrics:
            rows.append({
                "name": name,
                "ppl": metrics.get("ppl"),
                "bits": metrics.get("bits_per_token"),
                "n_tokens": metrics.get("n_tokens"),
                "n_eval": metrics.get("n_eval_tokens"),
                "elapsed": metrics.get("elapsed_sec"),
            })

    # Descending PPL. Use -inf as the placeholder so that, combined with
    # reverse=True, non-numeric PPLs land at the END of the table
    # (the previous +inf placeholder put them first).
    rows.sort(key=lambda x: x["ppl"] if isinstance(x["ppl"], (int, float)) else float("-inf"),
              reverse=True)

    lines.append("| ๋ฐ์ดํฐ์ | PPL | Bits/Token | ์ ์ฒด ํ ํฐ | ํ๊ฐ ํ ํฐ | ์์ ์๊ฐ |")
    lines.append("|---------|-----|-----------|---------|---------|---------|")
    for r in rows:
        lines.append(
            f"| {r['name']} | {_fmt_f(r['ppl'])} | {_fmt_f(r['bits'])} | "
            f"{_fmt_count(r['n_tokens'])} | {_fmt_count(r['n_eval'])} | "
            f"{_fmt_f(r['elapsed'], 1)}s |"
        )
    lines.append("")
    return "\n".join(lines)
|
|
|
|
def _generate_calibration_report(cal_data: dict, nll_data: dict) -> str:
    """Build the calibration + token-NLL markdown section.

    Args:
        cal_data: Calibration metrics (top-k accuracies, mean prob/entropy).
        nll_data: Token-NLL distribution stats. Supports both nested
            ("nll_percentiles", "high_loss_fractions") and flat
            ("high_loss_fraction_<t>") layouts, and both "nll_*" and
            bare stat key spellings.
    """
    lines = ["# Calibration ๋ฐ Token NLL ๋ถ์\n"]

    # --- Calibration table -----------------------------------------------
    lines.append("## Calibration ๊ฒฐ๊ณผ\n")
    if cal_data:
        lines.append("| ๋ฉํธ๋ฆญ | ๊ฐ |")
        lines.append("|--------|-----|")
        metrics_map = {
            "top1_accuracy": "Top-1 Accuracy",
            "top5_accuracy": "Top-5 Accuracy",
            "top10_accuracy": "Top-10 Accuracy",
            "mean_correct_prob": "Mean Correct Prob",
            "mean_entropy": "Mean Entropy",
        }
        for key, label in metrics_map.items():
            lines.append(f"| {label} | {_fmt_f(cal_data.get(key))} |")
        lines.append("")
    else:
        lines.append("๋ฐ์ดํฐ ์์\n")

    # --- Token NLL distribution ------------------------------------------
    lines.append("## Token NLL ๋ถํฌ\n")
    if nll_data:
        # Each stat may appear under a prefixed ("nll_mean") or bare
        # ("mean") key; first match wins.
        stats_map = [
            (["nll_mean", "mean"], "ํ๊ท "),
            (["nll_std", "std"], "ํ์คํธ์ฐจ"),
            (["nll_median", "median"], "์ค์๊ฐ"),
            (["nll_min", "min"], "์ต์๊ฐ"),
            (["nll_max", "max"], "์ต๋๊ฐ"),
        ]
        lines.append("| ํต๊ณ | ๊ฐ |")
        lines.append("|------|-----|")
        for candidates, label in stats_map:
            val = None
            for c in candidates:
                if c in nll_data:
                    val = nll_data[c]
                    break
            lines.append(f"| {label} | {_fmt_f(val)} |")
        lines.append("")

        # Optional percentile breakdown.
        pct_data = nll_data.get("nll_percentiles", nll_data.get("percentiles"))
        if pct_data and isinstance(pct_data, dict):
            lines.append("### Percentiles\n")
            lines.append("| Percentile | ๊ฐ |")
            lines.append("|------------|-----|")
            for pct, value in pct_data.items():
                lines.append(f"| {pct}th | {_fmt_f(value)} |")
            lines.append("")

        # High-loss token fractions: nested dict preferred, flat
        # "high_loss_fraction_<t>" keys as the fallback layout.
        hlf = nll_data.get("high_loss_fractions")
        if hlf and isinstance(hlf, dict):
            lines.append("### ๊ณ ์์ค ํ ํฐ ๋น์จ\n")
            lines.append("| ์๊ณ๊ฐ | ๋น์จ |")
            lines.append("|--------|-----|")
            for threshold, fraction in hlf.items():
                lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
            lines.append("")
        else:
            hlf_flat = {k.replace("high_loss_fraction_", ""): v
                        for k, v in nll_data.items()
                        if k.startswith("high_loss_fraction_")}
            if hlf_flat:
                lines.append("### ๊ณ ์์ค ํ ํฐ ๋น์จ\n")
                lines.append("| ์๊ณ๊ฐ | ๋น์จ |")
                lines.append("|--------|-----|")
                for threshold, fraction in sorted(hlf_flat.items()):
                    lines.append(f"| NLL > {threshold} | {_fmt_f(fraction)} |")
                lines.append("")
    else:
        lines.append("๋ฐ์ดํฐ ์์\n")

    return "\n".join(lines)
|
|
|
|
def _generate_generation_report(gen_data: dict, samples: list) -> str:
    """Build the generation-quality markdown section.

    Renders the summary-stat table (when present) and up to five greedy
    samples, truncating each generated text to 300 characters.
    """
    lines = ["# ์์ฑ ํ์ง ๋ถ์\n"]

    if gen_data and "summary" in gen_data:
        lines += ["## ์์ฝ ํต๊ณ\n", "| ๋ฉํธ๋ฆญ | ๊ฐ |", "|--------|-----|"]
        for key, value in gen_data["summary"].items():
            lines.append(f"| {key.replace('_', ' ').title()} | {_fmt_f(value)} |")
        lines.append("")

    if samples:
        lines.append("## ์์ฑ ์ํ (Greedy)\n")
        for idx, sample in enumerate(samples[:5], 1):
            if not isinstance(sample, dict):
                continue
            text = sample.get("generated_text", "")
            if len(text) > 300:
                text = text[:300] + "..."
            lines += [
                f"### ์ํ {idx}\n",
                f"**Prompt**: {sample.get('prompt', '')}\n",
                f"**Generated**: {text}\n",
                "",
            ]
    elif not gen_data:
        lines.append("๋ฐ์ดํฐ ์์\n")

    return "\n".join(lines)
|
|
|
|
def _generate_benchmark_report(
    zero_shot: dict,
    five_shot: dict,
    repetition: dict,
) -> str:
    """Build the standard-benchmark markdown section.

    Covers Korean benchmarks (KoBEST, HAE-RAE, MMLU-KO), English
    benchmarks (core 0-shot tasks, MMLU-EN), a 0-shot vs 5-shot diff
    table, and the repetition-parameter grid-search results.
    """
    lines = ["# ํ์ค ๋ฒค์น๋งํฌ ๊ฒฐ๊ณผ\n"]

    if not zero_shot and not five_shot:
        lines.append("๋ฐ์ดํฐ ์์\n")
        return "\n".join(lines)

    # --- Korean benchmarks -----------------------------------------------
    lines.append("## ํ๊ตญ์ด ๋ฒค์น๋งํฌ\n")

    # KoBEST: fixed five-task list; accuracy + F1 per task plus average.
    kobest_names = ["kobest_boolq", "kobest_copa", "kobest_hellaswag",
                    "kobest_sentineg", "kobest_wic"]
    kobest_0 = {t: zero_shot[t] for t in kobest_names if t in zero_shot}
    if kobest_0:
        lines.append("### KoBEST (0-shot)\n")
        lines.append("| ํ์คํฌ | Accuracy | F1 |")
        lines.append("|--------|----------|-----|")
        for t in kobest_names:
            if t in kobest_0:
                m = kobest_0[t]
                acc = _fmt_pct(_get_acc(m))
                f1 = _fmt_f(m.get("f1,none"))
                lines.append(f"| {t} | {acc} | {f1} |")
        kobest_accs = [_get_acc(kobest_0[t]) for t in kobest_names
                       if t in kobest_0 and _get_acc(kobest_0[t]) is not None]
        if kobest_accs:
            lines.append(f"| **ํ๊ท ** | **{_fmt_pct(sum(kobest_accs)/len(kobest_accs))}** | |")
        lines.append("")

    # HAE-RAE: overall accuracy plus optional per-subtask breakdown.
    if "haerae" in zero_shot:
        lines.append("### HAE-RAE (0-shot)\n")
        m = zero_shot["haerae"]
        lines.append(f"- Accuracy: {_fmt_pct(_get_acc(m))}")

        haerae_subs = {t: zero_shot[t] for t in zero_shot if t.startswith("haerae_") and t != "haerae"}
        if haerae_subs:
            lines.append("\n| ์๋ธํ์คํฌ | Accuracy |")
            lines.append("|-----------|----------|")
            for t, sm in sorted(haerae_subs.items()):
                lines.append(f"| {t} | {_fmt_pct(_get_acc(sm))} |")
        lines.append("")

    # MMLU-KO: per-subject sub-tasks with top/bottom-10 tables; the group
    # aggregate ("global_mmlu_ko") is preferred for the overall average.
    mmlu_ko_tasks = {t: zero_shot[t] for t in zero_shot
                     if t.startswith("global_mmlu_ko") and t != "global_mmlu_ko"}
    if mmlu_ko_tasks or "global_mmlu_ko" in zero_shot:
        lines.append("### MMLU-KO (0-shot)\n")
        if mmlu_ko_tasks:
            lines.append(f"ํ๊ฐ๋ ๊ณผ๋ชฉ ์: **{len(mmlu_ko_tasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(mmlu_ko_tasks.items())
                    if _get_acc(m) is not None]
            if accs:
                group_acc = _get_acc(zero_shot["global_mmlu_ko"]) if "global_mmlu_ko" in zero_shot else None
                avg_acc = group_acc if group_acc is not None else sum(a for _, a in accs) / len(accs)
                lines.append(f"์ ์ฒด ํ๊ท : **{_fmt_pct(avg_acc)}**\n")

                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**์์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")

                lines.append("**ํ์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("global_mmlu_ko_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "global_mmlu_ko" in zero_shot:
            # Only the group aggregate ran — no per-subject breakdown.
            a = _get_acc(zero_shot["global_mmlu_ko"])
            lines.append(f"์ ์ฒด ์ ํ๋: {_fmt_pct(a)}\n")

    # --- English benchmarks ----------------------------------------------
    lines.append("## ์์ด ๋ฒค์น๋งํฌ\n")

    en_tasks = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"]
    en_found = {t: zero_shot[t] for t in en_tasks if t in zero_shot}
    if en_found:
        lines.append("### ์ฃผ์ ๋ฒค์น๋งํฌ (0-shot)\n")
        lines.append("| ํ์คํฌ | Accuracy | Acc (norm) |")
        lines.append("|--------|----------|-----------|")
        for t in en_tasks:
            if t in en_found:
                m = en_found[t]
                acc = _fmt_pct(_get_acc(m))
                acc_norm = _fmt_pct(_get_acc(m, prefer_norm=True) if "acc_norm,none" in m else None)
                lines.append(f"| {t} | {acc} | {acc_norm} |")
        lines.append("")

    # MMLU-EN: same top/bottom-10 treatment as MMLU-KO; "mmlu" itself is
    # the group aggregate and excluded from the per-subject average.
    mmlu_en_tasks = {t: zero_shot[t] for t in zero_shot
                     if (t.startswith("mmlu_") or t == "mmlu") and not t.startswith("mmlu_ko")}
    if mmlu_en_tasks:
        lines.append("### MMLU-EN (0-shot)\n")
        subtasks = {t: m for t, m in mmlu_en_tasks.items() if t != "mmlu"}
        if subtasks:
            lines.append(f"ํ๊ฐ๋ ๊ณผ๋ชฉ ์: **{len(subtasks)}**\n")
            accs = [(t, _get_acc(m)) for t, m in sorted(subtasks.items())
                    if _get_acc(m) is not None]
            if accs:
                avg_acc = sum(a for _, a in accs) / len(accs)
                lines.append(f"์ ์ฒด ํ๊ท : **{_fmt_pct(avg_acc)}**\n")

                accs_sorted = sorted(accs, key=lambda x: x[1], reverse=True)
                lines.append("**์์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[:10]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")

                lines.append("**ํ์ 10๊ฐ ๊ณผ๋ชฉ**:\n")
                lines.append("| ๊ณผ๋ชฉ | Accuracy |")
                lines.append("|------|----------|")
                for t, a in accs_sorted[-10:]:
                    subject = t.replace("mmlu_", "")
                    lines.append(f"| {subject} | {_fmt_pct(a)} |")
                lines.append("")
        elif "mmlu" in mmlu_en_tasks:
            a = _get_acc(mmlu_en_tasks["mmlu"])
            lines.append(f"์ ์ฒด ์ ํ๋: {_fmt_pct(a)}\n")

    # --- 0-shot vs 5-shot comparison -------------------------------------
    if five_shot:
        lines.append("## 0-shot vs 5-shot ๋น๊ต\n")

        common_tasks = sorted(set(zero_shot.keys()) & set(five_shot.keys()))
        if common_tasks:
            lines.append("| ํ์คํฌ | 0-shot Acc | 5-shot Acc | ๋ณํ |")
            lines.append("|--------|-----------|-----------|------|")
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diff = a5 - a0
                    sign = "+" if diff >= 0 else ""
                    lines.append(
                        f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | {sign}{diff*100:.2f}pp |"
                    )
                else:
                    lines.append(f"| {t} | {_fmt_pct(a0)} | {_fmt_pct(a5)} | โ |")
            lines.append("")

            # Aggregate improvement stats over tasks with both results.
            diffs = []
            for t in common_tasks:
                a0 = _get_acc(zero_shot[t])
                a5 = _get_acc(five_shot[t])
                if a0 is not None and a5 is not None:
                    diffs.append(a5 - a0)
            if diffs:
                avg_diff = sum(diffs) / len(diffs)
                improved = sum(1 for d in diffs if d > 0)
                degraded = sum(1 for d in diffs if d < 0)
                lines.append(
                    f"ํ๊ท ๋ณํ: {'+' if avg_diff >= 0 else ''}{avg_diff*100:.2f}pp | "
                    f"๊ฐ์ : {improved} | ํ๋ฝ: {degraded} | ๋์ผ: {len(diffs) - improved - degraded}\n"
                )

    # --- Repetition grid search ------------------------------------------
    if repetition and repetition.get("grid_results"):
        lines.append("## Repetition ํ๋ผ๋ฏธํฐ ๊ฒ์\n")
        rep_data = repetition["grid_results"]
        rep_rows = []
        # grid_results may be either a dict keyed by config or a list.
        items = rep_data.items() if isinstance(rep_data, dict) else enumerate(rep_data)
        for key, metrics in items:
            if isinstance(metrics, dict):
                rep_rows.append({
                    "config": metrics.get("params", str(key)),
                    "temp": metrics.get("temperature"),
                    "rep_pen": metrics.get("repetition_penalty"),
                    "3gram": metrics.get("avg_3gram_rep", metrics.get("3gram_repetition", float("inf"))),
                    "4gram": metrics.get("avg_4gram_rep", metrics.get("4gram_repetition")),
                    "eos_rate": metrics.get("eos_rate"),
                    "avg_tokens": metrics.get("avg_tokens"),
                })
        # Lowest 3-gram repetition first; the first row is flagged as best.
        rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf"))

        lines.append("| ์ค์ | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |")
        lines.append("|------|------|---------|--------|--------|----------|-----------|")
        for i, r in enumerate(rep_rows):
            marker = " **โ best**" if i == 0 else ""
            lines.append(
                f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | "
                f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | "
                f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}"
            )
        lines.append("")

    lines.append("---\n")
    lines.append("*์ด ๋ฆฌํฌํธ๋ ์๋์ผ๋ก ์์ฑ๋์์ต๋๋ค.*")
    return "\n".join(lines)
|
|
|
|
| |
| |
| |
|
|
| |
# --- Base-model reference values (pre-SFT snapshot) ------------------------
# Used to quantify forgetting / regressions after SFT. Dataset keys appear
# in both "<name>" and "<name>_val" spellings because upstream result dicts
# are inconsistent about the suffix.
_BASE_PPL_REFERENCE = {
    "3b_val": 5.2263,
    "3b": 5.2263,
    "korean_c4_val": 5.7173,
    "korean_c4": 5.7173,
    "hplt_ko_val": 2.4028,
    "hplt_ko": 2.4028,
    "cc100_ko_val": 21.782,
    "cc100_ko": 21.782,
    "korean_val": 9.6505,
    "korean": 9.6505,
}


# Base-model 0-shot benchmark accuracies (lm-eval accuracy scale, 0-1).
_BASE_BENCH_REFERENCE = {
    "kobest_boolq": 0.5028,
    "kobest_copa": 0.4930,
    "kobest_hellaswag": 0.2160,
    "kobest_sentineg": 0.4861,
    "kobest_wic": 0.4865,
    "haerae": 0.1971,
    "global_mmlu_ko": 0.2275,
    "hellaswag": 0.2600,
    "arc_easy": 0.2563,
    "arc_challenge": 0.2167,
    "winogrande": 0.5059,
    "piqa": 0.5250,
}


# Base-model greedy-generation stats (n-gram repetition rates, EOS rate).
_BASE_GEN_REFERENCE = {
    "greedy_3gram_rep": 0.6099,
    "greedy_4gram_rep": 0.5702,
    "greedy_eos_rate": 0.0,
}


# Base-model calibration metrics (next-token top-k accuracy, entropy).
_BASE_CALIB_REFERENCE = {
    "top1_accuracy": 0.6875,
    "top5_accuracy": 0.8164,
    "top10_accuracy": 0.8593,
    "mean_entropy": 1.5682,
}


# Base-model token-NLL summary stats.
_BASE_NLL_REFERENCE = {
    "nll_mean": 1.5561,
    "high_loss_fraction_5": 0.1086,
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Quality gates the SFT model must clear. "*_min" keys are floors,
# "*_max" keys are ceilings; consumed by _compute_orpo_score and the
# verdict tables in the comparison report.
_SFT_TARGETS = {
    # Generation behavior
    "greedy_3gram_rep_max": 0.05,
    "eos_rate_min": 0.90,
    "sampled_eos_min": 0.50,
    "distinct_2_min": 0.70,
    # Forgetting (max allowed PPL regression vs Base, in percent)
    "ppl_forgetting_max_pct": 15.0,
    # Korean benchmarks
    "kobest_avg_min": 0.55,
    "haerae_min": 0.25,
    "mmlu_ko_min": 0.30,
    # Calibration
    "top1_accuracy_min": 0.65,
    # English retention
    "hellaswag_min": 0.25,
    "arc_easy_min": 0.25,
    "arc_challenge_min": 0.21,
    "winogrande_min": 0.49,
    "piqa_min": 0.51,
    "mmlu_en_avg_min": 0.25,
}


# Approximate published scores of comparable open models, shown for context
# in the comparison report (0-1 accuracy scale).
_REFERENCE_MODELS = {
    "Llama 3.2 1B": {"kobest_avg": 0.52, "mmlu_ko": 0.28, "mmlu_en": 0.32},
    "Llama 3.2 3B": {"kobest_avg": 0.56, "mmlu_ko": 0.35, "mmlu_en": 0.55},
    "Qwen 2.5 3B": {"kobest_avg": 0.58, "mmlu_ko": 0.42, "mmlu_en": 0.58},
}
|
|
|
|
def _compute_orpo_score(sft_p1, sft_zero, base_p1, base_zero):
    """Compute an absolute ORPO-necessity score (0-100).

    Scores seven dimensions (weights: forgetting 25, repetition 20,
    KoBEST 20, EOS 10, calibration 10, diversity 10, English 5). A
    dimension with missing data scores 0 and lowers `confidence`.

    Args:
        sft_p1: Normalized SFT phase-1 results (_normalize_phase1_results).
        sft_zero: SFT 0-shot per-task metrics.
        base_p1: Normalized Base phase-1 results (used for forgetting).
        base_zero: Base 0-shot per-task metrics.  # NOTE(review): unused here

    Returns:
        dict with keys: total_score, dimensions, decision, confidence,
        orpo_gain_estimate.
    """
    dimensions = {}
    missing = 0
    total_dims = 7

    # 1) PPL forgetting vs Base (weight 25): linear decay — full marks at
    #    0% regression, zero at/after the threshold.
    max_forgetting = _get_max_forgetting(sft_p1, base_p1)
    if max_forgetting is not None:
        threshold = _SFT_TARGETS["ppl_forgetting_max_pct"]
        score = 25 * max(0, 1 - max_forgetting / threshold)
        dimensions["ppl_forgetting"] = {
            "score": round(score, 1), "weight": 25,
            "current": round(max_forgetting, 1), "threshold": f"<{threshold}%",
            "status": "PASS" if max_forgetting < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["ppl_forgetting"] = {"score": 0, "weight": 25, "current": "N/A", "threshold": "<15%", "status": "N/A"}

    # 2) Greedy 3-gram repetition rate (weight 20): lower is better.
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    if rep_rate is not None:
        threshold = _SFT_TARGETS["greedy_3gram_rep_max"]
        score = 20 * max(0, 1 - rep_rate / threshold)
        dimensions["greedy_rep"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{rep_rate:.1%}", "threshold": f"<{threshold:.0%}",
            "status": "PASS" if rep_rate < threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["greedy_rep"] = {"score": 0, "weight": 20, "current": "N/A", "threshold": "<5%", "status": "N/A"}

    # 3) Greedy EOS termination rate (weight 10): higher is better,
    #    capped at full marks once the target is reached.
    eos_rate = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if eos_rate is not None:
        threshold = _SFT_TARGETS["eos_rate_min"]
        score = 10 * min(eos_rate / threshold, 1)
        dimensions["eos_rate"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{eos_rate:.0%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if eos_rate >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["eos_rate"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">90%", "status": "N/A"}

    # 4) KoBEST average accuracy (weight 20).
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is not None:
        threshold = _SFT_TARGETS["kobest_avg_min"]
        score = 20 * min(kobest_avg / threshold, 1)
        dimensions["kobest_avg"] = {
            "score": round(score, 1), "weight": 20,
            "current": f"{kobest_avg:.1%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if kobest_avg >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["kobest_avg"] = {"score": 0, "weight": 20, "current": "N/A", "threshold": ">55%", "status": "N/A"}

    # 5) Calibration top-1 accuracy (weight 10).
    top1 = sft_p1.get("calibration", {}).get("top1_accuracy")
    if top1 is not None:
        threshold = _SFT_TARGETS["top1_accuracy_min"]
        score = 10 * min(top1 / threshold, 1)
        dimensions["calibration"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{top1:.1%}", "threshold": f">={threshold:.0%}",
            "status": "PASS" if top1 >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["calibration"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">=65%", "status": "N/A"}

    # 6) Output diversity via greedy distinct-2 (weight 10).
    distinct_2 = sft_p1.get("generation", {}).get("summary", {}).get("greedy_avg_distinct_2")
    if distinct_2 is not None:
        threshold = _SFT_TARGETS["distinct_2_min"]
        score = 10 * min(distinct_2 / threshold, 1)
        dimensions["diversity"] = {
            "score": round(score, 1), "weight": 10,
            "current": f"{distinct_2:.0%}", "threshold": f">{threshold:.0%}",
            "status": "PASS" if distinct_2 >= threshold else "FAIL",
        }
    else:
        missing += 1
        dimensions["diversity"] = {"score": 0, "weight": 10, "current": "N/A", "threshold": ">70%", "status": "N/A"}

    # 7) English retention (weight 5): all-or-nothing across the five
    #    core English tasks that actually reported an accuracy.
    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    en_all_pass = True
    en_count = 0
    for t, threshold in en_tasks.items():
        a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        if a is not None:
            en_count += 1
            if a < threshold:
                en_all_pass = False
    if en_count > 0:
        score = 5.0 if en_all_pass else 0.0
        dimensions["english"] = {
            "score": score, "weight": 5,
            "current": "์ ๋ถ ํต๊ณผ" if en_all_pass else "์ผ๋ถ ๋ฏธ๋ฌ",
            "threshold": "โ", "status": "PASS" if en_all_pass else "FAIL",
        }
    else:
        missing += 1
        dimensions["english"] = {"score": 0, "weight": 5, "current": "N/A", "threshold": "โ", "status": "N/A"}

    total_score = sum(d["score"] for d in dimensions.values())
    # Confidence drops proportionally with the number of missing dimensions.
    confidence = round(1.0 - (missing / total_dims), 2)

    if missing >= 2:
        logger.warning("ORPO score has %d/%d missing dimensions โ confidence %.0f%%", missing, total_dims, confidence * 100)

    # Points ORPO itself could plausibly recover: only the generation-
    # behavior dimensions (repetition, EOS, diversity) respond to ORPO.
    orpo_improvable = 0.0
    if rep_rate is not None and rep_rate >= _SFT_TARGETS["greedy_3gram_rep_max"]:
        orpo_improvable += 20.0
    if eos_rate is not None and eos_rate < _SFT_TARGETS["eos_rate_min"]:
        orpo_improvable += 10.0
    if distinct_2 is not None and distinct_2 < _SFT_TARGETS["distinct_2_min"]:
        orpo_improvable += 5.0

    # Decision: DEPLOY on a high score; ORPO when mid-score with no
    # catastrophic forgetting; otherwise redo SFT.
    forgetting_ok = max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"]
    if total_score >= 80:
        decision = "DEPLOY"
    elif total_score >= 40 and forgetting_ok:
        decision = "ORPO"
    else:
        decision = "SFT_RETRY"

    return {
        "total_score": round(total_score, 1),
        "dimensions": dimensions,
        "decision": decision,
        "confidence": confidence,
        "orpo_gain_estimate": round(orpo_improvable, 1),
    }
|
|
|
|
| def generate_comparison_report( |
| base_results_dir: Path, |
| sft_phase1_results: dict, |
| sft_phase2_results: dict, |
| output_path: Path, |
| sft_output_dir: Optional[Path] = None, |
| total_elapsed_sec: float = 0.0, |
| ) -> Path: |
| """Generate a comprehensive Base vs SFT comparison report. |
| |
| Args: |
| base_results_dir: Directory containing Base model's phase1/phase2_results.json |
| sft_phase1_results: SFT Phase 1 results dict |
| sft_phase2_results: SFT Phase 2 results dict |
| output_path: Where to write the markdown report |
| sft_output_dir: SFT eval outputs directory (for linking) |
| total_elapsed_sec: Total pipeline elapsed time |
| |
| Returns: |
| Path to the generated report |
| """ |
| base_results_dir = Path(base_results_dir) |
| output_path = Path(output_path) |
| output_path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| |
| base_p1 = {} |
| base_p2 = {} |
| p1_file = base_results_dir / "phase1_results.json" |
| p2_file = base_results_dir / "phase2_results.json" |
| if p1_file.exists(): |
| with open(p1_file, encoding="utf-8") as f: |
| base_p1 = json.load(f) |
| if p2_file.exists(): |
| with open(p2_file, encoding="utf-8") as f: |
| base_p2 = json.load(f) |
|
|
| |
| sft_p1 = _normalize_phase1_results(sft_phase1_results) |
| base_p1_norm = _normalize_phase1_results(base_p1) |
| sft_zero, sft_five = _normalize_phase2_results(sft_phase2_results) |
| base_zero, base_five = _normalize_phase2_results(base_p2) |
|
|
| eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S") |
|
|
| lines = [] |
|
|
| |
| lines.append("# FRANKENSTALLM 3B SFT ๋ชจ๋ธ ๋ค๋ฉด์ ์ข
ํฉ ํ๊ฐ ๋ณด๊ณ ์\n") |
| lines.append(f"- **ํ๊ฐ ์ผ์**: {eval_datetime}") |
| lines.append(f"- **SFT ์ฒดํฌํฌ์ธํธ**: checkpoint-best (val_loss=1.8851, step 25500)") |
| lines.append(f"- **Base ์ฐธ์กฐ ๊ฒฐ๊ณผ**: 3b_reeval_20260305_1451") |
| lines.append(f"- **์ด ์์ ์๊ฐ**: {_fmt_seconds(total_elapsed_sec)}") |
| if sft_output_dir: |
| lines.append(f"- **๊ฒฐ๊ณผ ๋๋ ํ ๋ฆฌ**: {sft_output_dir}") |
| lines.append("") |
|
|
| |
| lines.append("## 1. Executive Summary\n") |
| verdicts = _compute_verdicts(sft_p1, sft_zero, base_p1_norm, base_zero) |
| lines.append("| ํ๊ฐ ์ฐจ์ | ๊ฒฐ๊ณผ | ์์ธ |") |
| lines.append("|----------|------|------|") |
| for dim_name, verdict, detail in verdicts: |
| icon = "PASS" if verdict else "FAIL" |
| lines.append(f"| {dim_name} | **{icon}** | {detail} |") |
| lines.append("") |
|
|
| pass_count = sum(1 for _, v, _ in verdicts if v) |
| total_dims = len(verdicts) |
| lines.append(f"**์ข
ํฉ**: {pass_count}/{total_dims} ์ฐจ์ ํต๊ณผ\n") |
|
|
| |
| rep_rate = _get_greedy_3gram_rep(sft_p1) |
| kobest_avg = _get_kobest_avg(sft_zero) |
| max_forgetting = _get_max_forgetting(sft_p1, base_p1_norm) |
|
|
| lines.append("### ORPO ํ์ (์ ๋ ์ค์ฝ์ด)\n") |
| orpo_result = _compute_orpo_score(sft_p1, sft_zero, base_p1_norm, base_zero) |
|
|
| lines.append(f"**๊ฒฐ์ **: {orpo_result['decision']} (ํ์ ๋: {orpo_result['confidence']:.0%})\n") |
| lines.append(f"**์ ๋ ์ค์ฝ์ด**: {orpo_result['total_score']}/100\n") |
|
|
| lines.append("| ์ฐจ์ | ์ ์ | /๊ฐ์ค์น | ํ์ฌ๊ฐ | ๊ธฐ์ค | ์ํ |") |
| lines.append("|------|------|--------|--------|------|------|") |
| dim_names = { |
| "ppl_forgetting": "PPL Forgetting", |
| "greedy_rep": "Greedy ๋ฐ๋ณต๋ฅ ", |
| "eos_rate": "EOS ์ข
๋ฃ์จ", |
| "kobest_avg": "KoBEST ํ๊ท ", |
| "calibration": "Calibration", |
| "diversity": "๋ค์์ฑ", |
| "english": "์์ด ์ ์ง", |
| } |
| for key, label in dim_names.items(): |
| d = orpo_result["dimensions"].get(key, {}) |
| lines.append( |
| f"| {label} | {d.get('score', 0)} | /{d.get('weight', 0)} | " |
| f"{d.get('current', 'N/A')} | {d.get('threshold', 'โ')} | {d.get('status', 'N/A')} |" |
| ) |
| lines.append("") |
|
|
| if orpo_result["orpo_gain_estimate"] > 0: |
| lines.append(f"**ORPO ๊ธฐ๋ ์ด๋**: +{orpo_result['orpo_gain_estimate']}์ " |
| f"(๋ฐ๋ณต๋ฅ /EOS/๋ค์์ฑ ๊ฐ์ ๊ธฐ๋, PPL/๋ฒค์น ๋ณํ ์์)\n") |
|
|
| |
| lines.append("**์ฐธ์กฐ ๋ชจ๋ธ ๋น๊ต**:\n") |
| for model_name, ref in _REFERENCE_MODELS.items(): |
| lines.append(f"- {model_name}: KoBEST={ref['kobest_avg']:.0%}, MMLU-KO={ref['mmlu_ko']:.0%}") |
| lines.append("") |
|
|
| |
| if orpo_result["decision"] == "DEPLOY": |
| lines.append("**โ Phase 4: GGUF + Ollama ๋ฐฐํฌ** (์ค์ฝ์ด โฅ80, ๋ชจ๋ ํต์ฌ ์กฐ๊ฑด ์ถฉ์กฑ)\n") |
| elif orpo_result["decision"] == "ORPO": |
| lines.append("**โ Phase 3: ORPO** (์ค์ฝ์ด 40-79, ์ง์ ๋ณด์กด ์ํธ, ์์ฑ ๊ฐ์ ํ์)\n") |
| else: |
| lines.append("**โ SFT ์ฌ์๋** (์ค์ฝ์ด <40 ๋๋ ์ฌ๊ฐํ forgetting)\n") |
|
|
| |
| lines.append("## 2. Perplexity ๋น๊ต (์ง์ ๋ณด์กด)\n") |
| lines.append("| ๋ฐ์ดํฐ์
| Base PPL | SFT PPL | ๋ณํ | Forgetting % | ํ์ |") |
| lines.append("|---------|---------|---------|------|-------------|------|") |
|
|
| sft_ppl = sft_p1.get("perplexity", {}) |
| base_ppl = base_p1_norm.get("perplexity", {}) |
|
|
| |
| all_ppl_names = sorted(set(list(sft_ppl.keys()) + list(base_ppl.keys()))) |
| forgetting_values = [] |
| for name in all_ppl_names: |
| sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None |
| base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None |
| |
| if base_val is None: |
| base_val = _BASE_PPL_REFERENCE.get(name) |
|
|
| if sft_val is not None and base_val is not None: |
| forgetting = (sft_val - base_val) / base_val * 100 |
| forgetting_values.append(forgetting) |
| verdict = "PASS" if forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"] else "FAIL" |
| lines.append( |
| f"| {name} | {base_val:.4f} | {sft_val:.4f} | " |
| f"{'+' if sft_val >= base_val else ''}{sft_val - base_val:.4f} | " |
| f"{forgetting:+.1f}% | {verdict} |" |
| ) |
| elif sft_val is not None: |
| lines.append(f"| {name} | โ | {sft_val:.4f} | โ | โ | โ |") |
| elif base_val is not None: |
| lines.append(f"| {name} | {base_val:.4f} | โ | โ | โ | โ |") |
|
|
| if forgetting_values: |
| avg_forgetting = sum(forgetting_values) / len(forgetting_values) |
| max_f = max(forgetting_values) |
| lines.append("") |
| lines.append(f"**ํ๊ท Forgetting**: {avg_forgetting:+.1f}% | **์ต๋**: {max_f:+.1f}% | " |
| f"**ํ์ **: {'PASS' if max_f < _SFT_TARGETS['ppl_forgetting_max_pct'] else 'FAIL'} (์๊ณ๊ฐ {_SFT_TARGETS['ppl_forgetting_max_pct']}%)") |
| lines.append("") |
|
|
| |
| lines.append("## 3. ์์ฑ ํ์ง ๋น๊ต\n") |
| sft_gen = sft_p1.get("generation", {}) |
| if not sft_gen: |
| logger.warning("Generation results missing from SFT Phase 1") |
| sft_summary = sft_gen.get("summary", {}) |
|
|
| lines.append("| ์งํ | Base | SFT | ๋ชฉํ | ํ์ |") |
| lines.append("|------|------|-----|------|------|") |
|
|
| greedy_3gram = sft_summary.get("greedy_avg_3gram_rep") |
| greedy_4gram = sft_summary.get("greedy_avg_4gram_rep") |
| eos_rate = sft_summary.get("greedy_eos_rate") |
|
|
| rep_threshold = _SFT_TARGETS["greedy_3gram_rep_max"] |
| eos_threshold = _SFT_TARGETS["eos_rate_min"] |
| greedy_3gram_verdict = "PASS" if greedy_3gram is not None and greedy_3gram < rep_threshold else "FAIL" |
| greedy_4gram_verdict = "PASS" if greedy_4gram is not None and greedy_4gram < 0.05 else "FAIL" |
| eos_verdict = "PASS" if eos_rate is not None and eos_rate >= eos_threshold else "FAIL" |
| lines.append(f"| Greedy 3-gram ๋ฐ๋ณต๋ฅ | {_BASE_GEN_REFERENCE['greedy_3gram_rep']:.2%} | " |
| f"{_fmt_pct(greedy_3gram)} | < {rep_threshold:.0%} | {greedy_3gram_verdict} |") |
| lines.append(f"| Greedy 4-gram ๋ฐ๋ณต๋ฅ | {_BASE_GEN_REFERENCE['greedy_4gram_rep']:.2%} | " |
| f"{_fmt_pct(greedy_4gram)} | < 5% | {greedy_4gram_verdict} |") |
| lines.append(f"| EOS ์ข
๋ฃ์จ | {_BASE_GEN_REFERENCE['greedy_eos_rate']:.0%} | " |
| f"{_fmt_pct(eos_rate)} | > {eos_threshold:.0%} | {eos_verdict} |") |
|
|
| sampled_3gram = sft_summary.get("sampled_avg_3gram_rep") |
| sampled_eos = sft_summary.get("sampled_eos_rate") |
| if sampled_3gram is not None: |
| lines.append(f"| Sampled 3-gram ๋ฐ๋ณต๋ฅ | โ | {sampled_3gram:.2%} | โ | โ |") |
| if sampled_eos is not None: |
| lines.append(f"| Sampled EOS ์ข
๋ฃ์จ | โ | {sampled_eos:.2%} | โ | โ |") |
| lines.append("") |
|
|
| |
| chat_status = "ํ์ฑํ" if sft_summary else "๋นํ์ฑํ" |
| lines.append(f"**Chat Template**: {chat_status}\n") |
|
|
| |
| if sft_gen.get("samples"): |
| lines.append("### ์์ฑ ์ํ (Greedy, Chat Template)\n") |
| greedy_samples = [s for s in sft_gen["samples"] if s.get("temperature") == 0.0] |
| for i, s in enumerate(greedy_samples[:5], 1): |
| prompt = s.get("prompt", "") |
| text = s.get("text", "")[:400] |
| hit_eos = s.get("hit_eos", False) |
| rep3 = s.get("3gram_rep", 0) |
| lines.append(f"**[{i}]** `{prompt}`") |
| lines.append(f"> {text}") |
| lines.append(f"> *EOS={hit_eos}, 3gram_rep={rep3:.2%}, tokens={s.get('generated_tokens', 0)}*\n") |
|
|
| |
| sft_rep = sft_p1.get("repetition", {}) |
| if sft_rep.get("grid_results"): |
| lines.append("### Repetition ํ๋ผ๋ฏธํฐ ๊ฒ์ ๊ฒฐ๊ณผ\n") |
| lines.append("| ์ค์ | 3-gram | EOS Rate | Avg Tokens |") |
| lines.append("|------|--------|----------|-----------|") |
| grid = sft_rep["grid_results"] |
| items = grid if isinstance(grid, list) else list(grid.values()) |
| for r in items[:6]: |
| if isinstance(r, dict): |
| lines.append( |
| f"| {r.get('params', '?')} | " |
| f"{_fmt_f(r.get('avg_3gram_rep'))} | " |
| f"{_fmt_f(r.get('eos_rate'))} | " |
| f"{_fmt_f(r.get('avg_tokens'), 1)} |" |
| ) |
| lines.append("") |
|
|
| |
| lines.append("## 4. ํ๊ตญ์ด ๋ฒค์น๋งํฌ\n") |
| lines.append("### KoBEST (0-shot)\n") |
| lines.append("| ํ์คํฌ | Base | SFT | ๋ณํ | ๋ชฉํ | ํ์ |") |
| lines.append("|--------|------|-----|------|------|------|") |
|
|
| kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag", |
| "kobest_sentineg", "kobest_wic"] |
| kobest_targets = {"kobest_boolq": 0.60, "kobest_copa": 0.65, |
| "kobest_hellaswag": 0.30, "kobest_sentineg": 0.60, |
| "kobest_wic": 0.55} |
| sft_kobest_accs = [] |
| base_kobest_accs = [] |
|
|
| for t in kobest_tasks: |
| base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t) |
| sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None |
| target = kobest_targets.get(t, 0.50) |
|
|
| if sft_a is not None: |
| sft_kobest_accs.append(sft_a) |
| if base_a is not None: |
| base_kobest_accs.append(base_a) |
|
|
| diff = "" |
| verdict = "โ" |
| if sft_a is not None and base_a is not None: |
| d = (sft_a - base_a) * 100 |
| diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| verdict = "PASS" if sft_a >= target else "FAIL" |
|
|
| lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | " |
| f"โฅ{target*100:.0f}% | {verdict} |") |
|
|
| if sft_kobest_accs: |
| sft_avg = sum(sft_kobest_accs) / len(sft_kobest_accs) |
| base_avg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else _BASE_BENCH_REFERENCE.get("kobest_avg", 0.4369) |
| diff_avg = (sft_avg - base_avg) * 100 |
| lines.append(f"| **ํ๊ท ** | **{base_avg*100:.2f}%** | **{sft_avg*100:.2f}%** | " |
| f"**{'+' if diff_avg >= 0 else ''}{diff_avg:.1f}pp** | " |
| f"**โฅ{_SFT_TARGETS['kobest_avg_min']*100:.0f}%** | **{'PASS' if sft_avg >= _SFT_TARGETS['kobest_avg_min'] else 'FAIL'}** |") |
| lines.append("") |
|
|
| |
| lines.append("### HAE-RAE (0-shot)\n") |
| base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae") |
| sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None |
| if sft_haerae is not None: |
| diff_h = (sft_haerae - (base_haerae or 0)) * 100 if base_haerae else 0 |
| lines.append(f"- Base: {_fmt_pct(base_haerae)} โ SFT: {_fmt_pct(sft_haerae)} " |
| f"({'+' if diff_h >= 0 else ''}{diff_h:.1f}pp) | " |
| f"๋ชฉํ โฅ{_SFT_TARGETS['haerae_min']*100:.0f}% | {'PASS' if sft_haerae >= _SFT_TARGETS['haerae_min'] else 'FAIL'}") |
| else: |
| lines.append(f"- Base: {_fmt_pct(base_haerae)} โ SFT: N/A") |
| lines.append("") |
|
|
| |
| lines.append("### MMLU-KO (0-shot)\n") |
| base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko") |
| sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None |
| if sft_mmlu_ko is not None: |
| diff_mk = (sft_mmlu_ko - (base_mmlu_ko or 0)) * 100 if base_mmlu_ko else 0 |
| lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ SFT: {_fmt_pct(sft_mmlu_ko)} " |
| f"({'+' if diff_mk >= 0 else ''}{diff_mk:.1f}pp) | " |
| f"๋ชฉํ โฅ{_SFT_TARGETS['mmlu_ko_min']*100:.0f}% | {'PASS' if sft_mmlu_ko >= _SFT_TARGETS['mmlu_ko_min'] else 'FAIL'}") |
| else: |
| lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ SFT: N/A") |
| lines.append("") |
|
|
| |
| if sft_five: |
| lines.append("### 5-shot ๋น๊ต (ํ๊ตญ์ด)\n") |
| lines.append("| ํ์คํฌ | 0-shot | 5-shot | ๋ณํ |") |
| lines.append("|--------|--------|--------|------|") |
| for t in kobest_tasks + ["haerae", "global_mmlu_ko"]: |
| a0 = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None |
| a5 = _get_acc(sft_five.get(t, {})) if t in sft_five else None |
| if a0 is not None and a5 is not None: |
| d = (a5 - a0) * 100 |
| lines.append(f"| {t} | {a0*100:.2f}% | {a5*100:.2f}% | {'+' if d >= 0 else ''}{d:.1f}pp |") |
| lines.append("") |
|
|
| |
| lines.append("## 5. ์์ด ๋ฒค์น๋งํฌ (์ ์ง ํ์ธ)\n") |
| lines.append("| ํ์คํฌ | Base | SFT | ๋ณํ | ํํ | ํ์ |") |
| lines.append("|--------|------|-----|------|------|------|") |
|
|
| en_tasks = { |
| "hellaswag": _SFT_TARGETS["hellaswag_min"], |
| "arc_easy": _SFT_TARGETS["arc_easy_min"], |
| "arc_challenge": _SFT_TARGETS["arc_challenge_min"], |
| "winogrande": _SFT_TARGETS["winogrande_min"], |
| "piqa": _SFT_TARGETS["piqa_min"], |
| } |
| for t, threshold in en_tasks.items(): |
| base_a = _get_acc(base_zero.get(t, {}), prefer_norm=(t in ["hellaswag", "arc_challenge"])) \ |
| if t in base_zero else _BASE_BENCH_REFERENCE.get(t) |
| sft_a = _get_acc(sft_zero.get(t, {}), prefer_norm=(t in ["hellaswag", "arc_challenge"])) \ |
| if t in sft_zero else None |
| diff = "" |
| verdict = "โ" |
| if sft_a is not None and base_a is not None: |
| d = (sft_a - base_a) * 100 |
| diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| verdict = "PASS" if sft_a >= threshold else "FAIL" |
| lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {diff} | " |
| f"โฅ{threshold*100:.0f}% | {verdict} |") |
|
|
| |
| _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"} |
| sft_mmlu_en = [] |
| base_mmlu_en = [] |
| for t, m in sft_zero.items(): |
| if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS: |
| a = _get_acc(m) |
| if a is not None: |
| sft_mmlu_en.append(a) |
| if not sft_mmlu_en: |
| for t in _MMLU_EN_GROUPS: |
| if t in sft_zero: |
| a = _get_acc(sft_zero[t]) |
| if a is not None: |
| sft_mmlu_en.append(a) |
| for t, m in base_zero.items(): |
| if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS: |
| a = _get_acc(m) |
| if a is not None: |
| base_mmlu_en.append(a) |
| if not base_mmlu_en: |
| for t in _MMLU_EN_GROUPS: |
| if t in base_zero: |
| a = _get_acc(base_zero[t]) |
| if a is not None: |
| base_mmlu_en.append(a) |
|
|
| sft_mmlu_en_avg = sum(sft_mmlu_en) / len(sft_mmlu_en) if sft_mmlu_en else None |
| base_mmlu_en_avg = sum(base_mmlu_en) / len(base_mmlu_en) if base_mmlu_en else 0.2581 |
| if sft_mmlu_en_avg is not None: |
| d = (sft_mmlu_en_avg - base_mmlu_en_avg) * 100 |
| lines.append(f"| MMLU-EN ํ๊ท | {base_mmlu_en_avg*100:.2f}% | {sft_mmlu_en_avg*100:.2f}% | " |
| f"{'+' if d >= 0 else ''}{d:.1f}pp | โฅ25% | " |
| f"{'PASS' if sft_mmlu_en_avg >= _SFT_TARGETS['mmlu_en_avg_min'] else 'FAIL'} |") |
| lines.append("") |
|
|
| |
| lines.append("## 6. Calibration ๋น๊ต\n") |
| sft_cal = sft_p1.get("calibration", {}) |
| lines.append("| ์งํ | Base | SFT | ๋ชฉํ | ํ์ |") |
| lines.append("|------|------|-----|------|------|") |
|
|
| cal_checks = [ |
| ("top1_accuracy", "Top-1 Accuracy", _SFT_TARGETS["top1_accuracy_min"], True), |
| ("top5_accuracy", "Top-5 Accuracy", 0.78, True), |
| ("top10_accuracy", "Top-10 Accuracy", 0.82, True), |
| ("mean_entropy", "Mean Entropy", 2.0, False), |
| ] |
| for key, label, threshold, is_higher_better in cal_checks: |
| base_v = _BASE_CALIB_REFERENCE.get(key) |
| sft_v = sft_cal.get(key) |
| verdict = "โ" |
| if sft_v is not None: |
| if is_higher_better: |
| verdict = "PASS" if sft_v >= threshold else "FAIL" |
| else: |
| verdict = "PASS" if sft_v <= threshold else "FAIL" |
| lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | " |
| f"{'โฅ' if is_higher_better else '<'}{threshold} | {verdict} |") |
|
|
| |
| sft_nll = sft_p1.get("token_nll", {}) |
| nll_mean = sft_nll.get("nll_mean", sft_nll.get("mean")) |
| base_nll_mean = _BASE_NLL_REFERENCE.get("nll_mean") |
| if nll_mean is not None: |
| lines.append(f"| Token NLL mean | {_fmt_f(base_nll_mean)} | {_fmt_f(nll_mean)} | " |
| f"< 2.0 | {'PASS' if nll_mean < 2.0 else 'FAIL'} |") |
| hlf5 = sft_nll.get("high_loss_fractions", {}).get("5", sft_nll.get("high_loss_fraction_5")) |
| base_hlf5 = _BASE_NLL_REFERENCE.get("high_loss_fraction_5") |
| if hlf5 is not None: |
| lines.append(f"| NLL > 5 ๋น์จ | {_fmt_f(base_hlf5)} | {_fmt_f(hlf5)} | " |
| f"< 0.15 | {'PASS' if hlf5 < 0.15 else 'FAIL'} |") |
| lines.append("") |
|
|
| |
| lines.append("## 7. ์ข
ํฉ ํ์ ๋ฐ ๋ค์ ๋จ๊ณ\n") |
|
|
| lines.append("### ํต์ฌ ํ์ ๊ธฐ์ค\n") |
| lines.append("| ์กฐ๊ฑด | ํ์ฌ ๊ฐ | ๊ธฐ์ค | ์ถฉ์กฑ |") |
| lines.append("|------|---------|------|------|") |
|
|
| rep_val = rep_rate |
| lines.append(f"| Greedy 3-gram ๋ฐ๋ณต๋ฅ | {_fmt_pct(rep_val)} | < {_SFT_TARGETS['greedy_3gram_rep_max']:.0%} | " |
| f"{'YES' if rep_val is not None and rep_val < _SFT_TARGETS['greedy_3gram_rep_max'] else 'NO'} |") |
| lines.append(f"| KoBEST ํ๊ท | {_fmt_pct(kobest_avg)} | > {_SFT_TARGETS['kobest_avg_min']*100:.0f}% | " |
| f"{'YES' if kobest_avg is not None and kobest_avg > _SFT_TARGETS['kobest_avg_min'] else 'NO'} |") |
| lines.append(f"| ์ต๋ Forgetting | {f'{max_forgetting:.1f}%' if max_forgetting is not None else 'N/A'} | " |
| f"< {_SFT_TARGETS['ppl_forgetting_max_pct']}% | {'YES' if max_forgetting is not None and max_forgetting < _SFT_TARGETS['ppl_forgetting_max_pct'] else 'NO'} |") |
| lines.append("") |
|
|
| |
| lines.append("### ๊ถ๊ณ \n") |
| orpo_result = _compute_orpo_score(sft_p1, sft_zero, base_p1_norm, base_zero) |
| orpo_score = orpo_result["total_score"] |
| orpo_decision = orpo_result["decision"] |
|
|
| all_core_pass = ( |
| rep_rate is not None and rep_rate < _SFT_TARGETS["greedy_3gram_rep_max"] |
| and kobest_avg is not None and kobest_avg > _SFT_TARGETS["kobest_avg_min"] |
| and max_forgetting is not None and max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"] |
| ) |
|
|
| if all_core_pass: |
| lines.append("**๋ชจ๋ ํต์ฌ ์กฐ๊ฑด ์ถฉ์กฑ โ Phase 4: GGUF ๋ณํ + Ollama ๋ฐฐํฌ ์งํ**\n") |
| elif orpo_decision == "ORPO": |
| lines.append(f"**ORPO ํ์ ์ค์ฝ์ด {orpo_score:.1f}/100 โ Phase 3: ORPO ํ์ต ์งํ** (795K preference pairs ํ์ฉ)\n") |
| lines.append("ORPO ํ์ต ์ ์ฃผ์์ :") |
| lines.append("- Greedy ๋ฐ๋ณต๋ฅ ๊ฐ์ (ํ์ฌ 72.97% โ ๋ชฉํ <5%)") |
| lines.append("- EOS ์ข
๋ฃ์จ ๊ฐ์ (ํ์ฌ 60% โ ๋ชฉํ >90%)") |
| lines.append("- ๋ฒค์น๋งํฌ ์ ์ ์ ์ง/ํฅ์") |
| lines.append("- ์ง์ ๋ณด์กด ์ ์ง (ํ์ฌ forgetting 0.9%)") |
| elif orpo_decision == "SKIP_ORPO": |
| lines.append("**ORPO ๋ถํ์ โ Phase 4: GGUF ๋ณํ + Ollama ๋ฐฐํฌ ์งํ**\n") |
| else: |
| lines.append("**ํต์ฌ ์กฐ๊ฑด ๋ฏธ๋ฌ โ SFT ์ฌ์๋**\n") |
| lines.append("์ฌ์๋ ์ ๊ฒํ ์ฌํญ:") |
| lines.append("- ํ์ต๋ฅ ์กฐ์ ") |
| lines.append("- ๋ฐ์ดํฐ ๊ตฌ์ฑ ์ฌ๊ฒํ ") |
| lines.append("- ์ํญ ์ ์กฐ์ ") |
| lines.append("") |
|
|
| lines.append("---\n") |
| lines.append("*์ด ๋ณด๊ณ ์๋ `eval/sft_eval_pipeline.py`์ ์ํด ์๋ ์์ฑ๋์์ต๋๋ค.*") |
|
|
| report_text = "\n".join(lines) |
| output_path.write_text(report_text, encoding="utf-8") |
|
|
| |
| if sft_output_dir: |
| (Path(sft_output_dir) / "sft_comparison_report.md").write_text(report_text, encoding="utf-8") |
|
|
| return output_path |
|
|
|
|
def _compute_verdicts(sft_p1: dict, sft_zero: dict, base_p1: dict, base_zero: dict) -> List[Tuple[str, bool, str]]:
    """Compute pass/fail verdicts for each of the 6 evaluation dimensions.

    Args:
        sft_p1: Normalized SFT Phase 1 results (keys: perplexity, generation,
            calibration, ... as produced by _normalize_phase1_results).
        sft_zero: SFT 0-shot benchmark results keyed by lm-eval task name.
        base_p1: Normalized Base-model Phase 1 results (forgetting baseline).
        base_zero: Base 0-shot results. NOTE(review): not referenced anywhere
            in this function body; presumably kept for signature symmetry with
            callers — confirm before removing.

    Returns:
        A list of exactly 6 (dimension_name, passed, detail) tuples in fixed
        order. A dimension with no usable data is reported as failed with a
        "no data" detail string.
    """
    verdicts = []

    # Dimension 1: knowledge retention — the worst per-dataset PPL increase
    # relative to Base must stay below the configured forgetting threshold.
    max_forgetting = _get_max_forgetting(sft_p1, base_p1)
    if max_forgetting is not None:
        verdicts.append((
            "์ฐจ์ 1: Perplexity (์ง์ ๋ณด์กด)",
            max_forgetting < _SFT_TARGETS["ppl_forgetting_max_pct"],
            f"์ต๋ forgetting {max_forgetting:.1f}% (์๊ณ๊ฐ {_SFT_TARGETS['ppl_forgetting_max_pct']}%)",
        ))
    else:
        verdicts.append(("์ฐจ์ 1: Perplexity (์ง์ ๋ณด์กด)", False, "๋ฐ์ดํฐ ์์"))

    # Dimension 2: generation quality — greedy 3-gram repetition must be below
    # its ceiling AND greedy EOS termination rate above its floor.
    # NOTE(review): EOS is compared with strict '>' here, while the report
    # table code elsewhere in this module uses '>=' for the same threshold —
    # boundary values are judged differently; confirm which is intended.
    rep_rate = _get_greedy_3gram_rep(sft_p1)
    eos_rate = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    if rep_rate is not None and eos_rate is not None:
        gen_pass = rep_rate < _SFT_TARGETS["greedy_3gram_rep_max"] and eos_rate > _SFT_TARGETS["eos_rate_min"]
        verdicts.append((
            "์ฐจ์ 2: ์์ฑ ํ์ง",
            gen_pass,
            f"๋ฐ๋ณต๋ฅ {rep_rate:.2%} (๋ชฉํ <{_SFT_TARGETS['greedy_3gram_rep_max']:.0%}), EOS {eos_rate:.0%} (๋ชฉํ >{_SFT_TARGETS['eos_rate_min']:.0%})",
        ))
    else:
        verdicts.append(("์ฐจ์ 2: ์์ฑ ํ์ง", False, "๋ฐ์ดํฐ ์์"))

    # Dimension 3: Korean benchmarks — KoBEST 5-task average above target.
    kobest_avg = _get_kobest_avg(sft_zero)
    if kobest_avg is not None:
        verdicts.append((
            "์ฐจ์ 3: ํ๊ตญ์ด ๋ฒค์น๋งํฌ",
            kobest_avg > _SFT_TARGETS["kobest_avg_min"],
            f"KoBEST ํ๊ท {kobest_avg*100:.2f}% (๋ชฉํ >{_SFT_TARGETS['kobest_avg_min']*100:.0f}%)",
        ))
    else:
        verdicts.append(("์ฐจ์ 3: ํ๊ตญ์ด ๋ฒค์น๋งํฌ", False, "๋ฐ์ดํฐ ์์"))

    # Dimension 4: English benchmark retention — every English task with a
    # score must be at or above its per-task floor. en_pass starts True, so a
    # task absent from the results cannot fail the dimension; the dimension is
    # only reported as missing when NO task produced an accuracy at all.
    en_tasks = {
        "hellaswag": _SFT_TARGETS["hellaswag_min"],
        "arc_easy": _SFT_TARGETS["arc_easy_min"],
        "arc_challenge": _SFT_TARGETS["arc_challenge_min"],
        "winogrande": _SFT_TARGETS["winogrande_min"],
        "piqa": _SFT_TARGETS["piqa_min"],
    }
    en_pass = True
    en_detail_parts = []
    for t, threshold in en_tasks.items():
        a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None
        if a is not None:
            if a < threshold:
                en_pass = False
            en_detail_parts.append(f"{t}={a*100:.1f}%")
    if en_detail_parts:
        verdicts.append((
            "์ฐจ์ 4: ์์ด ๋ฒค์น๋งํฌ",
            en_pass,
            # Show at most 3 task scores to keep the summary-table cell short.
            ", ".join(en_detail_parts[:3]) + ("..." if len(en_detail_parts) > 3 else ""),
        ))
    else:
        verdicts.append(("์ฐจ์ 4: ์์ด ๋ฒค์น๋งํฌ", False, "๋ฐ์ดํฐ ์์"))

    # Dimension 5: calibration — next-token Top-1 accuracy at or above target.
    cal = sft_p1.get("calibration", {})
    top1 = cal.get("top1_accuracy")
    if top1 is not None:
        cal_pass = top1 >= _SFT_TARGETS["top1_accuracy_min"]
        verdicts.append((
            "์ฐจ์ 5: Calibration",
            cal_pass,
            f"Top-1 {top1*100:.2f}% (๋ชฉํ โฅ{_SFT_TARGETS['top1_accuracy_min']*100:.0f}%)",
        ))
    else:
        verdicts.append(("์ฐจ์ 5: Calibration", False, "๋ฐ์ดํฐ ์์"))

    # Dimension 6: chat-ability proxy — reuses the greedy EOS rate computed in
    # dimension 2. Passing only means the model terminates more than half the
    # time (> 50%); the detail string explicitly asks for manual sample review.
    if eos_rate is not None:
        chat_pass = eos_rate > 0.50
        verdicts.append((
            "์ฐจ์ 6: SFT Chat ๋ฅ๋ ฅ",
            chat_pass,
            f"EOS ์ข
๋ฃ์จ {eos_rate:.0%}, ์์ฑ ์ํ ์๋ ๊ฒํ ํ์",
        ))
    else:
        verdicts.append(("์ฐจ์ 6: SFT Chat ๋ฅ๋ ฅ", False, "๋ฐ์ดํฐ ์์"))

    return verdicts
|
|
|
|
| def _get_greedy_3gram_rep(p1: dict) -> Optional[float]: |
| gen = p1.get("generation", {}) |
| return gen.get("summary", {}).get("greedy_avg_3gram_rep") |
|
|
|
|
def _get_kobest_avg(zero_shot: dict) -> Optional[float]:
    """Average accuracy over the five KoBEST sub-tasks found in *zero_shot*.

    Tasks that are absent, or whose accuracy cannot be extracted by
    ``_get_acc``, are skipped. Returns None when no usable score exists.
    """
    tasks = (
        "kobest_boolq",
        "kobest_copa",
        "kobest_hellaswag",
        "kobest_sentineg",
        "kobest_wic",
    )
    # Extract accuracies for present tasks, then drop unextractable (None) ones.
    extracted = (_get_acc(zero_shot[t]) for t in tasks if t in zero_shot)
    scores = [a for a in extracted if a is not None]
    if not scores:
        return None
    return sum(scores) / len(scores)
|
|
|
|
| def _get_max_forgetting(sft_p1: dict, base_p1: dict) -> Optional[float]: |
| sft_ppl = sft_p1.get("perplexity", {}) |
| base_ppl = base_p1.get("perplexity", {}) |
| forgetting_values = [] |
| for name in sft_ppl: |
| sft_val = sft_ppl[name].get("ppl") if isinstance(sft_ppl[name], dict) else None |
| base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None |
| if base_val is None: |
| base_val = _BASE_PPL_REFERENCE.get(name) |
| if sft_val is not None and base_val is not None and base_val > 0: |
| forgetting_values.append((sft_val - base_val) / base_val * 100) |
| return max(forgetting_values) if forgetting_values else None |
|
|
|
|
| |
| |
| |
|
|
def _compute_orpo_verdicts(
    orpo_p1: dict,
    orpo_zero: dict,
    sft_p1: dict,
    sft_zero: dict,
    training_curve: Optional[dict] = None,
) -> List[Tuple[str, bool, str]]:
    """Compute the 4 ORPO-specific evaluation dimensions.

    Args:
        orpo_p1: Normalized ORPO Phase 1 results (generation and repetition
            sections are read).
        orpo_zero: ORPO 0-shot results. NOTE(review): not referenced in this
            function body — confirm whether it is still needed.
        sft_p1: Normalized SFT Phase 1 results (generation baseline for ORPO-4).
        sft_zero: SFT 0-shot results. NOTE(review): not referenced in this
            function body either.
        training_curve: Optional dict with an "eval_steps" list of per-step
            metric dicts; the LAST entry supplies the final preference
            accuracy (ORPO-1) and reward margin (ORPO-2).

    Returns list of (dimension_name, pass_bool, detail_string).
    """
    verdicts: List[Tuple[str, bool, str]] = []

    # ORPO-1: final preference accuracy (last eval step) must exceed 65%.
    # Two key spellings are accepted: "rewards_accuracies" (preferred) with
    # "preference_accuracy" as a fallback.
    pref_acc = None
    if training_curve and training_curve.get("eval_steps"):
        last_step = training_curve["eval_steps"][-1]
        pref_acc = last_step.get("rewards_accuracies", last_step.get("preference_accuracy"))
    if pref_acc is not None:
        verdicts.append((
            "ORPO-1: Preference Accuracy",
            pref_acc > 0.65,
            f"์ต์ข
{pref_acc:.2%} (๋ชฉํ > 65%)",
        ))
    else:
        verdicts.append(("ORPO-1: Preference Accuracy", False, "๋ฐ์ดํฐ ์์"))

    # ORPO-2: final chosen-vs-rejected reward margin must exceed 0.1.
    # Accepts "rewards_margins" with "reward_margins" as a fallback key.
    reward_margin = None
    if training_curve and training_curve.get("eval_steps"):
        last_step = training_curve["eval_steps"][-1]
        reward_margin = last_step.get("rewards_margins", last_step.get("reward_margins"))
    if reward_margin is not None:
        verdicts.append((
            "ORPO-2: Reward Margins",
            reward_margin > 0.1,
            f"์ต์ข
{reward_margin:.4f} (๋ชฉํ > 0.1)",
        ))
    else:
        verdicts.append(("ORPO-2: Reward Margins", False, "๋ฐ์ดํฐ ์์"))

    # ORPO-3: parameter sensitivity — with repetition_penalty exactly 1.0
    # (i.e. no penalty applied), the 3-gram repetition rate must already be
    # under 5%. Only the FIRST grid entry matching rp == 1.0 is inspected;
    # the loop breaks there even if that entry lacks a repetition value.
    rep_grid = orpo_p1.get("repetition", {}).get("grid_results")
    param_sens_pass = False
    param_sens_detail = "๋ฐ์ดํฐ ์์"
    if rep_grid:
        # grid_results may be a list of rows or a dict of rows keyed by label.
        items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values())
        for r in items:
            if isinstance(r, dict):
                rp = r.get("repetition_penalty", r.get("rep_penalty"))
                # float(rp) + tolerance: the penalty may arrive as str or float.
                if rp is not None and abs(float(rp) - 1.0) < 1e-6:
                    rep_val = r.get("avg_3gram_rep", r.get("3gram_repetition"))
                    if rep_val is not None:
                        param_sens_pass = rep_val < 0.05
                        param_sens_detail = f"rep_penalty=1.0 ์ 3-gram rep={rep_val:.2%} (๋ชฉํ < 5%)"
                    break
    verdicts.append((
        "ORPO-3: Parameter Sensitivity",
        param_sens_pass,
        param_sens_detail,
    ))

    # ORPO-4: ORPO must improve on SFT in BOTH greedy metrics — repetition
    # rate strictly lower AND EOS termination rate strictly higher. If any of
    # the four metrics is missing, the dimension is reported as no-data.
    sft_rep = _get_greedy_3gram_rep(sft_p1)
    orpo_rep = _get_greedy_3gram_rep(orpo_p1)
    sft_eos = sft_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")
    orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate")

    if all(v is not None for v in [sft_rep, orpo_rep, sft_eos, orpo_eos]):
        rep_improved = orpo_rep < sft_rep
        eos_improved = orpo_eos > sft_eos
        verdicts.append((
            "ORPO-4: SFTโORPO ๊ฐ์ ",
            rep_improved and eos_improved,
            f"๋ฐ๋ณต๋ฅ {sft_rep:.2%}โ{orpo_rep:.2%} ({'โ' if rep_improved else 'โ'}), "
            f"EOS {sft_eos:.0%}โ{orpo_eos:.0%} ({'โ' if eos_improved else 'โ'})",
        ))
    else:
        verdicts.append(("ORPO-4: SFTโORPO ๊ฐ์ ", False, "๋ฐ์ดํฐ ์์"))

    return verdicts
|
|
|
|
| |
| |
| |
|
|
| def generate_three_way_report( |
| base_results_dir: Path, |
| sft_results_dir: Path, |
| orpo_phase1_results: dict, |
| orpo_phase2_results: dict, |
| output_path: Path, |
| orpo_output_dir: Optional[Path] = None, |
| training_curve: Optional[dict] = None, |
| total_elapsed_sec: float = 0.0, |
| ) -> Path: |
| """Generate a comprehensive Base vs SFT vs ORPO 3-way comparison report. |
| |
| Args: |
| base_results_dir: Directory containing Base model's phase1/phase2_results.json |
| sft_results_dir: Directory containing SFT model's phase1/phase2_results.json |
| orpo_phase1_results: ORPO Phase 1 results dict |
| orpo_phase2_results: ORPO Phase 2 results dict |
| output_path: Where to write the markdown report |
| orpo_output_dir: ORPO eval outputs directory (for linking) |
| training_curve: Dict with "eval_steps" list of per-step metrics |
| total_elapsed_sec: Total pipeline elapsed time |
| |
| Returns: |
| Path to the generated report |
| """ |
| base_results_dir = Path(base_results_dir) |
| sft_results_dir = Path(sft_results_dir) |
| output_path = Path(output_path) |
| output_path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| |
| base_p1_raw, base_p2_raw = {}, {} |
| p1_file = base_results_dir / "phase1_results.json" |
| p2_file = base_results_dir / "phase2_results.json" |
| if p1_file.exists(): |
| with open(p1_file, encoding="utf-8") as f: |
| base_p1_raw = json.load(f) |
| if p2_file.exists(): |
| with open(p2_file, encoding="utf-8") as f: |
| base_p2_raw = json.load(f) |
|
|
| |
| sft_p1_raw, sft_p2_raw = {}, {} |
| sft_p1_file = sft_results_dir / "phase1_results.json" |
| sft_p2_file = sft_results_dir / "phase2_results.json" |
| if sft_p1_file.exists(): |
| with open(sft_p1_file, encoding="utf-8") as f: |
| sft_p1_raw = json.load(f) |
| if sft_p2_file.exists(): |
| with open(sft_p2_file, encoding="utf-8") as f: |
| sft_p2_raw = json.load(f) |
|
|
| |
| base_p1 = _normalize_phase1_results(base_p1_raw) |
| base_zero, base_five = _normalize_phase2_results(base_p2_raw) |
| sft_p1 = _normalize_phase1_results(sft_p1_raw) |
| sft_zero, sft_five = _normalize_phase2_results(sft_p2_raw) |
| orpo_p1 = _normalize_phase1_results(orpo_phase1_results) |
| orpo_zero, orpo_five = _normalize_phase2_results(orpo_phase2_results) |
|
|
| eval_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S") |
|
|
| lines: List[str] = [] |
|
|
| |
| |
| |
| lines.append("# FRANKENSTALLM 3B ORPO ๋ชจ๋ธ ์ข
ํฉ ํ๊ฐ ๋ณด๊ณ ์\n") |
| lines.append(f"- **ํ๊ฐ ์ผ์**: {eval_datetime}") |
| lines.append(f"- **๋น๊ต ๋์**: Base โ SFT โ ORPO") |
| lines.append(f"- **์ด ์์ ์๊ฐ**: {_fmt_seconds(total_elapsed_sec)}") |
| if orpo_output_dir: |
| lines.append(f"- **๊ฒฐ๊ณผ ๋๋ ํ ๋ฆฌ**: {orpo_output_dir}") |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 1. Executive Summary\n") |
|
|
| |
| std_verdicts = _compute_verdicts(orpo_p1, orpo_zero, base_p1, base_zero) |
| |
| orpo_verdicts = _compute_orpo_verdicts(orpo_p1, orpo_zero, sft_p1, sft_zero, training_curve) |
|
|
| all_verdicts = std_verdicts + orpo_verdicts |
|
|
| lines.append("| # | ํ๊ฐ ์ฐจ์ | ๊ฒฐ๊ณผ | ์์ธ |") |
| lines.append("|---|----------|------|------|") |
| for i, (dim_name, verdict, detail) in enumerate(all_verdicts, 1): |
| icon = "PASS" if verdict else "FAIL" |
| lines.append(f"| {i} | {dim_name} | **{icon}** | {detail} |") |
| lines.append("") |
|
|
| pass_count = sum(1 for _, v, _ in all_verdicts if v) |
| total_dims = len(all_verdicts) |
| lines.append(f"**์ข
ํฉ**: {pass_count}/{total_dims} ์ฐจ์ ํต๊ณผ\n") |
|
|
| |
| orpo_score_result = _compute_orpo_score(orpo_p1, orpo_zero, base_p1, base_zero) |
| lines.append(f"**์ ๋ ์ค์ฝ์ด**: {orpo_score_result['total_score']}/100\n") |
|
|
| |
| orpo_rep = _get_greedy_3gram_rep(orpo_p1) |
| orpo_eos = orpo_p1.get("generation", {}).get("summary", {}).get("greedy_eos_rate") |
| orpo_forgetting = _get_max_forgetting(orpo_p1, base_p1) |
| orpo_kobest = _get_kobest_avg(orpo_zero) |
|
|
| deploy_criteria_met = ( |
| orpo_rep is not None and orpo_rep < 0.05 |
| and orpo_eos is not None and orpo_eos > 0.90 |
| and orpo_forgetting is not None and orpo_forgetting < 5.0 |
| and orpo_kobest is not None and orpo_kobest >= 0.43 |
| ) |
| final_decision = "DEPLOY" if deploy_criteria_met else "RETRY" |
| lines.append(f"**์ต์ข
ํ์ **: **{final_decision}**\n") |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 2. ํ์ต ๊ณก์ ๋ถ์\n") |
| if training_curve and training_curve.get("eval_steps"): |
| eval_steps = training_curve["eval_steps"] |
|
|
| lines.append("### Training / Eval Loss\n") |
| lines.append("| Step | Train Loss | Eval Loss | Pref Accuracy | Reward Margin |") |
| lines.append("|------|-----------|-----------|---------------|---------------|") |
| for step_data in eval_steps: |
| step = step_data.get("step", "?") |
| train_loss = _fmt_f(step_data.get("train_loss", step_data.get("loss")), 4) |
| eval_loss = _fmt_f(step_data.get("eval_loss"), 4) |
| pref_acc = _fmt_f(step_data.get("rewards_accuracies", step_data.get("preference_accuracy")), 4) |
| reward_m = _fmt_f(step_data.get("rewards_margins", step_data.get("reward_margins")), 4) |
| lines.append(f"| {step} | {train_loss} | {eval_loss} | {pref_acc} | {reward_m} |") |
| lines.append("") |
|
|
| |
| first_step = eval_steps[0] |
| last_step = eval_steps[-1] |
| lines.append("### ํ์ต ๊ณก์ ์์ฝ\n") |
| first_loss = first_step.get("train_loss", first_step.get("loss")) |
| last_loss = last_step.get("train_loss", last_step.get("loss")) |
| if first_loss is not None and last_loss is not None: |
| lines.append(f"- **Train Loss**: {first_loss:.4f} โ {last_loss:.4f}") |
| first_eval = first_step.get("eval_loss") |
| last_eval = last_step.get("eval_loss") |
| if first_eval is not None and last_eval is not None: |
| lines.append(f"- **Eval Loss**: {first_eval:.4f} โ {last_eval:.4f}") |
| last_pref = last_step.get("rewards_accuracies", last_step.get("preference_accuracy")) |
| if last_pref is not None: |
| lines.append(f"- **์ต์ข
Preference Accuracy**: {last_pref:.2%}") |
| last_margin = last_step.get("rewards_margins", last_step.get("reward_margins")) |
| if last_margin is not None: |
| lines.append(f"- **์ต์ข
Reward Margin**: {last_margin:.4f}") |
| lines.append("") |
| else: |
| lines.append("ํ์ต ๊ณก์ ๋ฐ์ดํฐ ์์\n") |
|
|
| |
| |
| |
| lines.append("## 3. Perplexity ๋น๊ต (์ง์ ๋ณด์กด)\n") |
| lines.append("| ๋ฐ์ดํฐ์
| Base PPL | SFT PPL | ORPO PPL | SFT Forgetting | ORPO Forgetting |") |
| lines.append("|---------|---------|---------|---------|----------------|-----------------|") |
|
|
| base_ppl = base_p1.get("perplexity", {}) |
| sft_ppl = sft_p1.get("perplexity", {}) |
| orpo_ppl = orpo_p1.get("perplexity", {}) |
|
|
| all_ppl_names = sorted(set( |
| list(base_ppl.keys()) + list(sft_ppl.keys()) + list(orpo_ppl.keys()) |
| )) |
| for name in all_ppl_names: |
| base_val = base_ppl.get(name, {}).get("ppl") if isinstance(base_ppl.get(name), dict) else None |
| if base_val is None: |
| base_val = _BASE_PPL_REFERENCE.get(name) |
| sft_val = sft_ppl.get(name, {}).get("ppl") if isinstance(sft_ppl.get(name), dict) else None |
| orpo_val = orpo_ppl.get(name, {}).get("ppl") if isinstance(orpo_ppl.get(name), dict) else None |
|
|
| sft_forg = f"{(sft_val - base_val) / base_val * 100:+.1f}%" if (sft_val is not None and base_val is not None and base_val > 0) else "โ" |
| orpo_forg = f"{(orpo_val - base_val) / base_val * 100:+.1f}%" if (orpo_val is not None and base_val is not None and base_val > 0) else "โ" |
|
|
| lines.append( |
| f"| {name} | {_fmt_f(base_val)} | {_fmt_f(sft_val)} | {_fmt_f(orpo_val)} | " |
| f"{sft_forg} | {orpo_forg} |" |
| ) |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 4. ์์ฑ ํ์ง ๋น๊ต\n") |
|
|
| base_gen_summary = base_p1.get("generation", {}).get("summary", {}) |
| sft_gen_summary = sft_p1.get("generation", {}).get("summary", {}) |
| orpo_gen_summary = orpo_p1.get("generation", {}).get("summary", {}) |
|
|
| base_3gram = base_gen_summary.get("greedy_avg_3gram_rep", _BASE_GEN_REFERENCE.get("greedy_3gram_rep")) |
| sft_3gram = sft_gen_summary.get("greedy_avg_3gram_rep") |
| orpo_3gram = orpo_gen_summary.get("greedy_avg_3gram_rep") |
|
|
| base_4gram = base_gen_summary.get("greedy_avg_4gram_rep", _BASE_GEN_REFERENCE.get("greedy_4gram_rep")) |
| sft_4gram = sft_gen_summary.get("greedy_avg_4gram_rep") |
| orpo_4gram = orpo_gen_summary.get("greedy_avg_4gram_rep") |
|
|
| base_eos = base_gen_summary.get("greedy_eos_rate", _BASE_GEN_REFERENCE.get("greedy_eos_rate")) |
| sft_eos_val = sft_gen_summary.get("greedy_eos_rate") |
| orpo_eos_val = orpo_gen_summary.get("greedy_eos_rate") |
|
|
| lines.append("| ์งํ | Base | SFT | ORPO | SFTโORPO ๋ณํ |") |
| lines.append("|------|------|-----|------|---------------|") |
|
|
| |
| sft_orpo_3gram_diff = "" |
| if sft_3gram is not None and orpo_3gram is not None: |
| d = (orpo_3gram - sft_3gram) * 100 |
| sft_orpo_3gram_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| lines.append(f"| Greedy 3-gram ๋ฐ๋ณต๋ฅ | {_fmt_pct(base_3gram)} | {_fmt_pct(sft_3gram)} | " |
| f"{_fmt_pct(orpo_3gram)} | {sft_orpo_3gram_diff} |") |
|
|
| |
| sft_orpo_4gram_diff = "" |
| if sft_4gram is not None and orpo_4gram is not None: |
| d = (orpo_4gram - sft_4gram) * 100 |
| sft_orpo_4gram_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| lines.append(f"| Greedy 4-gram ๋ฐ๋ณต๋ฅ | {_fmt_pct(base_4gram)} | {_fmt_pct(sft_4gram)} | " |
| f"{_fmt_pct(orpo_4gram)} | {sft_orpo_4gram_diff} |") |
|
|
| |
| sft_orpo_eos_diff = "" |
| if sft_eos_val is not None and orpo_eos_val is not None: |
| d = (orpo_eos_val - sft_eos_val) * 100 |
| sft_orpo_eos_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| lines.append(f"| EOS ์ข
๋ฃ์จ | {_fmt_pct(base_eos)} | {_fmt_pct(sft_eos_val)} | " |
| f"{_fmt_pct(orpo_eos_val)} | {sft_orpo_eos_diff} |") |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 5. ํ๊ตญ์ด ๋ฒค์น๋งํฌ\n") |
|
|
| |
| lines.append("### KoBEST (0-shot)\n") |
| lines.append("| ํ์คํฌ | Base | SFT | ORPO | BaseโORPO |") |
| lines.append("|--------|------|-----|------|-----------|") |
|
|
| kobest_tasks = ["kobest_boolq", "kobest_copa", "kobest_hellaswag", |
| "kobest_sentineg", "kobest_wic"] |
| base_kobest_accs, sft_kobest_accs, orpo_kobest_accs = [], [], [] |
|
|
| for t in kobest_tasks: |
| base_a = _get_acc(base_zero.get(t, {})) if t in base_zero else _BASE_BENCH_REFERENCE.get(t) |
| sft_a = _get_acc(sft_zero.get(t, {})) if t in sft_zero else None |
| orpo_a = _get_acc(orpo_zero.get(t, {})) if t in orpo_zero else None |
|
|
| if base_a is not None: |
| base_kobest_accs.append(base_a) |
| if sft_a is not None: |
| sft_kobest_accs.append(sft_a) |
| if orpo_a is not None: |
| orpo_kobest_accs.append(orpo_a) |
|
|
| diff = "" |
| if orpo_a is not None and base_a is not None: |
| d = (orpo_a - base_a) * 100 |
| diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
|
|
| lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | {diff} |") |
|
|
| |
| base_kavg = sum(base_kobest_accs) / len(base_kobest_accs) if base_kobest_accs else None |
| sft_kavg = sum(sft_kobest_accs) / len(sft_kobest_accs) if sft_kobest_accs else None |
| orpo_kavg = sum(orpo_kobest_accs) / len(orpo_kobest_accs) if orpo_kobest_accs else None |
| avg_diff = "" |
| if orpo_kavg is not None and base_kavg is not None: |
| d = (orpo_kavg - base_kavg) * 100 |
| avg_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| lines.append(f"| **ํ๊ท ** | **{_fmt_pct(base_kavg)}** | **{_fmt_pct(sft_kavg)}** | " |
| f"**{_fmt_pct(orpo_kavg)}** | **{avg_diff}** |") |
| lines.append("") |
|
|
| |
| lines.append("### HAE-RAE (0-shot)\n") |
| base_haerae = _get_acc(base_zero.get("haerae", {})) if "haerae" in base_zero else _BASE_BENCH_REFERENCE.get("haerae") |
| sft_haerae = _get_acc(sft_zero.get("haerae", {})) if "haerae" in sft_zero else None |
| orpo_haerae = _get_acc(orpo_zero.get("haerae", {})) if "haerae" in orpo_zero else None |
| lines.append(f"- Base: {_fmt_pct(base_haerae)} โ SFT: {_fmt_pct(sft_haerae)} โ ORPO: {_fmt_pct(orpo_haerae)}") |
| lines.append("") |
|
|
| |
| lines.append("### MMLU-KO (0-shot)\n") |
| base_mmlu_ko = _get_acc(base_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in base_zero else _BASE_BENCH_REFERENCE.get("global_mmlu_ko") |
| sft_mmlu_ko = _get_acc(sft_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in sft_zero else None |
| orpo_mmlu_ko = _get_acc(orpo_zero.get("global_mmlu_ko", {})) if "global_mmlu_ko" in orpo_zero else None |
| lines.append(f"- Base: {_fmt_pct(base_mmlu_ko)} โ SFT: {_fmt_pct(sft_mmlu_ko)} โ ORPO: {_fmt_pct(orpo_mmlu_ko)}") |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 6. ์์ด ๋ฒค์น๋งํฌ\n") |
| lines.append("| ํ์คํฌ | Base | SFT | ORPO | BaseโORPO |") |
| lines.append("|--------|------|-----|------|-----------|") |
|
|
| en_tasks_list = ["hellaswag", "arc_easy", "arc_challenge", "winogrande", "piqa"] |
| for t in en_tasks_list: |
| prefer_norm = t in ["hellaswag", "arc_challenge"] |
| base_a = _get_acc(base_zero.get(t, {}), prefer_norm=prefer_norm) if t in base_zero else _BASE_BENCH_REFERENCE.get(t) |
| sft_a = _get_acc(sft_zero.get(t, {}), prefer_norm=prefer_norm) if t in sft_zero else None |
| orpo_a = _get_acc(orpo_zero.get(t, {}), prefer_norm=prefer_norm) if t in orpo_zero else None |
|
|
| diff = "" |
| if orpo_a is not None and base_a is not None: |
| d = (orpo_a - base_a) * 100 |
| diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| lines.append(f"| {t} | {_fmt_pct(base_a)} | {_fmt_pct(sft_a)} | {_fmt_pct(orpo_a)} | {diff} |") |
|
|
| |
| _MMLU_EN_GROUPS = {"mmlu", "mmlu_humanities", "mmlu_social_sciences", "mmlu_stem", "mmlu_other"} |
|
|
| def _mmlu_en_avg(zero: dict) -> Optional[float]: |
| accs = [] |
| for t, m in zero.items(): |
| if (t.startswith("mmlu_") or t == "mmlu") and t not in _MMLU_EN_GROUPS: |
| a = _get_acc(m) |
| if a is not None: |
| accs.append(a) |
| if not accs: |
| for t in _MMLU_EN_GROUPS: |
| if t in zero: |
| a = _get_acc(zero[t]) |
| if a is not None: |
| accs.append(a) |
| return sum(accs) / len(accs) if accs else None |
|
|
| base_mmlu_en = _mmlu_en_avg(base_zero) |
| sft_mmlu_en = _mmlu_en_avg(sft_zero) |
| orpo_mmlu_en = _mmlu_en_avg(orpo_zero) |
|
|
| mmlu_en_diff = "" |
| if orpo_mmlu_en is not None and base_mmlu_en is not None: |
| d = (orpo_mmlu_en - base_mmlu_en) * 100 |
| mmlu_en_diff = f"{'+' if d >= 0 else ''}{d:.1f}pp" |
| lines.append(f"| MMLU-EN ํ๊ท | {_fmt_pct(base_mmlu_en)} | {_fmt_pct(sft_mmlu_en)} | " |
| f"{_fmt_pct(orpo_mmlu_en)} | {mmlu_en_diff} |") |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 7. Calibration ๋น๊ต\n") |
| lines.append("| ์งํ | Base | SFT | ORPO |") |
| lines.append("|------|------|-----|------|") |
|
|
| base_cal = base_p1.get("calibration", {}) |
| sft_cal = sft_p1.get("calibration", {}) |
| orpo_cal = orpo_p1.get("calibration", {}) |
|
|
| cal_metrics = [ |
| ("top1_accuracy", "Top-1 Accuracy"), |
| ("top5_accuracy", "Top-5 Accuracy"), |
| ("top10_accuracy", "Top-10 Accuracy"), |
| ] |
| for key, label in cal_metrics: |
| base_v = base_cal.get(key, _BASE_CALIB_REFERENCE.get(key)) |
| sft_v = sft_cal.get(key) |
| orpo_v = orpo_cal.get(key) |
| lines.append(f"| {label} | {_fmt_f(base_v)} | {_fmt_f(sft_v)} | {_fmt_f(orpo_v)} |") |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 8. ORPO ๊ณ ์ ์งํ\n") |
|
|
| |
| if training_curve and training_curve.get("eval_steps"): |
| last_step = training_curve["eval_steps"][-1] |
| final_pref = last_step.get("rewards_accuracies", last_step.get("preference_accuracy")) |
| final_margin = last_step.get("rewards_margins", last_step.get("reward_margins")) |
| if final_pref is not None: |
| lines.append(f"- **์ต์ข
Preference Accuracy**: {final_pref:.2%}") |
| if final_margin is not None: |
| lines.append(f"- **์ต์ข
Reward Margins**: {final_margin:.4f}") |
| else: |
| lines.append("- Preference Accuracy / Reward Margins: ๋ฐ์ดํฐ ์์") |
|
|
| |
| rep_grid = orpo_p1.get("repetition", {}).get("grid_results") |
| if rep_grid: |
| items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values()) |
| for r in items: |
| if isinstance(r, dict): |
| rp = r.get("repetition_penalty", r.get("rep_penalty")) |
| if rp is not None and abs(float(rp) - 1.0) < 1e-6: |
| rep_val = r.get("avg_3gram_rep", r.get("3gram_repetition")) |
| if rep_val is not None: |
| verdict = "PASS" if rep_val < 0.05 else "FAIL" |
| lines.append(f"- **Parameter Sensitivity**: rep_penalty=1.0 โ 3-gram rep={rep_val:.2%} " |
| f"(๋ชฉํ < 5%) โ {verdict}") |
| break |
| lines.append("") |
|
|
| |
| |
| |
| lines.append("## 9. ๋ฐ๋ณต๋ฅ ๊ทธ๋ฆฌ๋ ์์น\n") |
| if rep_grid: |
| items = rep_grid if isinstance(rep_grid, list) else list(rep_grid.values()) |
| rep_rows = [] |
| for r in items: |
| if isinstance(r, dict): |
| rep_rows.append({ |
| "config": r.get("params", "?"), |
| "temp": r.get("temperature"), |
| "rep_pen": r.get("repetition_penalty"), |
| "3gram": r.get("avg_3gram_rep", r.get("3gram_repetition", float("inf"))), |
| "4gram": r.get("avg_4gram_rep", r.get("4gram_repetition")), |
| "eos_rate": r.get("eos_rate"), |
| "avg_tokens": r.get("avg_tokens"), |
| }) |
| rep_rows.sort(key=lambda x: x["3gram"] if isinstance(x["3gram"], (int, float)) else float("inf")) |
|
|
| lines.append("| ์ค์ | Temp | Rep Pen | 3-gram | 4-gram | EOS Rate | Avg Tokens |") |
| lines.append("|------|------|---------|--------|--------|----------|-----------|") |
| for i, r in enumerate(rep_rows): |
| marker = " **โ best**" if i == 0 else "" |
| lines.append( |
| f"| {r['config']} | {_fmt_f(r['temp'], 2)} | {_fmt_f(r['rep_pen'], 2)} | " |
| f"{_fmt_f(r['3gram'])} | {_fmt_f(r['4gram'])} | " |
| f"{_fmt_f(r['eos_rate'])} | {_fmt_f(r['avg_tokens'], 1)} |{marker}" |
| ) |
| lines.append("") |
| else: |
| lines.append("๋ฐ๋ณต๋ฅ ๊ทธ๋ฆฌ๋ ์์น ๋ฐ์ดํฐ ์์\n") |
|
|
| |
| |
| |
| lines.append("## 10. ์์ฑ ์ํ\n") |
| orpo_gen = orpo_p1.get("generation", {}) |
| orpo_samples = orpo_gen.get("samples", []) |
| greedy_samples = [s for s in orpo_samples if isinstance(s, dict) and s.get("temperature", 1.0) == 0.0] |
| if not greedy_samples: |
| greedy_samples = orpo_samples |
|
|
| if greedy_samples: |
| lines.append("### ORPO Greedy ์์ฑ ์ํ\n") |
| for i, s in enumerate(greedy_samples[:15], 1): |
| if isinstance(s, dict): |
| prompt = s.get("prompt", "") |
| text = s.get("text", s.get("generated_text", "")) |
| if len(text) > 500: |
| text = text[:500] + "..." |
| hit_eos = s.get("hit_eos", "?") |
| rep3 = s.get("3gram_rep", s.get("avg_3gram_rep")) |
| tokens = s.get("generated_tokens", s.get("num_tokens", "?")) |
| lines.append(f"**[{i}]** `{prompt}`") |
| lines.append(f"> {text}") |
| meta_parts = [f"EOS={hit_eos}"] |
| if rep3 is not None: |
| meta_parts.append(f"3gram_rep={rep3:.2%}") |
| meta_parts.append(f"tokens={tokens}") |
| lines.append(f"> *{', '.join(meta_parts)}*\n") |
| else: |
| lines.append("์์ฑ ์ํ ๋ฐ์ดํฐ ์์\n") |
|
|
| |
| |
| |
| lines.append("## 11. ์ต์ข
ํ์ \n") |
| lines.append("### ๋ฐฐํฌ ๊ธฐ์ค ์ถฉ์กฑ ์ฌ๋ถ\n") |
| lines.append("| ์กฐ๊ฑด | ๊ธฐ์ค | ํ์ฌ ๊ฐ | ์ถฉ์กฑ |") |
| lines.append("|------|------|---------|------|") |
|
|
| criteria = [ |
| ("Greedy 3-gram ๋ฐ๋ณต๋ฅ ", "< 5%", _fmt_pct(orpo_rep), |
| "YES" if orpo_rep is not None and orpo_rep < 0.05 else "NO"), |
| ("EOS ์ข
๋ฃ์จ", "> 90%", _fmt_pct(orpo_eos), |
| "YES" if orpo_eos is not None and orpo_eos > 0.90 else "NO"), |
| ("PPL Forgetting", "< 5%", f"{orpo_forgetting:.1f}%" if orpo_forgetting is not None else "N/A", |
| "YES" if orpo_forgetting is not None and orpo_forgetting < 5.0 else "NO"), |
| ("KoBEST ํ๊ท ", ">= 43%", _fmt_pct(orpo_kobest), |
| "YES" if orpo_kobest is not None and orpo_kobest >= 0.43 else "NO"), |
| ] |
| for cond, threshold, current, met in criteria: |
| lines.append(f"| {cond} | {threshold} | {current} | {met} |") |
| lines.append("") |
|
|
| if deploy_criteria_met: |
| lines.append("**โ ๋ชจ๋ ๋ฐฐํฌ ๊ธฐ์ค ์ถฉ์กฑ: DEPLOY (Phase 4: GGUF ๋ณํ + Ollama ๋ฐฐํฌ ์งํ)**\n") |
| else: |
| lines.append("**โ ๋ฐฐํฌ ๊ธฐ์ค ๋ฏธ๋ฌ: RETRY (ORPO ์ฌํ์ต ๋๋ ํ์ดํผํ๋ผ๋ฏธํฐ ์กฐ์ ํ์)**\n") |
|
|
| lines.append("---\n") |
| lines.append("*์ด ๋ณด๊ณ ์๋ `eval/report_generator.py::generate_three_way_report()`์ ์ํด ์๋ ์์ฑ๋์์ต๋๋ค.*") |
|
|
| report_text = "\n".join(lines) |
| output_path.write_text(report_text, encoding="utf-8") |
|
|
| |
| if orpo_output_dir: |
| orpo_output_dir = Path(orpo_output_dir) |
| orpo_output_dir.mkdir(parents=True, exist_ok=True) |
| (orpo_output_dir / "orpo_three_way_report.md").write_text(report_text, encoding="utf-8") |
|
|
| return output_path |
|
|
|
|
if __name__ == "__main__":
    # Library module: running it directly only prints a usage pointer to the
    # pipeline entry points that import this generator.
    print("report_generator.py โ use via full_eval_pipeline.py or sft_eval_pipeline.py")
|
|