import argparse
import glob
import json
import os
import re
from typing import List, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Experimental grid: 3 training entropy levels (HNO1..HNO3) x 3 label-context variants.
CONFIG_META = {
    "A": {"hno": "HNO3", "variant": "0-shot"},
    "B": {"hno": "HNO3", "variant": "CoT"},
    "C": {"hno": "HNO3", "variant": "Fake CoT"},
    "D": {"hno": "HNO2", "variant": "0-shot"},
    "E": {"hno": "HNO2", "variant": "CoT"},
    "F": {"hno": "HNO2", "variant": "Fake CoT"},
    "G": {"hno": "HNO1", "variant": "0-shot"},
    "H": {"hno": "HNO1", "variant": "CoT"},
    "I": {"hno": "HNO1", "variant": "Fake CoT"},
}
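
# Expected on-disk layout (a sketch: directory names come from --base_dir/--configs;
# the file names below are illustrative, not confirmed):
#   <base_dir>/
#     A/
#       run_results.json         -> Original
#       run_P1_results.json      -> Paraphrase P1
#       run_R2_results.json      -> Reverse R2
#       ...
#     B/ ... I/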

EVAL_TYPE_ORDER = [
    "Original",
    "Paraphrase P1",
    "Paraphrase P2",
    "Paraphrase P3",
    "Paraphrase P4",
    "Paraphrase P5",
    "Reverse R1",
    "Reverse R2",
    "Reverse R3",
    "Aggregate A1",
    "Aggregate A2",
    "Aggregate A3",
    "Aggregate A4",
]

# Filename suffixes identify the eval set; files without a suffix are treated as "Original".
RE_P = re.compile(r"_P([1-5])(?:\.json|_results\.json)$")
RE_R = re.compile(r"_R([1-3])(?:\.json|_results\.json)$")
RE_A = re.compile(r"_A([1-4])(?:\.json|_results\.json)$")


def infer_eval_type_from_filename(fn: str) -> str:
    base = os.path.basename(fn)
    m = RE_P.search(base)
    if m:
        return f"Paraphrase P{m.group(1)}"
    m = RE_R.search(base)
    if m:
        return f"Reverse R{m.group(1)}"
    m = RE_A.search(base)
    if m:
        return f"Aggregate A{m.group(1)}"
    # No suffix matched: treat as the original (untransformed) eval set.
    return "Original"
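
# Example mappings (hypothetical filenames, for illustration only):
#   infer_eval_type_from_filename("run_P3_results.json") -> "Paraphrase P3"
#   infer_eval_type_from_filename("run_A4.json")         -> "Aggregate A4"
#   infer_eval_type_from_filename("run_results.json")    -> "Original"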


def safe_read_json(path: str):
    # Return parsed JSON, or None on any read/parse error (bad runs are skipped).
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None


def list_result_files(config_dir: str) -> List[str]:
    # One results file per eval set: "<name>_results.json".
    return sorted(glob.glob(os.path.join(config_dir, "*_results.json")))


def extract_steps_from_one_entry(entry: dict) -> List[int]:
    # Collect checkpoint steps from keys of the form "step_<n>".
    steps = []
    for k in entry.keys():
        if k.startswith("step_"):
            try:
                steps.append(int(k.split("_", 1)[1]))
            except ValueError:
                pass
    return sorted(set(steps))


def summarize_results_file(path: str) -> Optional[pd.DataFrame]:
    """
    Return a dataframe with columns: step, accuracy_mean, n,
    computed from entry["step_<s>"]["accuracy"] across all entries.
    """
    data = safe_read_json(path)
    if not isinstance(data, list) or len(data) == 0:
        return None

    steps = extract_steps_from_one_entry(data[0])
    if not steps:
        # The first entry may be malformed; scan a few more before giving up.
        for e in data[:50]:
            steps = extract_steps_from_one_entry(e)
            if steps:
                break
    if not steps:
        return None

    rows = []
    for s in steps:
        k = f"step_{s}"
        accs = []
        for e in data:
            v = e.get(k) or {}
            a = v.get("accuracy", None)
            if isinstance(a, (int, float)):
                accs.append(float(a))
        if len(accs) == 0:
            continue
        rows.append(
            {
                "step": s,
                "accuracy_mean": float(np.mean(accs)),
                "n": int(len(accs)),
            }
        )

    if not rows:
        return None
    return pd.DataFrame(rows).sort_values("step").reset_index(drop=True)
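
# Assumed results-file schema (a minimal sketch; the step values are illustrative,
# and only "step_<n>" / "accuracy" are relied on -- other fields are ignored):
#   [
#     {"step_500": {"accuracy": 0.42, ...}, "step_1000": {"accuracy": 0.55, ...}},
#     {"step_500": {"accuracy": 0.38, ...}, "step_1000": {"accuracy": 0.60, ...}},
#     ...
#   ]
# i.e. a JSON list with one object per evaluated unit (seed or item); the script
# averages "accuracy" over them per checkpoint.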


def build_long_dataframe(base_dir: str, configs: List[str]) -> pd.DataFrame:
    """
    Build a long-form dataframe with columns:
        config, hno, variant, eval_file, eval_type, step, accuracy, n
    """
    all_rows = []

    for cfg in configs:
        config_dir = os.path.join(base_dir, cfg)
        if not os.path.isdir(config_dir):
            continue

        meta = CONFIG_META.get(cfg, {"hno": "UNKNOWN", "variant": "UNKNOWN"})
        files = list_result_files(config_dir)

        for fpath in files:
            eval_type = infer_eval_type_from_filename(fpath)
            summary = summarize_results_file(fpath)
            if summary is None:
                continue

            eval_file = os.path.basename(fpath).replace("_results.json", ".json")

            for _, r in summary.iterrows():
                all_rows.append(
                    {
                        "config": cfg,
                        "hno": meta["hno"],
                        "variant": meta["variant"],
                        "eval_file": eval_file,
                        "eval_type": eval_type,
                        "step": int(r["step"]),
                        "accuracy": float(r["accuracy_mean"]),
                        "n": int(r["n"]),
                    }
                )

    df = pd.DataFrame(all_rows)
    if df.empty:
        return df

    # Order eval types categorically so plots and pivots sort sensibly.
    df["eval_type"] = pd.Categorical(df["eval_type"], categories=EVAL_TYPE_ORDER, ordered=True)

    return df.sort_values(["hno", "variant", "config", "eval_type", "step"]).reset_index(drop=True)


def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def save_fig(fig: plt.Figure, out_dir: str, name: str) -> None:
    # Save each figure as both PNG (quick viewing) and PDF (report quality).
    ensure_dir(out_dir)
    png = os.path.join(out_dir, f"{name}.png")
    pdf = os.path.join(out_dir, f"{name}.pdf")
    fig.savefig(png, dpi=200, bbox_inches="tight")
    fig.savefig(pdf, bbox_inches="tight")
    plt.close(fig)


def pick_final_step(df: pd.DataFrame) -> int:
    # Prefer step 10000 when present; otherwise fall back to the last checkpoint.
    steps = sorted(df["step"].unique().tolist())
    if not steps:
        return 0
    if 10000 in steps:
        return 10000
    return steps[-1]


# ---------------------------------------------------------------------------
# Figures
# ---------------------------------------------------------------------------


def fig_scaling_curves_overall(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q4: Accuracy vs step (scaling) for each config, averaged over all eval files/types.
    """
    if df.empty:
        return

    # Mean accuracy per (config, step) over every eval set.
    g = (
        df.groupby(["config", "hno", "variant", "step"], as_index=False)["accuracy"]
        .mean()
        .rename(columns={"accuracy": "acc_mean_overall"})
    )

    # One figure per entropy level, one line per config.
    for hno in ["HNO1", "HNO2", "HNO3"]:
        gh = g[g["hno"] == hno].copy()
        if gh.empty:
            continue

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        for cfg, sub in gh.groupby("config"):
            sub = sub.sort_values("step")
            ax.plot(
                sub["step"],
                sub["acc_mean_overall"],
                marker="o",
                linewidth=1,
                label=f"{cfg} ({CONFIG_META[cfg]['variant']})",
            )

        ax.set_title(f"Scaling (Accuracy vs Steps) — {hno} — Mean over all eval sets")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=8)

        save_fig(fig, out_dir, f"Q4_scaling_curves_overall_{hno}")


def fig_scaling_curves_by_eval_type(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q2/Q4: Accuracy vs step, separated by eval_type (hardness differences show up as gaps).
    Produces one figure per config (may be many, but comprehensive).
    """
    if df.empty:
        return

    for cfg in sorted(df["config"].unique().tolist()):
        sub = df[df["config"] == cfg].copy()
        if sub.empty:
            continue

        fig = plt.figure(figsize=(10, 6))
        ax = fig.add_subplot(1, 1, 1)

        for et, etdf in sub.groupby("eval_type"):
            etdf = etdf.groupby("step", as_index=False)["accuracy"].mean().sort_values("step")
            if etdf.empty:
                # Categorical groupby yields empty groups for unobserved eval types;
                # skip them so the legend only lists eval sets with data.
                continue
            ax.plot(etdf["step"], etdf["accuracy"], marker="o", linewidth=1, label=str(et))

        meta = CONFIG_META.get(cfg, {})
        ax.set_title(f"Scaling by Eval Set — Config {cfg} ({meta.get('hno', '?')}, {meta.get('variant', '?')})")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=8, ncol=2)

        save_fig(fig, out_dir, f"Q2Q4_scaling_by_evaltype_config_{cfg}")


def fig_entropy_effect_final(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q1: Compare HNO1 vs HNO2 vs HNO3 at the final step, controlling for variant
    (0-shot / CoT / Fake CoT). Two views are plotted:
      - final accuracy on the Original eval only
      - final accuracy averaged over all eval types
    """
    if df.empty:
        return

    final_step = pick_final_step(df)
    hnos = ["HNO1", "HNO2", "HNO3"]
    variants = ["0-shot", "CoT", "Fake CoT"]

    # View 1: Original eval only.
    d1 = df[(df["step"] == final_step) & (df["eval_type"] == "Original")].copy()
    if not d1.empty:
        g1 = d1.groupby(["hno", "variant"], as_index=False)["accuracy"].mean()

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        x = np.arange(len(hnos))
        width = 0.25

        # Grouped bars: one cluster per HNO level, one bar per variant.
        for j, v in enumerate(variants):
            vals = []
            for h in hnos:
                m = g1[(g1["hno"] == h) & (g1["variant"] == v)]
                vals.append(float(m["accuracy"].iloc[0]) if len(m) else np.nan)
            ax.bar(x + (j - 1) * width, vals, width=width, label=v)

        ax.set_title(f"Q1 Entropy Effect — Final step={final_step} — Original eval only")
        ax.set_xlabel("Training entropy level (HNO)")
        ax.set_ylabel("Accuracy")
        ax.set_xticks(x)
        ax.set_xticklabels(hnos)
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, axis="y", linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=9)

        save_fig(fig, out_dir, f"Q1_entropy_effect_finalstep_{final_step}_original")

    # View 2: mean over all eval sets (averaged per config first, then per hno/variant).
    d2 = df[df["step"] == final_step].copy()
    if not d2.empty:
        g2 = d2.groupby(["config", "hno", "variant"], as_index=False)["accuracy"].mean()
        g2 = g2.groupby(["hno", "variant"], as_index=False)["accuracy"].mean()

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        x = np.arange(len(hnos))
        width = 0.25

        for j, v in enumerate(variants):
            vals = []
            for h in hnos:
                m = g2[(g2["hno"] == h) & (g2["variant"] == v)]
                vals.append(float(m["accuracy"].iloc[0]) if len(m) else np.nan)
            ax.bar(x + (j - 1) * width, vals, width=width, label=v)

        ax.set_title(f"Q1 Entropy Effect — Final step={final_step} — Mean over all eval sets")
        ax.set_xlabel("Training entropy level (HNO)")
        ax.set_ylabel("Accuracy")
        ax.set_xticks(x)
        ax.set_xticklabels(hnos)
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, axis="y", linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=9)

        save_fig(fig, out_dir, f"Q1_entropy_effect_finalstep_{final_step}_overall")


def fig_label_structure_effect(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q3: Compare 0-shot vs CoT vs Fake CoT within each HNO level across steps.
    Uses the mean over eval types to avoid 12-line clutter.
    """
    if df.empty:
        return

    g = df.groupby(["hno", "variant", "step"], as_index=False)["accuracy"].mean()

    for hno in ["HNO1", "HNO2", "HNO3"]:
        sub = g[g["hno"] == hno].copy()
        if sub.empty:
            continue

        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)

        for v, vdf in sub.groupby("variant"):
            vdf = vdf.sort_values("step")
            ax.plot(vdf["step"], vdf["accuracy"], marker="o", linewidth=1, label=v)

        ax.set_title(f"Q3 Label Context Structure — {hno} — Mean over all eval sets")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=9)

        save_fig(fig, out_dir, f"Q3_label_structure_over_steps_{hno}")


def fig_eval_hardness_final(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q2: "Hardness" by eval set type at the final step:
      - averaged across all configs (global hardness)
      - also per HNO level (since the training data differs)
    """
    if df.empty:
        return

    final_step = pick_final_step(df)
    d = df[df["step"] == final_step].copy()
    if d.empty:
        return

    # Global hardness: mean accuracy per eval type across every config.
    g_all = d.groupby(["eval_type"], as_index=False)["accuracy"].mean()
    g_all = g_all.sort_values("eval_type")

    fig = plt.figure(figsize=(11, 5))
    ax = fig.add_subplot(1, 1, 1)
    x = np.arange(len(g_all))
    ax.bar(x, g_all["accuracy"].to_numpy())
    ax.set_title(f"Q2 Eval Hardness — Final step={final_step} — Mean across ALL configs")
    ax.set_xlabel("Eval set type")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0.0, 1.0)
    ax.set_xticks(x)
    ax.set_xticklabels([str(v) for v in g_all["eval_type"].tolist()], rotation=35, ha="right")
    ax.grid(True, axis="y", linewidth=0.5, alpha=0.5)

    save_fig(fig, out_dir, f"Q2_eval_hardness_finalstep_{final_step}_allconfigs")

    # Per-HNO hardness: same bars, restricted to one entropy level at a time.
    for hno in ["HNO1", "HNO2", "HNO3"]:
        dh = d[d["hno"] == hno].copy()
        if dh.empty:
            continue
        gh = dh.groupby(["eval_type"], as_index=False)["accuracy"].mean().sort_values("eval_type")

        fig = plt.figure(figsize=(11, 5))
        ax = fig.add_subplot(1, 1, 1)
        x = np.arange(len(gh))
        ax.bar(x, gh["accuracy"].to_numpy())
        ax.set_title(f"Q2 Eval Hardness — {hno} — Final step={final_step} — Mean across configs")
        ax.set_xlabel("Eval set type")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.set_xticks(x)
        ax.set_xticklabels([str(v) for v in gh["eval_type"].tolist()], rotation=35, ha="right")
        ax.grid(True, axis="y", linewidth=0.5, alpha=0.5)

        save_fig(fig, out_dir, f"Q2_eval_hardness_finalstep_{final_step}_{hno}")


def fig_training_accuracy_proxy(df: pd.DataFrame, out_dir: str) -> None:
    """
    Treats the "Original" trimmed eval (drawn from the train distribution) as a
    proxy for learning/training accuracy: plots it vs steps for each config, and
    also aggregated per HNO/variant. (If a true train-set eval exists elsewhere,
    point the script at those results in the same way.)
    """
    if df.empty:
        return

    d = df[df["eval_type"] == "Original"].copy()
    if d.empty:
        return

    # Per-config curves.
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(1, 1, 1)
    for cfg, sub in d.groupby("config"):
        sub = sub.groupby("step", as_index=False)["accuracy"].mean().sort_values("step")
        ax.plot(sub["step"], sub["accuracy"], marker="o", linewidth=1, label=cfg)
    ax.set_title("Training-Accuracy Proxy — Original eval only — per config")
    ax.set_xlabel("Training step (checkpoint)")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0.0, 1.0)
    ax.grid(True, linewidth=0.5, alpha=0.5)
    ax.legend(loc="lower right", fontsize=8, ncol=3)
    save_fig(fig, out_dir, "Training_accuracy_proxy_original_per_config")

    # Aggregated per HNO level, one line per variant.
    g = d.groupby(["hno", "variant", "step"], as_index=False)["accuracy"].mean()
    for hno in ["HNO1", "HNO2", "HNO3"]:
        sub = g[g["hno"] == hno].copy()
        if sub.empty:
            continue
        fig = plt.figure()
        ax = fig.add_subplot(1, 1, 1)
        for v, vdf in sub.groupby("variant"):
            vdf = vdf.sort_values("step")
            ax.plot(vdf["step"], vdf["accuracy"], marker="o", linewidth=1, label=v)
        ax.set_title(f"Training-Accuracy Proxy — {hno} — Original eval only")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=9)
        save_fig(fig, out_dir, f"Training_accuracy_proxy_original_{hno}")


def export_summary_tables(df: pd.DataFrame, out_dir: str) -> None:
    """
    Save a couple of CSVs that are useful for the report:
      - the long dataframe
      - final-step pivot tables
    """
    if df.empty:
        return
    ensure_dir(out_dir)

    long_csv = os.path.join(out_dir, "summary_long.csv")
    df.to_csv(long_csv, index=False)

    final_step = pick_final_step(df)
    d = df[df["step"] == final_step].copy()

    # Pivot 1: one row per config, one column per eval type.
    pivot1 = (
        d.groupby(["config", "hno", "variant", "eval_type"], as_index=False)["accuracy"]
        .mean()
        .pivot_table(index=["config", "hno", "variant"], columns="eval_type", values="accuracy", aggfunc="mean")
    )
    pivot1.to_csv(os.path.join(out_dir, f"finalstep_{final_step}_pivot_config_by_evaltype.csv"))

    # Pivot 2: collapsed over configs, one row per (hno, variant).
    pivot2 = (
        d.groupby(["hno", "variant", "eval_type"], as_index=False)["accuracy"]
        .mean()
        .pivot_table(index=["hno", "variant"], columns="eval_type", values="accuracy", aggfunc="mean")
    )
    pivot2.to_csv(os.path.join(out_dir, f"finalstep_{final_step}_pivot_hno_variant_by_evaltype.csv"))
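
# Shape of pivot1 (schematic; the 0.xx values are placeholders, not real results):
#                          Original  Paraphrase P1  ...  Aggregate A4
#   config hno  variant
#   A      HNO3 0-shot         0.xx           0.xx  ...          0.xx
#   B      HNO3 CoT            0.xx           0.xx  ...          0.xx
# pivot2 is the same layout with the "config" index level collapsed away.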


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--base_dir", type=str, default="/workspace/v121rc_exp1", help="Base exp dir containing A..I")
    ap.add_argument("--out_dir", type=str, default="/workspace/v121rc_exp1/FIGS", help="Where to save figures")
    ap.add_argument(
        "--configs",
        type=str,
        default="ABCDEFGHI",
        help="Which configs to include, e.g. ABC or ABCDEFGHI",
    )
    args = ap.parse_args()

    configs = [c for c in args.configs if c in CONFIG_META]
    if not configs:
        raise SystemExit("No valid configs selected. Use --configs like ABCDEFGHI.")

    df = build_long_dataframe(args.base_dir, configs)
    if df.empty:
        raise SystemExit(
            f"No results found under {args.base_dir}. Check that the config subdirs "
            "(A..I) contain '*_results.json' files with 'step_<n>' fields."
        )

    ensure_dir(args.out_dir)

    # CSV summaries first, so they exist even if a plotting step fails.
    export_summary_tables(df, args.out_dir)

    # Core figures (Q1-Q4).
    fig_training_accuracy_proxy(df, args.out_dir)
    fig_scaling_curves_overall(df, args.out_dir)
    fig_label_structure_effect(df, args.out_dir)
    fig_entropy_effect_final(df, args.out_dir)
    fig_eval_hardness_final(df, args.out_dir)

    # Per-config breakdowns (many figures).
    fig_scaling_curves_by_eval_type(df, args.out_dir)

    print(f"Done. Figures + CSV summaries saved to: {args.out_dir}")


if __name__ == "__main__":
    main()