| """Aggregate per-seed metrics.json into paper-style result tables (mean±SD). |
| |
| Scans results/<exp_name>/**/seed*/metrics.json, groups by (dataset, protocol, arch), |
| reports mean±SD over seeds (over folds for CV datasets). Emits: |
| - summary.csv : full per-(dataset,method) detail, every metric (raw data export) |
| - summary.md : the main Dice table, methods×datasets (quick read) |
| - summary.tex : the main Dice table as booktabs LaTeX (paper-ready) |
| - summary.html : full paper-style report (main tables, per-class, significance, setup) |
| |
| python framework/report/aggregate.py --exp_name baselines [--out_root results] |
| """ |
| from __future__ import annotations |
|
|
| import os |
| import json |
| import glob |
| import argparse |
| import warnings |
| from collections import defaultdict |
|
|
| import numpy as np |
|
|
| |
| |
# np.nanmean emits a RuntimeWarning with this message when it is handed nothing
# but NaNs; helpers in this module call it on data that may be entirely missing,
# so the warning is muted process-wide.
warnings.filterwarnings("ignore", message="Mean of empty slice")

# One tuple per reported metric.
#   [0] key prefix: values are read from / written to "<key>_mean" and "<key>_sd"
#   [1] short display label
#   [2], [3] presumably "render as a percentage" and "higher is better"
#       -- TODO confirm; this module only ever unpacks field [0].
METRICS = [
    ("dice", "Dice", True, True),
    ("iou", "IoU", True, True),
    ("hd95", "HD95", False, False),
    ("assd", "ASSD", False, False),
    ("sensitivity", "Sens", True, True),
    ("specificity", "Spec", True, True),
    ("precision", "Prec", True, True),
]
|
|
|
|
def load_runs(out_root, exp_name):
    """Load every per-seed ``metrics.json`` found under ``<out_root>/<exp_name>``.

    The tree is searched recursively for ``**/seed*/metrics.json``. Files are
    visited in sorted path order so the result does not depend on the order in
    which the filesystem happens to enumerate them. A file that cannot be read
    or parsed, or whose top-level JSON value is not an object, is skipped with a
    warning rather than being dropped silently.

    Args:
        out_root: Root directory holding all experiment outputs.
        exp_name: Experiment sub-directory to scan.

    Returns:
        A list with one decoded dict per usable ``metrics.json`` (possibly empty).
    """
    pattern = os.path.join(out_root, exp_name, "**", "seed*", "metrics.json")
    runs = []
    for path in sorted(glob.glob(pattern, recursive=True)):
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as err:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            warnings.warn(f"skipping unreadable metrics file {path}: {err}", stacklevel=2)
            continue
        if not isinstance(data, dict):
            # Every consumer calls .get() on a run, so anything else is unusable.
            warnings.warn(f"skipping {path}: top-level JSON value is not an object", stacklevel=2)
            continue
        runs.append(data)
    return runs
|
|
|
|
# Display labels for specific (dataset, protocol) pairs. Pairs that are not
# listed here are reported under their raw protocol string.
_PROTO_LABEL = {
    ("idridd_segmentation", "fold01"): "official",
    ("busi", "fold01"): "single-split",
    ("medsegdb_kits19", "fold01"): "single-split",
    ("pannuke_semantic", "fold01"): "single-split",
}

# Datasets whose protocols are treated as cross-validation folds when more than
# one protocol is present.
_CV_DATASETS = {"pannuke_semantic"}
|
|
|
|
def _proto_label(dataset, protocol):
    """Return the display label for *protocol* on *dataset*.

    Looks the pair up in ``_PROTO_LABEL`` and falls back to *protocol* itself
    when there is no entry.
    """
    try:
        return _PROTO_LABEL[(dataset, protocol)]
    except KeyError:
        return protocol
|
|
|
|
def _agg_over(items, key):
    """Return ``(mean, sd)`` of ``metrics["<key>_mean"]`` across *items*.

    Runs that lack the value (or hold NaN) are left out. The SD is NumPy's
    default population standard deviation. When nothing remains, both results
    are NaN.
    """
    column = f"{key}_mean"
    raw = [run.get("metrics", {}).get(column, np.nan) for run in items]
    values = np.array(raw, np.float64)
    kept = values[~np.isnan(values)]
    if not kept.size:
        return (float("nan"), float("nan"))
    return (float(kept.mean()), float(kept.std()))
|
|
|
|
def summarize(runs):
    """Collapse raw runs into one summary row per (dataset, arch).

    Runs are grouped by ``(dataset, arch)`` and, inside each group, by
    ``protocol``.

    * For a dataset in ``_CV_DATASETS`` with more than one protocol, every
      protocol is treated as a fold: each metric is first averaged within a
      fold, then the mean and SD are taken over the per-fold means.
      ``n_seeds`` holds the number of folds.
    * Otherwise only the first protocol (in sorted order) is used and the mean
      and SD are taken over its runs. ``n_seeds`` holds the number of runs.

    Args:
        runs: Iterable of decoded ``metrics.json`` dicts.

    Returns:
        A list of row dicts with keys ``dataset``, ``arch``, ``protocol``,
        ``n_seeds`` and ``<key>_mean`` / ``<key>_sd`` for every entry of
        ``METRICS``, ordered by (dataset, arch).
    """
    by_da = defaultdict(lambda: defaultdict(list))
    for d in runs:
        by_da[(d.get("dataset"), d.get("arch"))][d.get("protocol")].append(d)

    def _order(entry):
        # Sort on the (dataset, arch) key only, mapping None to "" so that a run
        # missing one of these fields cannot make the comparison raise TypeError.
        return tuple("" if part is None else str(part) for part in entry[0])

    rows = []
    for (dataset, arch), proto_map in sorted(by_da.items(), key=_order):
        protos = sorted(p for p in proto_map if p is not None)
        row = {"dataset": dataset, "arch": arch}
        if dataset in _CV_DATASETS and len(protos) > 1:
            row["protocol"] = f"{len(protos)}-fold"
            row["n_seeds"] = len(protos)
            for key, _, _, _ in METRICS:
                # Mean over the seeds of each fold first, then mean/SD over folds.
                fold_means = [m for m in (_agg_over(proto_map[p], key)[0] for p in protos)
                              if not np.isnan(m)]
                fm = np.array(fold_means, np.float64)
                row[f"{key}_mean"] = float(fm.mean()) if fm.size else float("nan")
                row[f"{key}_sd"] = float(fm.std()) if fm.size else float("nan")
        else:
            # With no named protocol at all, fall back to the runs stored under None.
            proto = protos[0] if protos else None
            items = proto_map.get(proto, [])
            row["protocol"] = _proto_label(dataset, proto)
            row["n_seeds"] = len(items)
            for key, _, _, _ in METRICS:
                row[f"{key}_mean"], row[f"{key}_sd"] = _agg_over(items, key)
        rows.append(row)
    return rows
|
|
|
|
| |
# Row order of the method axis in every table.
_ARCH_ORDER = [
    "unet",
    "unetpp",
    "deeplabv3plus",
    "attention_unet",
    "transunet",
    "swinunet",
    "nnunet",
    "umamba",
]

# Arch key -> name shown in the tables.
_ARCH_DISP = {
    "unet": "UNet",
    "unetpp": "UNet++",
    "deeplabv3plus": "DeepLabV3+",
    "attention_unet": "Attention-UNet",
    "transunet": "TransUNet",
    "swinunet": "Swin-UNet",
    "nnunet": "nnU-Net",
    "umamba": "U-Mamba",
}

# Column order of the dataset axis in every table.
_DS_ORDER = [
    "cvc_clinicdb",
    "kvasir_seg",
    "fives",
    "busi",
    "refuge2",
    "acdc_png",
    "idridd_segmentation",
    "pannuke_semantic",
    "medsegdb_isic2018",
    "medsegdb_kits19",
]

# Dataset key -> name shown in the tables.
_DS_DISP = {
    "cvc_clinicdb": "CVC-ClinicDB",
    "kvasir_seg": "Kvasir-SEG",
    "fives": "FIVES",
    "busi": "BUSI",
    "refuge2": "REFUGE2",
    "acdc_png": "ACDC",
    "idridd_segmentation": "IDRiD",
    "pannuke_semantic": "PanNuke",
    "medsegdb_isic2018": "ISIC2018",
    "medsegdb_kits19": "KiTS19",
}
|
|
|
|
def _fmt(row, key, pct):
    """Render ``row``'s ``<key>_mean`` / ``<key>_sd`` pair as ``mean±sd``.

    Both numbers are printed with two decimals, scaled by 100 first when *pct*
    is true. A NaN mean is rendered as an em dash.
    """
    mean = row[f"{key}_mean"]
    sd = row[f"{key}_sd"]
    if mean != mean:  # NaN is the only value that is unequal to itself
        return "—"
    if pct:
        return f"{mean * 100:.2f}±{sd * 100:.2f}"
    return f"{mean:.2f}±{sd:.2f}"
|
|
|
|
def _grid(rows):
    """Index summary rows and work out the table axes.

    Returns:
        ``(cell, datasets, methods)`` where ``cell`` maps ``(dataset, arch)`` to
        its row; ``methods`` lists the archs present in ``_ARCH_ORDER`` order
        (or, if none of them is known, all archs sorted); and ``datasets``
        lists known datasets in ``_DS_ORDER`` order followed by unknown ones in
        order of first appearance.
    """
    cell = {(r["dataset"], r["arch"]): r for r in rows}

    archs_present = {r["arch"] for r in rows}
    methods = [a for a in _ARCH_ORDER if a in archs_present]
    if not methods:
        methods = sorted(archs_present)

    datasets_present = {r["dataset"] for r in rows}
    datasets = [d for d in _DS_ORDER if d in datasets_present]
    for r in rows:
        name = r["dataset"]
        if name not in _DS_ORDER and name not in datasets:
            datasets.append(name)
    return cell, datasets, methods
|
|
|
|
| |
def _per_image_dice_vec(runs_for_da):
    """Build one per-image Dice vector from all runs of a (dataset, arch) pair.

    Runs are grouped by protocol. Within a protocol the per-image Dice vectors
    of the individual runs are cut to the shortest length and NaN-averaged
    position by position; the per-protocol results are then concatenated in
    sorted protocol order. Runs without per-image entries are ignored. Returns
    an empty array when there is nothing to use.
    """
    grouped = defaultdict(list)
    for run in runs_for_da:
        grouped[run.get("protocol")].append(run)

    pieces = []
    for proto in sorted(grouped):
        vectors = []
        for run in grouped[proto]:
            vec = np.array([img.get("dice", np.nan) for img in run.get("per_image", [])], float)
            if vec.size:
                vectors.append(vec)
        if vectors:
            shortest = min(v.size for v in vectors)
            stacked = np.stack([v[:shortest] for v in vectors])
            pieces.append(np.nanmean(stacked, axis=0))

    if not pieces:
        return np.array([])
    return np.concatenate(pieces)
|
|
|
|
def _sig_tied_sets(runs):
    """Find, per dataset, the archs statistically tied with the best Dice.

    For each dataset the arch with the highest mean per-image Dice is the
    reference. Every other arch listed in ``_ARCH_ORDER`` is compared with it
    by a paired Wilcoxon test on the per-image Dice vectors (paired by
    position on their common prefix) and joins the set unless ``p < 0.05``.
    Comparisons that cannot be carried out (fewer than 6 usable pairs,
    identical vectors, or a failing test) count as "not significantly worse".

    Args:
        runs: Iterable of decoded ``metrics.json`` dicts.

    Returns:
        ``{dataset: set(archs)}``. Datasets without any usable per-image Dice
        are absent. ``None`` is returned when SciPy cannot be imported, so that
        callers -- which treat ``None`` as "no significance information" --
        fall back to bolding the single best value instead of bolding nothing.
    """
    try:
        from scipy.stats import wilcoxon
    except Exception:  # SciPy is optional; a broken install can raise more than ImportError
        return None

    by_da = defaultdict(list)
    for d in runs:
        by_da[(d.get("dataset"), d.get("arch"))].append(d)

    def pval(a, b):
        """Paired Wilcoxon p-value on the common prefix of *a* and *b*."""
        n = min(a.size, b.size)
        if n < 6:
            # NaN compares False against 0.05, so the caller treats this as tied.
            return float("nan")
        x, y = a[:n], b[:n]
        ok = ~(np.isnan(x) | np.isnan(y))
        if ok.sum() < 6 or np.allclose(x[ok], y[ok]):
            return 1.0
        try:
            return float(wilcoxon(x[ok], y[ok]).pvalue)
        except Exception:
            # Best effort: a test that cannot be computed is treated as a tie.
            return 1.0

    out = {}
    for ds in {k[0] for k in by_da}:
        vecs = {}
        for a in _ARCH_ORDER:
            if (ds, a) not in by_da:
                continue
            v = _per_image_dice_vec(by_da[(ds, a)])
            # An all-NaN vector has a NaN mean, which max() could pick as "best".
            if v.size and not np.isnan(v).all():
                vecs[a] = v
        if not vecs:
            continue
        means = {a: float(np.nanmean(v)) for a, v in vecs.items()}
        best = max(means, key=means.get)
        tied = {best}
        for a, v in vecs.items():
            if a != best and not (pval(vecs[best], v) < 0.05):
                tied.add(a)
        out[ds] = tied
    return out
|
|
|
|
| |
def to_csv(rows):
    """Serialise summary rows as CSV text, one line per (dataset, arch).

    Columns are ``dataset, protocol, arch, n_seeds`` followed by
    ``<key>_mean, <key>_sd`` for every entry of ``METRICS``. Values are written
    via ``str()``; fields containing a comma, quote or newline are quoted by the
    ``csv`` module so they cannot break the column layout. Lines end with
    ``"\\n"``.

    Args:
        rows: Row dicts as produced by ``summarize``.

    Returns:
        The CSV document as a single string, header line included.

    Raises:
        KeyError: If a row lacks one of the columns.
    """
    import csv
    import io

    cols = ["dataset", "protocol", "arch", "n_seeds"]
    for k, _, _, _ in METRICS:
        cols += [f"{k}_mean", f"{k}_sd"]

    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow(cols)
    for r in rows:
        # str() first, so None is written as "None" rather than as an empty field.
        writer.writerow([str(r[c]) for c in cols])
    return buf.getvalue()
|
|
|
|
def _dice_matrix(rows):
    """Return ``(cell, datasets, methods, avg)`` for the main Dice table.

    ``cell``, ``datasets`` and ``methods`` come straight from ``_grid``.
    ``avg`` maps each method to the NaN-ignoring mean of its ``dice_mean``
    over the datasets it has a cell for (NaN when it has none).
    """
    cell, datasets, methods = _grid(rows)
    avg = {}
    for a in methods:
        scores = [cell[(d, a)]["dice_mean"] for d in datasets if (d, a) in cell]
        avg[a] = np.nanmean(scores if scores else [np.nan])
    return cell, datasets, methods, avg
|
|
|
|
def _dice_bold(a, d, cell, best, sig):
    """Decide whether the Dice cell of (dataset *d*, arch *a*) is printed bold.

    Args:
        a: Arch key.
        d: Dataset key.
        cell: Mapping ``(dataset, arch) -> row``; the row's ``dice_mean`` is used.
        best: Mapping ``dataset -> best dice_mean`` on that dataset.
        sig: Mapping ``dataset -> set of archs tied for best``, or ``None``.

    Returns:
        ``False`` when the cell does not exist. Otherwise membership in
        ``sig[d]`` when significance information exists for this dataset, and
        "equals the best mean" when it does not (``sig`` is ``None`` or has no
        entry for *d*), so a dataset never ends up without any bold cell merely
        because no significance test could be run for it.
    """
    if (d, a) not in cell:
        return False
    if sig is not None and d in sig:
        return a in sig[d]
    return cell[(d, a)]["dice_mean"] == best.get(d)
|
|
|
|
def to_markdown(rows, sig=None):
    """Render the main Dice table (methods × datasets) as Markdown.

    Cells show ``mean±SD`` in percent, missing cells an en dash. Bold cells are
    chosen by ``_dice_bold`` from *sig* and the per-dataset best mean.
    """
    cell, datasets, methods, _ = _dice_matrix(rows)
    head = ["Method"] + [_DS_DISP.get(d, d) for d in datasets]

    pieces = [
        "## Main results — Dice (mean±SD %, ↑)\n\n",
        "_**Bold** = best or not significantly worse than best per dataset "
        "(paired Wilcoxon on per-image Dice, p≥0.05). No cross-dataset average column — "
        "the seven modalities are too heterogeneous for one number to be meaningful._\n\n",
        "| " + " | ".join(head) + " |\n",
        "|" + "---|" * len(head) + "\n",
    ]

    best = {}
    for d in datasets:
        best[d] = max((cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell),
                      default=np.nan)

    for a in methods:
        line = [_ARCH_DISP.get(a, a)]
        for d in datasets:
            if (d, a) not in cell:
                line.append("–")
                continue
            text = _fmt(cell[(d, a)], "dice", True)
            if _dice_bold(a, d, cell, best, sig):
                text = f"**{text}**"
            line.append(text)
        pieces.append("| " + " | ".join(line) + " |\n")
    return "".join(pieces)
|
|
|
|
def to_latex(rows, sig=None):
    """Render the main Dice table (methods × datasets) as a booktabs ``tabular``.

    Cells show the mean Dice in percent with one decimal, missing cells
    ``--``. Bold cells are chosen by ``_dice_bold``. A ``\\midrule`` is placed
    after the ``attention_unet`` row unless it is the last row.

    Method and dataset names are LaTeX-escaped, so a name that falls back to
    its raw key (for example one containing ``_``) cannot break compilation.
    The built-in display names contain no special characters and are emitted
    unchanged.

    Args:
        rows: Row dicts as produced by ``summarize``.
        sig: Optional ``{dataset: set(archs)}`` significance sets.

    Returns:
        The LaTeX source as a string.
    """
    # One translation pass, so characters produced by escaping are never re-escaped.
    specials = str.maketrans({
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "$": r"\$",
        "#": r"\#",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    })

    def tex(name):
        """Escape *name* for use as plain LaTeX text."""
        return str(name).translate(specials)

    cell, datasets, methods, _ = _dice_matrix(rows)
    spec = "l" + "c" * len(datasets)
    parts = [
        "% Main results: Dice (mean over seeds, %). Bold = best or not significantly\n"
        "% worse than best per dataset (paired Wilcoxon on per-image Dice, p>=0.05).\n"
        "% No cross-dataset average column (modalities too heterogeneous).\n",
        "\\begin{tabular}{" + spec + "}\n\\toprule\n",
        "Method & " + " & ".join(tex(_DS_DISP.get(d, d)) for d in datasets) + " \\\\\n\\midrule\n",
    ]
    best = {d: max((cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell), default=np.nan)
            for d in datasets}
    last = len(methods) - 1
    for i, a in enumerate(methods):
        cells = [tex(_ARCH_DISP.get(a, a))]
        for d in datasets:
            if (d, a) in cell:
                t = f"{cell[(d, a)]['dice_mean'] * 100:.1f}"
                cells.append(f"\\textbf{{{t}}}" if _dice_bold(a, d, cell, best, sig) else t)
            else:
                cells.append("--")
        parts.append(" & ".join(cells) + " \\\\\n")
        # Group separator after the attention_unet row; skipped when that row is
        # the last one, where it would sit directly on top of \bottomrule.
        if a == "attention_unet" and i < last:
            parts.append("\\midrule\n")
    parts.append("\\bottomrule\n\\end{tabular}\n")
    return "".join(parts)
|
|
|
|
| |
# Rows of the "Datasets" setup table, in the column order used by _setup_html:
# (#, dataset, modality, target, classes, channels, native size, protocol, train/val/test).
_DATASETS_INFO = [
    ("1", "CVC-ClinicDB", "Colonoscopy (endoscopy)", "Polyp", "2", "RGB", "384×288", "official", "490 / 61 / 61"),
    ("2", "Kvasir-SEG", "GI endoscopy", "Polyp", "2", "RGB", "~622×529 (var)", "official", "800 / 100 / 100"),
    ("3", "FIVES", "Retinal fundus", "Vessel", "2", "RGB", "2048×2048", "official", "480 / 120 / 200"),
    ("4", "BUSI", "Breast ultrasound", "Tumor", "2", "grayscale¹", "variable", "single-split²", "545 / 78 / 157"),
    ("5", "REFUGE2", "Retinal fundus", "Optic disc & cup", "3", "RGB", "~2124×2056", "official", "400 / 400 / 400"),
    ("6", "ACDC", "Cardiac MRI (2D slices)", "RV / Myo / LV", "4", "grayscale", "~240×256 (var)", "official", "136 / 210 / 380"),
    ("7", "IDRiD", "Retinal fundus", "DR lesions (4) + optic disc", "6", "RGB", "4288×2848", "official", "43 / 11 / 27"),
    ("8", "PanNuke", "Histopathology (H&E)", "Nuclei (5 types)", "6", "RGB", "256×256", "official 3-fold CV", "~2.7k / 2.6k / 2.6k per fold"),
    ("9", "ISIC2018", "Dermoscopy", "Skin lesion", "2", "RGB", "256×256", "holdout", "2582 / 369 / 737"),
    ("10", "KiTS19", "Kidney CT (2D slices)", "Kidney (binary)", "2", "grayscale¹", "256×256", "single-split²", "2832 / 479 / 705"),
]

# Rows of the "Methods" setup table: (method, family, backbone / setup).
_METHODS_INFO = [
    ("UNet", "CNN encoder–decoder", "SMP, ResNet-50 encoder (ImageNet)"),
    ("UNet++", "Nested UNet", "SMP, ResNet-50 (ImageNet)"),
    ("DeepLabV3+", "Atrous CNN", "SMP, ResNet-50 (ImageNet)"),
    ("Attention-UNet", "Attention-gated UNet", "Re-implemented, from scratch"),
    ("TransUNet", "CNN–Transformer hybrid", "R50-ViT-B/16 (ImageNet), input 256"),
    ("Swin-UNet", "Pure-Transformer UNet", "Swin-Tiny (ImageNet), input 224"),
    ("nnU-Net (v2)", "Self-configuring CNN", "2D config, 250 epochs"),
    ("U-Mamba", "State-space (Mamba) UNet", "U-Mamba_Bot, 100 epochs"),
]

# Rows of the "Metrics" setup table: (metric, definition, direction, unit, description).
# The description column is rendered as-is (Chinese text) in the HTML report.
_METRICS_INFO = [
    ("Dice (DSC)", "2TP / (2TP+FP+FN)", "↑", "%", "区域重叠度(主指标),对类别不平衡较鲁棒。"),
    ("IoU (Jaccard)", "TP / (TP+FP+FN)", "↑", "%", "交并比,更严格的重叠度,常与 Dice 并列。"),
    ("HD95", "95% Hausdorff distance (boundaries)", "↓", "px", "边界最大误差的95%分位,越小边界越贴合。"),
    ("ASSD", "average symmetric surface distance", "↓", "px", "平均对称表面距离,整体边界吻合度。"),
    ("Sensitivity", "TP / (TP+FN)", "↑", "%", "召回/敏感度,反映漏分割程度。"),
    ("Specificity", "TN / (TN+FP)", "↑", "%", "特异度,背景误报控制。"),
    ("Precision", "TP / (TP+FP)", "↑", "%", "精确率,反映过分割/误报程度。"),
]

# Foreground class id -> display name for the datasets that get a per-class table.
_PERCLASS_NAMES = {
    "acdc_png": {
        "1": "RV",
        "2": "Myocardium",
        "3": "LV",
    },
    "refuge2": {
        "1": "Optic Disc",
        "2": "Optic Cup",
    },
    "idridd_segmentation": {
        "1": "MA",
        "2": "Haemorrhage",
        "3": "Hard Exudate",
        "4": "Soft Exudate",
        "5": "Optic Disc",
    },
    "pannuke_semantic": {
        "1": "Neoplastic",
        "2": "Inflammatory",
        "3": "Connective",
        "4": "Dead",
        "5": "Epithelial",
    },
}
|
|
|
|
def _collect_perclass(runs):
    """Average per-class Dice over all images of all runs, per (dataset, arch).

    Reads ``per_image[*]["per_class"][<class>]["dice"]`` from every run,
    ignoring missing, ``None`` and NaN values.

    Returns:
        ``{(dataset, arch): {class_id: mean_dice}}``. Pairs and classes without
        a single usable value do not appear.
    """
    pooled = defaultdict(lambda: defaultdict(list))
    for run in runs:
        key = (run.get("dataset"), run.get("arch"))
        for image in run.get("per_image", []):
            per_class = image.get("per_class") or {}
            for cls, metrics in per_class.items():
                dice = (metrics or {}).get("dice")
                if dice is None or dice != dice:  # skip missing and NaN
                    continue
                pooled[key][cls].append(dice)

    result = {}
    for key, by_class in pooled.items():
        result[key] = {cls: float(np.mean(vals)) for cls, vals in by_class.items() if vals}
    return result
|
|
|
|
# Inline stylesheet for the HTML report; to_html() embeds it verbatim inside the
# page's <style> element. Classes: "rt" = result tables, "info" = setup tables,
# "cap" = captions, "note" = highlighted note box, "tw" = horizontally scrollable wrapper.
_CSS = """
body{font-family:'Helvetica Neue',Arial,sans-serif;margin:30px auto;max-width:1180px;color:#1a1a1a;line-height:1.5}
h1{font-size:21px;margin:0 0 4px}h2{font-size:15px;color:#0a5a33;margin:30px 0 4px;border-bottom:1px solid #e3e3e3;padding-bottom:3px}
h3{font-size:13px;margin:16px 0 4px;color:#333}
p,li{font-size:13px}code{background:#f2f2f2;padding:1px 4px;border-radius:3px}
.cap{color:#666;font-size:11.5px;margin:3px 0 6px}
.tw{overflow-x:auto}
table.rt{border-collapse:collapse;margin:6px 0 8px;font-size:11.5px}
table.rt th,table.rt td{padding:4px 9px;text-align:center;white-space:nowrap}
table.rt thead th{border-top:2px solid #222;border-bottom:1.2px solid #222;font-weight:600}
table.rt tbody tr:last-child td{border-bottom:2px solid #222}
table.rt td.m,table.rt th.m{text-align:left;font-weight:600}
table.rt td.avg,table.rt th.avg{border-left:1px solid #c8c8c8;background:#f7f9f8}
table.rt tbody tr.grp td{border-top:1px solid #cfcfcf}
table.rt b{color:#08402a}
table.info{border-collapse:collapse;margin:6px 0 14px;font-size:12px}
table.info th,table.info td{border:1px solid #ddd;padding:4px 8px;text-align:center}
table.info th{background:#f3f3f3}table.info td.l{text-align:left}
.note{background:#eef7f0;border-left:3px solid #0a6;padding:8px 12px;font-size:12.5px;margin:8px 0}
hr{border:none;border-top:1px solid #e3e3e3;margin:24px 0}
"""
|
|
|
|
def _metric_table(cell, datasets, methods, key, pct, hib, bold_sets=None):
    """Render one metric as an HTML table: methods (rows) × datasets (columns).

    Deliberately has NO cross-dataset summary column: the datasets span several
    modalities of very different difficulty, so a simple average is not
    meaningful (and would conflict with the per-dataset ranking).

    Args:
        cell: Mapping ``(dataset, arch) -> row`` holding ``<key>_mean`` / ``<key>_sd``.
        datasets: Dataset keys, in column order.
        methods: Arch keys, in row order.
        key: Metric key prefix, e.g. ``"dice"``.
        pct: Forwarded to ``_fmt``; true formats the values as percentages.
        hib: True when higher is better (bold the maximum), false for the minimum.
        bold_sets: Optional ``{dataset: set(archs)}``. Where a dataset has an
            entry, exactly those archs are bold; where it has none (or
            ``bold_sets`` is ``None``) the single best value of the column is
            bold, so no column is left without a highlighted cell.

    Returns:
        The table as an HTML string. Missing, ``None`` and NaN values are
        rendered as an en dash. Display names are HTML-escaped.
    """
    from html import escape

    field = f"{key}_mean"

    def value(d, a):
        """Return the usable metric mean of (d, a), or None if absent/None/NaN."""
        row = cell.get((d, a))
        if row is None:
            return None
        v = row[field]
        return v if (v is not None and v == v) else None

    pick = max if hib else min
    best = {}
    for d in datasets:
        vals = {a: value(d, a) for a in methods}
        vals = {a: v for a, v in vals.items() if v is not None}
        best[d] = pick(vals, key=vals.get) if vals else None

    h = ["<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
         + "".join(f"<th>{escape(str(_DS_DISP.get(d, d)))}</th>" for d in datasets)
         + "</tr></thead><tbody>"]
    for a in methods:
        # The 'grp' class draws the rule that separates the two method groups.
        grp = " class='grp'" if a == "transunet" else ""
        tds = [f"<td class='m'>{escape(str(_ARCH_DISP.get(a, a)))}</td>"]
        for d in datasets:
            if value(d, a) is None:
                tds.append("<td>–</td>")
                continue
            t = _fmt(cell[(d, a)], key, pct)
            if bold_sets is not None and d in bold_sets:
                b = a in bold_sets[d]
            else:
                b = a == best[d]
            tds.append(f"<td>{'<b>' + t + '</b>' if b else t}</td>")
        h.append(f"<tr{grp}>" + "".join(tds) + "</tr>")
    h.append("</tbody></table></div>")
    return "\n".join(h)
|
|
|
|
def _perclass_section(runs):
    """Render per-class Dice tables for the datasets in ``_PERCLASS_NAMES``.

    One ``<h3>`` plus table is produced per dataset that has per-class data for
    at least one arch of ``_ARCH_ORDER``. Columns are the named classes in
    numeric order followed by a ``macro`` column, the unweighted mean over the
    classes present for that method. The best value of each class is bold.

    Args:
        runs: Iterable of decoded ``metrics.json`` dicts.

    Returns:
        The HTML fragment, or an empty string when no dataset has data.
    """
    pc = _collect_perclass(runs)
    h = []
    for ds, names in _PERCLASS_NAMES.items():
        methods = [a for a in _ARCH_ORDER if pc.get((ds, a))]
        if not methods:
            continue
        classes = sorted(names, key=int)

        # Best per class over the methods that actually have the class. Feeding
        # a NaN placeholder for absent classes into max() would make the result
        # NaN whenever the first method lacks the class, and nothing would be bold.
        colbest = {}
        for c in classes:
            have = [pc[(ds, a)][c] for a in methods if c in pc[(ds, a)]]
            colbest[c] = max(have) if have else None

        h.append(f"<h3>{_DS_DISP.get(ds, ds)}</h3>")
        h.append("<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
                 + "".join(f"<th>{names[c]}</th>" for c in classes)
                 + "<th class='avg'>macro</th></tr></thead><tbody>")
        for a in methods:
            grp = " class='grp'" if a == "transunet" else ""
            cells, present = [], []
            for c in classes:
                v = pc[(ds, a)].get(c)
                if v is None:
                    cells.append("<td>–</td>")
                else:
                    present.append(v)
                    t = f"{v * 100:.1f}"
                    cells.append(f"<td>{'<b>' + t + '</b>' if v == colbest[c] else t}</td>")
            # A method may only have classes that are not named here; show a
            # dash instead of the literal text "nan" in that case.
            macro = f"{sum(present) / len(present) * 100:.1f}" if present else "–"
            h.append(f"<tr{grp}><td class='m'>{_ARCH_DISP.get(a, a)}</td>{''.join(cells)}"
                     f"<td class='avg'>{macro}</td></tr>")
        h.append("</tbody></table></div>")
    return "\n".join(h)
|
|
|
|
def _setup_html():
    """Render the static setup appendix: dataset, method and metric tables.

    The rows come from ``_DATASETS_INFO``, ``_METHODS_INFO`` and
    ``_METRICS_INFO``. Returns the HTML fragment as one newline-joined string.
    """
    dataset_row = ("<tr><td>%s</td><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td><td>%s</td>"
                   "<td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>")
    method_row = "<tr><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td></tr>"
    metric_row = "<tr><td class='l'>%s</td><td class='l'>%s</td><td>%s</td><td>%s</td><td class='l'>%s</td></tr>"

    # A. Datasets
    parts = [
        "<h2>A. Datasets</h2>",
        "<table class='info'><tr><th>#</th><th>Dataset</th><th>Modality</th><th>Target</th><th>Cls</th>"
        "<th>Ch</th><th>Native size</th><th>Protocol</th><th>Train/Val/Test</th></tr>",
    ]
    parts.extend(dataset_row % info for info in _DATASETS_INFO)
    parts.append("</table>")
    parts.append("<div class='cap'>¹ BUSI/KiTS19 grayscale stored as 3-ch PNG (read as grayscale). "
                 "² no canonical split → one fixed fold (of 5) with 3 seeds; others use the official split. "
                 "Labels 0…C-1 (0=bg); multi-class metrics macro-averaged over foreground classes.</div>")

    # B. Methods
    parts.append("<h2>B. Methods</h2>")
    parts.append("<table class='info'><tr><th>Method</th><th>Family</th><th>Backbone / setup</th></tr>")
    parts.extend(method_row % info for info in _METHODS_INFO)
    parts.append("</table>")

    # C. Metrics
    parts.append("<h2>C. Metrics</h2>")
    parts.append("<table class='info'><tr><th>Metric</th><th>Definition</th><th>Dir</th><th>Unit</th>"
                 "<th>作用 / 含义(中文)</th></tr>")
    parts.extend(metric_row % info for info in _METRICS_INFO)
    parts.append("</table>")
    return "\n".join(parts)
|
|
|
|
def to_html(rows, runs=None, title="SegGen benchmark", sig=None):
    """Build the full HTML report.

    Sections: main Dice table, HD95, IoU, per-class Dice (only when *runs*
    yields per-class data), sensitivity/precision, and the setup appendix.

    Args:
        rows: Row dicts as produced by ``summarize``.
        runs: Optional raw runs; needed for the per-class section and, when
            *sig* is not given, for computing the significance sets.
        title: Page title, also used as the heading prefix. It is HTML-escaped
            before insertion because it may come from the command line.
        sig: Optional ``{dataset: set(archs)}`` used to bold the Dice table.
            When ``None`` it is computed from *runs* if those are given.

    Returns:
        The complete HTML document as a string.
    """
    from html import escape

    cell, datasets, methods = _grid(rows)
    if sig is None:
        sig = _sig_tied_sets(runs) if runs else None
    title = escape(str(title))
    h = [f"<!doctype html><html><head><meta charset='utf-8'><title>{title}</title><style>{_CSS}</style>"
         "</head><body>"]
    h.append(f"<h1>{title}: 8 methods × 10 datasets (unified 512, resolution-fair)</h1>")
    h.append("<p>Eight 2D medical-image segmentation methods on ten public datasets (seven modalities). "
             "Values are <b>mean±SD</b> over 3 seeds (over the 3 folds for PanNuke). "
             "Each (dataset,method) cell aggregates tens–thousands of test images.</p>")
    h.append("<div class='note'><b>Resolution-fair protocol.</b> Convolutional nets train at 512; the fixed-input "
             "transformers (Swin-UNet 224, TransUNet 256) and nnU-Net/U-Mamba run at their native size; "
             "<b>every prediction and ground truth is then resized to a common 512×512 before scoring</b>, so "
             "boundary metrics (HD95/ASSD, in pixels) are directly comparable across methods.</div>")

    h.append("<h2>1. Main results — Dice (%) ↑</h2>")
    h.append("<div class='cap'><b>Bold</b> = best, or not significantly different from the best per dataset "
             "(paired Wilcoxon on per-image Dice, p≥0.05). "
             "Horizontal rule separates CNNs (top) from Transformer / foundation models (bottom). "
             "No cross-dataset average is reported — the seven modalities differ too much in difficulty "
             "for a single number to be meaningful.</div>")
    h.append(_metric_table(cell, datasets, methods, "dice", True, True, bold_sets=sig))

    h.append("<h2>2. Boundary accuracy — HD95 (px) ↓</h2>")
    h.append("<div class='cap'>95% Hausdorff distance at the common 512 resolution (lower = better; "
             "<b>bold</b> = best per dataset). Now comparable across methods.</div>")
    h.append(_metric_table(cell, datasets, methods, "hd95", False, False))

    h.append("<h2>3. Overlap — IoU (%) ↑</h2>")
    h.append("<div class='cap'>Jaccard index, the stricter overlap measure (<b>bold</b> = best per dataset).</div>")
    h.append(_metric_table(cell, datasets, methods, "iou", True, True))

    if runs:
        pcs = _perclass_section(runs)
        if pcs.strip():
            h.append("<h2>4. Per-class Dice (%) — multi-class datasets</h2>")
            h.append("<div class='cap'>Mean per-class Dice over all test images/runs (0=background excluded; "
                     "<b>bold</b>=best per class). The <i>macro</i> column weights each foreground class "
                     "equally (a within-dataset mean, not a cross-dataset one). It can differ by ~1 pt from "
                     "the §1 Dice — which is image-weighted (each image is first averaged over the classes it "
                     "contains) — whenever some images lack a class (e.g. ACDC's RV appears in only 335/380 "
                     "images); both conventions are standard, neither is an error.</div>")
            h.append(pcs)

    h.append("<h2>5. Supplementary metrics — Sensitivity & Precision (%) ↑</h2>")
    h.append("<div class='cap'>Two complementary error views (<b>bold</b> = best per dataset): low "
             "<b>Sensitivity</b> (recall) signals under-segmentation (missed foreground); low "
             "<b>Precision</b> signals over-segmentation (false positives). <i>Specificity</i> is omitted "
             "— background dominates, so it stays >96% with almost no spread across methods (≤0.6 pt on "
             "average) — and <i>ASSD</i> is omitted as redundant with HD95; both, and every metric, are "
             "tabulated in full in <code>summary.csv</code>.</div>")
    h.append("<h3>Sensitivity / recall ↑</h3>")
    h.append(_metric_table(cell, datasets, methods, "sensitivity", True, True))
    h.append("<h3>Precision ↑</h3>")
    h.append(_metric_table(cell, datasets, methods, "precision", True, True))

    h.append("<hr><h2>Appendix — Experimental setup</h2>")
    h.append("<p class='cap'>Full per-(dataset,method) values for <b>every</b> metric "
             "(IoU, HD95, ASSD, Sensitivity, Specificity, Precision, …) are in "
             "<code>summary.csv</code>; the Dice table as LaTeX is in <code>summary.tex</code>.</p>")
    h.append(_setup_html())
    h.append("</body></html>")
    return "\n".join(h)
|
|
|
|
def main():
    """Command-line entry point: aggregate one experiment and write the reports.

    Reads ``--exp_name`` (required) and ``--out_root`` (default ``results``),
    loads all runs below ``<out_root>/<exp_name>`` and writes
    ``summary.csv``, ``summary.md``, ``summary.tex`` and ``summary.html`` into
    that directory. The Markdown table is also printed. When no run is found a
    message is printed and nothing is written.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--exp_name", required=True)
    p.add_argument("--out_root", default="results")
    args = p.parse_args()

    runs = load_runs(args.out_root, args.exp_name)
    if not runs:
        print(f"no metrics.json under {args.out_root}/{args.exp_name}")
        return
    rows = summarize(runs)
    sig = _sig_tied_sets(runs)
    base = os.path.join(args.out_root, args.exp_name)

    def write_text(name, text):
        # The reports contain non-ASCII text (±, —, CJK) and the HTML declares
        # utf-8, so the encoding is fixed instead of left to the platform
        # default; the context manager guarantees the file is closed.
        with open(os.path.join(base, name), "w", encoding="utf-8") as f:
            f.write(text)

    markdown = to_markdown(rows, sig)  # rendered once: written and printed
    write_text("summary.csv", to_csv(rows))
    write_text("summary.md", markdown)
    write_text("summary.tex", to_latex(rows, sig))
    write_text("summary.html",
               to_html(rows, runs, title=f"SegGen benchmark ({args.exp_name})", sig=sig))
    print(markdown)
    print(f"{len(runs)} runs -> {len(rows)} (dataset,arch) cells; written {base}/summary.{{csv,md,tex,html}}")


if __name__ == "__main__":
    main()
|
|