# MLP baseline ablation: same multi-task loss, data, and schedule as
# GeometricShapeClassifier, with no geometric inductive bias in the model.

# --- Setup ---
import math
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pathlib import Path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

# Prefer bf16 autocast where supported (no loss scaling needed); fall back to fp16.
use_amp = device.type == "cuda"
amp_dtype = (torch.bfloat16 if (device.type == "cuda" and
                                torch.cuda.is_bf16_supported()) else torch.float16)
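
# Quick check of the AMP setup above (a sketch, not part of the experiment):
# under autocast, matmul outputs should come back in amp_dtype on CUDA.
if use_amp:
    with torch.amp.autocast('cuda', dtype=amp_dtype):
        _y = torch.randn(8, 8, device=device) @ torch.randn(8, 8, device=device)
    print(f"autocast matmul dtype: {_y.dtype} (expected {amp_dtype})")
    del _y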

# --- Model: plain MLP baseline ---

class MLPBaseline(nn.Module):
    """Plain MLP producing the same output dict as GeometricShapeClassifier.
    No geometric inductive bias. Same loss surface."""

    def __init__(self, grid_size=GS, n_classes=NUM_CLASSES,
                 n_curvatures=NUM_CURVATURES, trunk_dim=256):
        super().__init__()
        inp = grid_size ** 3  # flattened voxel grid

        self.trunk = nn.Sequential(
            nn.Linear(inp, 512), nn.GELU(),
            nn.Linear(512, 512), nn.GELU(),
            nn.Linear(512, trunk_dim), nn.GELU(),
            nn.Linear(trunk_dim, trunk_dim), nn.GELU(),
        )

        # Initial classification head.
        self.classifier = nn.Sequential(
            nn.Linear(trunk_dim, 128), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(128, n_classes))

        # Per-slot fill ratios in [0, 1], trained against the dim_conf targets.
        self.fill_head = nn.Sequential(
            nn.Linear(trunk_dim, 64), nn.GELU(),
            nn.Linear(64, 4), nn.Sigmoid())

        # Non-negative per-slot capacities (only regularized for diversity).
        self.cap_head = nn.Sequential(
            nn.Linear(trunk_dim, 32), nn.GELU(),
            nn.Linear(32, 4), nn.Softplus())

        # Peak-dimension logits (4-way).
        self.peak_head = nn.Sequential(
            nn.Linear(trunk_dim, 32), nn.GELU(), nn.Linear(32, 4))

        # Per-slot overflow probabilities, regularized past the peak slot.
        self.overflow_head = nn.Sequential(
            nn.Linear(trunk_dim, 32), nn.GELU(),
            nn.Linear(32, 4), nn.Sigmoid())

        # Volume regression in log1p space.
        self.volume_head = nn.Sequential(
            nn.Linear(trunk_dim, 64), nn.GELU(), nn.Linear(64, 1))

        # cm_det sign prediction, in [-1, 1].
        self.cm_head = nn.Sequential(
            nn.Linear(trunk_dim, 64), nn.GELU(),
            nn.Linear(64, 1), nn.Tanh())

        # Curved-vs-rigid probability.
        self.curved_head = nn.Sequential(
            nn.Linear(trunk_dim, 32), nn.GELU(),
            nn.Linear(32, 1), nn.Sigmoid())

        # Curvature-type logits.
        self.curv_type_head = nn.Sequential(
            nn.Linear(trunk_dim, 64), nn.GELU(),
            nn.Linear(64, n_curvatures))

        # Second classification head, standing in for the arbiter's
        # refined prediction.
        self.refiner = nn.Sequential(
            nn.Linear(trunk_dim, 128), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(128, n_classes))

        # Confidence and blend weights in [0, 1].
        self.confidence_head = nn.Sequential(
            nn.Linear(trunk_dim, 32), nn.GELU(),
            nn.Linear(32, 1), nn.Sigmoid())
        self.blend_head = nn.Sequential(
            nn.Linear(trunk_dim, 32), nn.GELU(),
            nn.Linear(32, 1), nn.Sigmoid())
    def forward(self, grid, labels=None):
        B = grid.shape[0]
        x = grid.reshape(B, -1).float()  # flatten the voxel grid
        feat = self.trunk(x)

        initial_logits = self.classifier(feat)
        refined_logits = self.refiner(feat)

        # Convex blend of the two classification heads; the blend weight is
        # itself predicted from the trunk features.
        blend = self.blend_head(feat).squeeze(-1)
        class_logits = (blend.unsqueeze(-1) * initial_logits +
                        (1 - blend.unsqueeze(-1)) * refined_logits)

        conf = self.confidence_head(feat)  # (B, 1); reused for both outputs

        return {
            "class_logits": class_logits,
            "initial_logits": initial_logits,
            "refined_logits": refined_logits,
            "fill_ratios": self.fill_head(feat),
            "peak_logits": self.peak_head(feat),
            "overflows": self.overflow_head(feat),
            "capacities": self.cap_head(feat),
            "volume_pred": self.volume_head(feat).squeeze(-1),
            "cm_pred": self.cm_head(feat).squeeze(-1),
            "is_curved_pred": self.curved_head(feat),
            "curv_type_logits": self.curv_type_head(feat),
            # The MLP has no refinement trajectory or flow field; emit
            # single-step / zero placeholders so the loss code is unchanged.
            "trajectory_logits": [refined_logits],
            "flow_loss": torch.tensor(0.0, device=grid.device),
            "refined_confidence": conf,
            "blend_weight": blend,
            "confidence": conf.squeeze(-1),
            "alternation": torch.zeros(B, device=grid.device),
            "features": feat,
        }
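
# Smoke test (a sketch; assumes GS, NUM_CLASSES, NUM_CURVATURES are defined
# above): a random batch should produce the output dict with expected shapes.
_m = MLPBaseline()
_out = _m(torch.randn(2, GS, GS, GS))
assert _out["class_logits"].shape == (2, NUM_CLASSES)
assert _out["fill_ratios"].shape == (2, 4)
assert _out["curv_type_logits"].shape == (2, NUM_CURVATURES)
del _m, _out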

# --- Loss helpers (same terms as the geometric model) ---

def _safe_bce(inp, tgt):
    # BCE is numerically unsafe in half precision; force fp32 outside autocast.
    with torch.amp.autocast(device.type, enabled=False):
        return F.binary_cross_entropy(inp.float(), tgt.float())

def capacity_fill_loss(fr, dt): return _safe_bce(fr, dt)

def overflow_reg(on, dt):
    # Penalize overflow activations at and beyond each sample's last filled
    # slot (index inferred from the fill targets), averaged over the batch.
    pk = dt.sum(dim=-1).long() - 1
    loss = sum(on[b, pk[b].item():].sum() for b in range(on.shape[0]))
    return loss / on.shape[0]

def cap_diversity(c): return -c.var()   # reward spread across capacities
def peak_loss(l, t): return F.cross_entropy(l, t)
def cm_loss(p, t): return F.mse_loss(p, torch.sign(t))
def curved_bce(p, t): return _safe_bce(p.squeeze(-1), t)
def ctype_loss(l, t): return F.cross_entropy(l, t)
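
# Sanity sketch for the helpers on dummy tensors (shapes assumed from their
# usage below): two filled slots means overflow is penalized from index 1 on.
_on = torch.tensor([[0.1, 0.2, 0.9, 0.9]])
_dt = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
print(f"overflow_reg: {overflow_reg(_on, _dt).item():.3f}")   # (0.2+0.9+0.9)/1 = 2.000
print(f"_safe_bce:    {_safe_bce(torch.tensor([0.9]), torch.tensor([1.0])).item():.3f}")  # ~0.105
del _on, _dt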

# --- Dataset: generate once, cache to disk ---

DATASET_PATH = Path("./cached_dataset.pt")
N_SAMPLES = 500_000
SEED = 42
CACHE_KEYS = ["grids", "labels", "dim_conf", "peak_dim", "volume",
              "cm_det", "is_curved", "curvature"]

if DATASET_PATH.exists():
    print(f"Loading cached dataset from {DATASET_PATH}...")
    t0 = time.time()
    _cached = torch.load(DATASET_PATH, weights_only=True)
    if _cached["n_samples"] == N_SAMPLES and _cached["seed"] == SEED:
        # Bypass __init__ and restore the tensor attributes directly.
        train_ds = ShapeDataset.__new__(ShapeDataset)
        val_ds = ShapeDataset.__new__(ShapeDataset)
        for k in CACHE_KEYS:
            setattr(train_ds, k, _cached["train"][k])
            setattr(val_ds, k, _cached["val"][k])
        print(f"Loaded {len(train_ds)} train + {len(val_ds)} val in {time.time()-t0:.1f}s")
    else:
        print("Cache mismatch; regenerating")
        DATASET_PATH.unlink()

if not DATASET_PATH.exists():
    print("Generating dataset...")
    all_samples = generate_parallel(N_SAMPLES, seed=SEED, n_workers=8)
    n_train = int(len(all_samples) * 0.8)  # 80/20 train/val split
    train_ds = ShapeDataset(all_samples[:n_train])
    val_ds = ShapeDataset(all_samples[n_train:])

    print(f"Caching to {DATASET_PATH}...")
    cache_data = {
        "n_samples": N_SAMPLES, "seed": SEED,
        "train": {k: getattr(train_ds, k) for k in CACHE_KEYS},
        "val": {k: getattr(val_ds, k) for k in CACHE_KEYS},
    }
    torch.save(cache_data, DATASET_PATH)
    size_mb = DATASET_PATH.stat().st_size / 1e6
    print(f"Cached: {size_mb:.0f}MB | {len(train_ds)} train + {len(val_ds)} val")

train_loader = torch.utils.data.DataLoader(
    train_ds, batch_size=4096, shuffle=True,
    num_workers=4, pin_memory=True, persistent_workers=True)
val_loader = torch.utils.data.DataLoader(
    val_ds, batch_size=4096, shuffle=False,
    num_workers=4, pin_memory=True, persistent_workers=True)
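
# One-batch sanity check (a sketch; optional, spins up the loader workers
# once): each batch should unpack into the eight fields the training loop uses.
_batch = next(iter(train_loader))
print("batch shapes:", [tuple(t.shape) for t in _batch])
del _batch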

# --- Training ---

# Reference numbers for the geometric model, used again in the summary below.
REF_PARAMS = 1_852_870
REF_ACC = 0.9022

model = MLPBaseline().to(device)
n_params = sum(p.numel() for p in model.parameters())
print(f"MLPBaseline: {n_params:,} params")
print(f"(GeometricShapeClassifier was {REF_PARAMS:,} params)")

if device.type == "cuda" and hasattr(torch, 'compile'):
    try:
        model = torch.compile(model, mode="default")
        print("torch.compile: enabled")
    except Exception as e:
        print(f"torch.compile: skipped ({e})")

epochs = 80
lr = 3e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
warmup_epochs = 5

def lr_lambda(ep):
    # Linear warmup, then cosine decay to ~0 over the remaining epochs.
    if ep < warmup_epochs:
        return (ep + 1) / warmup_epochs
    return 0.5 * (1 + math.cos(math.pi * (ep - warmup_epochs) / (epochs - warmup_epochs)))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
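
# Peek at the schedule (a sketch): the multiplier reaches 1.0 at the end of
# warmup, then decays along the cosine toward 0.
for _ep in (0, warmup_epochs - 1, warmup_epochs, epochs // 2, epochs - 1):
    print(f"  epoch {_ep:2d}: lr = {lr * lr_lambda(_ep):.2e}")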

w = {"cls": 1.0, "fill": 0.3, "peak": 0.3, "ovf": 0.05,
     "div": 0.02, "vol": 0.1, "cm": 0.1, "curved": 0.2, "ctype": 0.2,
     "arb_cls": 0.8, "arb_traj": 0.2, "arb_conf": 0.1, "flow": 0.5}

# Gradient scaling is only needed for fp16 autocast; bf16 has enough range.
use_scaler = use_amp and amp_dtype == torch.float16
scaler = torch.amp.GradScaler('cuda', enabled=use_scaler)
| | print(f"\nAblation: MLPBaseline vs GeometricShapeClassifier") |
| | print(f"Same loss ({len(w)} terms), same data, same schedule") |
| | print(f"{'='*70}") |
| |
|
| | best_val_acc = 0 |
| | t_start = time.time() |
| |
|

for epoch in range(epochs):
    t0 = time.time()
    model.train()
    correct, total = 0, 0
    correct_init, correct_ref = 0, 0

    for grid, label, dc, pd, vol, cm, ic, ct in train_loader:
        grid, label, dc, pd, vol, cm, ic, ct = (
            t.to(device, non_blocking=True)
            for t in (grid, label, dc, pd, vol, cm, ic, ct))

        # Light augmentation: voxel dropout/addition and small shifts.
        grid = deform_grid(grid, p_dropout=0.05, p_add=0.05, p_shift=0.08)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast('cuda', enabled=use_amp, dtype=amp_dtype):
            out = model(grid, labels=label)

            # First-stage losses: classification plus the auxiliary
            # geometric regression/classification targets.
            loss_first = (w["cls"] * F.cross_entropy(out["initial_logits"], label) +
                          w["fill"] * capacity_fill_loss(out["fill_ratios"], dc) +
                          w["peak"] * peak_loss(out["peak_logits"], pd) +
                          w["ovf"] * overflow_reg(out["overflows"], dc) +
                          w["div"] * cap_diversity(out["capacities"]) +
                          w["vol"] * F.mse_loss(out["volume_pred"], torch.log1p(vol)) +
                          w["cm"] * cm_loss(out["cm_pred"], cm) +
                          w["curved"] * curved_bce(out["is_curved_pred"], ic) +
                          w["ctype"] * ctype_loss(out["curv_type_logits"], ct))

            # Arbiter losses: refined classification plus a step-weighted
            # trajectory term (a single step for the MLP baseline).
            loss_arb = w["arb_cls"] * F.cross_entropy(out["refined_logits"], label)
            traj_loss = 0
            for step_i, step_logits in enumerate(out["trajectory_logits"]):
                step_weight = (step_i + 1) / len(out["trajectory_logits"])
                traj_loss += step_weight * F.cross_entropy(step_logits, label)
            traj_loss /= len(out["trajectory_logits"])
            loss_arb += w["arb_traj"] * traj_loss
            loss_arb += w["flow"] * out["flow_loss"]
            # Confidence target: was the refined prediction correct? Targets
            # are computed without gradient; the BCE itself still
            # backpropagates into the confidence head.
            with torch.no_grad():
                is_correct = (out["refined_logits"].argmax(1) == label).float()
            loss_arb += w["arb_conf"] * _safe_bce(
                out["refined_confidence"].squeeze(-1), is_correct)

            # Blend target: lean toward whichever head is currently more
            # accurate on this sample (soft 0.8/0.2 targets).
            with torch.no_grad():
                init_correct = (out["initial_logits"].argmax(1) == label).float()
                ref_correct = (out["refined_logits"].argmax(1) == label).float()
                blend_target = torch.where(init_correct >= ref_correct,
                                           torch.full_like(init_correct, 0.8),
                                           torch.full_like(init_correct, 0.2))
            loss_arb += 0.1 * _safe_bce(out["blend_weight"], blend_target)
            # Blended-head classification loss and total.
            loss_blend = w["cls"] * F.cross_entropy(out["class_logits"], label)
            loss = loss_first + loss_arb + loss_blend

        # Backward outside autocast; unscale before clipping so the norm
        # threshold applies to the true gradients.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        correct += (out["class_logits"].argmax(1) == label).sum().item()
        correct_init += (out["initial_logits"].argmax(1) == label).sum().item()
        correct_ref += (out["refined_logits"].argmax(1) == label).sum().item()
        total += grid.size(0)
    scheduler.step()

    if epoch == 0 and device.type == "cuda":
        peak = torch.cuda.max_memory_allocated() / 1e9
        print(f"VRAM peak: {peak:.2f}GB | throughput: {total/(time.time()-t0):.0f} samples/s")

    # Validation.
    model.eval()
    vc, vt, vcc, vct = 0, 0, 0, 0
    vc_init, vc_ref = 0, 0
    with torch.no_grad(), torch.amp.autocast('cuda', enabled=use_amp, dtype=amp_dtype):
        for grid, label, dc, pd, vol, cm, ic, ct in val_loader:
            grid = grid.to(device, non_blocking=True)
            label = label.to(device, non_blocking=True)
            ic = ic.to(device, non_blocking=True)
            out = model(grid)
            vc += (out["class_logits"].argmax(1) == label).sum().item()
            vc_init += (out["initial_logits"].argmax(1) == label).sum().item()
            vc_ref += (out["refined_logits"].argmax(1) == label).sum().item()
            vt += grid.size(0)
            vcc += ((out["is_curved_pred"].squeeze(-1) > 0.5).float() == ic).sum().item()
            vct += grid.size(0)

    val_acc = vc / vt
    val_init = vc_init / vt
    val_ref = vc_ref / vt
    curved_acc = vcc / vct
    marker = " *" if val_acc > best_val_acc else ""
    if val_acc > best_val_acc:
        best_val_acc = val_acc

    dt = time.time() - t0
    if (epoch + 1) % 10 == 0 or epoch == 0 or marker:
        print(f"Ep {epoch+1:3d}/{epochs} [{dt:.1f}s] | "
              f"blend {val_acc:.3f} init {val_init:.3f} arb {val_ref:.3f} | "
              f"curved {curved_acc:.3f}{marker}")

total_time = time.time() - t_start
print(f"\nDone in {total_time:.0f}s ({total_time/60:.1f}min)")

# --- Per-class evaluation ---
| | print(f"\n{'='*70}") |
| | print(f"Per-Class Results — MLPBaseline") |
| | print(f"{'='*70}") |
| |
|
| | model.eval() |
| | cc_b = {n: 0 for n in CLASS_NAMES} |
| | cc_i = {n: 0 for n in CLASS_NAMES} |
| | cc_r = {n: 0 for n in CLASS_NAMES} |
| | ct_c = {n: 0 for n in CLASS_NAMES} |
| |
|

with torch.no_grad(), torch.amp.autocast('cuda', enabled=use_amp, dtype=amp_dtype):
    for grid, label, *_ in val_loader:
        grid = grid.to(device, non_blocking=True)
        label = label.to(device, non_blocking=True)
        out = model(grid)
        pb = out["class_logits"].argmax(1)
        pi = out["initial_logits"].argmax(1)
        pr = out["refined_logits"].argmax(1)
        for k in range(len(label)):
            name = CLASS_NAMES[label[k].item()]
            cc_b[name] += (pb[k] == label[k]).item()
            cc_i[name] += (pi[k] == label[k]).item()
            cc_r[name] += (pr[k] == label[k]).item()
            ct_c[name] += 1
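
# Note (a sketch): an equivalent vectorized tally would use per-batch bincounts,
#   hits = torch.bincount(label[pb == label], minlength=len(CLASS_NAMES))
#   tots = torch.bincount(label, minlength=len(CLASS_NAMES))
# The explicit loop above is kept for readability at this dataset size.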

print(f"\n{'Class':22s} | {'Blend':>5s} {'Init':>5s} {'Arb':>5s} | "
      f"{'Corr':>4s}/{'Tot':>4s} | {'Type':8s} Curvature")
print("-" * 85)
for name in CLASS_NAMES:
    if ct_c[name] == 0:
        continue
    ab = cc_b[name] / ct_c[name]
    ai = cc_i[name] / ct_c[name]
    ar = cc_r[name] / ct_c[name]
    info = SHAPE_CATALOG[name]
    print(f" {name:20s} | {ab:.3f} {ai:.3f} {ar:.3f} | "
          f"{cc_b[name]:4d}/{ct_c[name]:4d} | "
          f"{'CURVED' if info['curved'] else 'rigid':8s} {info['curvature']}")

# --- Ablation summary ---
| | print(f"\n{'='*70}") |
| | print(f"ABLATION SUMMARY") |
| | print(f"{'='*70}") |
| | print(f" MLPBaseline: {n_params:>10,} params | best val acc: {best_val_acc:.4f}") |
| | print(f" GeometricShapeClassifier: 1,852,870 params | best val acc: 0.9022") |
| | print(f" Delta: {n_params - 1852870:>+10,} params | " |
| | f"delta acc: {best_val_acc - 0.9022:+.4f}") |
| | print() |
| | if best_val_acc >= 0.89: |
| | print(" -> Loss is doing most of the work.") |
| | print(" The composite multi-task signal is sufficient to discover") |
| | print(" geometric structure without architectural inductive bias.") |
| | elif best_val_acc >= 0.80: |
| | print(" -> Architecture contributes meaningfully.") |
| | print(" The loss provides signal but the geometric inductive bias") |
| | print(" (capacity cascade, tracers, flow arbiter) adds real value.") |
| | else: |
| | print(" -> Architecture is critical.") |
| | print(" The MLP cannot recover the same behavior from loss alone.") |
| | print(" Geometric inductive bias is doing the heavy lifting.") |
| | print(f"\n Curved detection: {curved_acc:.3f}") |
| | print(f" Training time: {total_time:.0f}s ({total_time/60:.1f}min)") |