| |
| """ |
| Training loop for T10 Triplet Next-Action Prediction. |
| |
| Usage example: |
| python3 experiments/train_seqpred.py \ |
| --model dailyactformer \ |
| --modalities imu,emg,eyetrack,mocap,pressure \ |
| --t_obs 8 --t_fut 2 \ |
| --epochs 40 --batch_size 32 --lr 3e-4 \ |
| --output_dir results/seqpred/ours_all5_tfut2_seed42 \ |
| --seed 42 |
| """ |
|
|
| from __future__ import annotations |
|
|
| |
| |
| import pandas |
|
|
| import argparse |
| import json |
| import os |
| import random |
| import sys |
| import time |
| from pathlib import Path |
| from typing import Dict |
|
|
| import numpy as np |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from torch.utils.data import DataLoader |
|
|
| |
| |
| |
# Make the imports below resolvable both as `experiments.<mod>` (repo root on
# sys.path) and as bare `<mod>` (running this file from inside experiments/).
THIS = Path(__file__).resolve()
sys.path.insert(0, str(THIS.parent))
sys.path.insert(0, str(THIS.parents[1]))
|
|
# Dual-mode project imports: prefer the package form (`experiments.*`, works
# when the repo root is on sys.path); fall back to flat imports when the
# script is executed directly from the experiments/ directory.
try:
    from experiments.dataset_seqpred import (
        TripletSeqPredDataset, build_train_test, collate_triplet,
        TRAIN_VOLS_V3, TEST_VOLS_V3,
    )
    from experiments.models_seqpred import build_model
    from experiments.taxonomy import (
        NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND,
    )
except ModuleNotFoundError:
    from dataset_seqpred import (
        TripletSeqPredDataset, build_train_test, collate_triplet,
        TRAIN_VOLS_V3, TEST_VOLS_V3,
    )
    from models_seqpred import build_model
    from taxonomy import (
        NUM_VERB_FINE, NUM_VERB_COMPOSITE, NUM_NOUN, NUM_HAND,
    )
|
|
|
|
| |
| |
| |
|
|
def set_seed(seed: int) -> None:
    """Seed every RNG the training run depends on.

    Covers python's `random`, numpy, torch CPU, and all CUDA devices
    (the CUDA call is a no-op on CPU-only machines).
    """
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)
|
|
|
|
def top_k_correct(logits: torch.Tensor, target: torch.Tensor, k: int) -> torch.Tensor:
    """Return a bool tensor (B,) indicating whether `target` is in top-k of logits.

    `k` is clamped to the number of classes so oversized k never raises.
    """
    k_eff = min(k, logits.size(1))
    top_idx = logits.topk(k_eff, dim=1).indices
    return top_idx.eq(target.view(-1, 1)).any(dim=1)
|
|
|
|
def mean_class_recall(logits: torch.Tensor, target: torch.Tensor,
                      num_classes: int) -> float:
    """Macro-averaged (mean per-class) recall.

    Classes absent from `target` are skipped so they neither help nor hurt
    the average; returns 0.0 when no class is present at all.
    """
    pred = logits.argmax(dim=1)
    recalls = []
    for cls in range(num_classes):
        mask = target.eq(cls)
        if not bool(mask.any()):
            continue  # class never appears in this split
        recalls.append(float(pred[mask].eq(cls).float().mean().item()))
    return float(np.mean(recalls)) if recalls else 0.0
|
|
|
|
def build_class_weights(counts: np.ndarray) -> torch.Tensor:
    """Inverse-frequency weights, normalized so mean weight = 1.

    Counts are floored at 1 so unseen classes get a finite (maximal) weight
    instead of dividing by zero.
    """
    safe = np.clip(counts.astype(np.float32), 1.0, None)
    inv = np.reciprocal(safe)
    return torch.from_numpy(inv / inv.mean())
|
|
|
|
| |
| |
| |
|
|
def triplet_loss(
    logits: Dict[str, torch.Tensor],
    y: Dict[str, torch.Tensor],
    weights: Dict[str, torch.Tensor],
    lambda_cfg: Dict[str, float],
    label_smoothing: float = 0.05,
) -> Dict[str, torch.Tensor]:
    """Label-smoothed cross-entropy per prediction head plus a lambda-weighted
    sum under the key "total".

    `weights` may supply optional per-class CE weights per head (moved to the
    logits' device lazily); heads missing from `lambda_cfg` default to 1.0.
    """
    heads = ("verb_fine", "verb_composite", "noun", "hand")
    out: Dict[str, torch.Tensor] = {}
    for head in heads:
        head_w = weights.get(head)
        if head_w is not None:
            head_w = head_w.to(logits[head].device)
        out[head] = F.cross_entropy(
            logits[head], y[head],
            weight=head_w, label_smoothing=label_smoothing,
        )
    out["total"] = sum(lambda_cfg.get(h, 1.0) * out[h] for h in heads)
    return out
|
|
|
|
| |
| |
| |
|
|
@torch.no_grad()
def evaluate(model, loader, device) -> Dict[str, float]:
    """Run `model` over `loader` and return a dict of test-set metrics.

    Per head (verb_fine, verb_composite, noun, hand): top-1 accuracy, mean
    class recall, and top-5 accuracy for heads with more than 5 classes.
    Also reports composed "action" metrics: verb_fine+noun both correct
    (action_vn_*) and verb_fine+noun+hand all correct (action_*).
    """
    model.eval()
    # Accumulate per-batch logits and labels on CPU so every metric is
    # computed once over the full test set rather than averaged per batch.
    all_logits: Dict[str, list] = {k: [] for k in
                                   ("verb_fine", "verb_composite", "noun", "hand")}
    all_y: Dict[str, list] = {k: [] for k in
                              ("verb_fine", "verb_composite", "noun", "hand")}

    for batch in loader:
        # The collate function may append previous-action labels as a 6th item.
        if len(batch) == 6:
            x, mask, lens, y, meta, prev = batch
        else:
            x, mask, lens, y, meta = batch
            prev = None
        x = {m: t.to(device) for m, t in x.items()}
        mask = mask.to(device)
        kwargs = {}
        # Only models that opted in (use_prev_action) receive the prev labels.
        if prev is not None and getattr(model, "use_prev_action", False):
            kwargs["prev_v_comp"] = prev["verb_composite"].to(device)
            kwargs["prev_noun"] = prev["noun"].to(device)
        logits = model(x, mask, **kwargs)
        for k in all_logits:
            all_logits[k].append(logits[k].cpu())
            all_y[k].append(y[k])

    logits_cat = {k: torch.cat(v, dim=0) for k, v in all_logits.items()}
    y_cat = {k: torch.cat(v, dim=0) for k, v in all_y.items()}

    # Per-head metrics.
    m = {}
    for k, K in [("verb_fine", NUM_VERB_FINE),
                 ("verb_composite", NUM_VERB_COMPOSITE),
                 ("noun", NUM_NOUN),
                 ("hand", NUM_HAND)]:
        preds = logits_cat[k].argmax(dim=1)
        acc1 = float((preds == y_cat[k]).float().mean().item())
        m[f"{k}_top1"] = acc1
        if K > 5:  # top-5 is meaningless for tiny label spaces (e.g. hand)
            acc5 = float(top_k_correct(logits_cat[k], y_cat[k], 5).float().mean().item())
            m[f"{k}_top5"] = acc5
        m[f"{k}_mcr"] = mean_class_recall(logits_cat[k], y_cat[k], K)

    # Composed action metrics, built from the per-head argmax predictions.
    vf_pred = logits_cat["verb_fine"].argmax(dim=1)
    n_pred = logits_cat["noun"].argmax(dim=1)
    h_pred = logits_cat["hand"].argmax(dim=1)

    # Action = verb_fine AND noun both correct (primary model-selection metric).
    vn_correct = (vf_pred == y_cat["verb_fine"]) & (n_pred == y_cat["noun"])
    m["action_vn_top1"] = float(vn_correct.float().mean().item())

    # Top-5 composed: both verb and noun must each appear in their own top-5.
    vf_top5 = top_k_correct(logits_cat["verb_fine"], y_cat["verb_fine"], 5)
    n_top5 = top_k_correct(logits_cat["noun"], y_cat["noun"], 5)
    m["action_vn_top5"] = float((vf_top5 & n_top5).float().mean().item())

    # Strictest ("legacy") action metric: verb + noun + hand all correct.
    vfn_h_correct = vn_correct & (h_pred == y_cat["hand"])
    m["action_top1"] = float(vfn_h_correct.float().mean().item())
    h_top1 = (h_pred == y_cat["hand"])
    m["action_top5"] = float((vf_top5 & n_top5 & h_top1).float().mean().item())
    return m
|
|
|
|
| |
| |
| |
|
|
def apply_modality_dropout(x: Dict[str, torch.Tensor], p: float) -> Dict[str, torch.Tensor]:
    """Per-sample per-modality dropout: zero out each (sample, modality) cell
    independently with probability p, but force-keep at least one modality
    per sample so the model never receives an all-zero input."""
    # Fast paths: dropout disabled, or nothing to drop against.
    if p <= 0.0:
        return x
    names = list(x.keys())
    if len(names) <= 1:
        return x
    ref = next(iter(x.values()))
    batch = ref.shape[0]
    dev = ref.device
    # Bernoulli keep mask per (sample, modality); then force one randomly
    # chosen modality per sample to survive. RNG call order (rand before
    # randint) matches the original implementation.
    keep = torch.rand(batch, len(names), device=dev) >= p
    forced = torch.randint(len(names), (batch,), device=dev)
    keep[torch.arange(batch, device=dev), forced] = True
    dropped: Dict[str, torch.Tensor] = {}
    for col, name in enumerate(names):
        t = x[name]
        gate = keep[:, col].to(t.dtype).view(batch, *([1] * (t.ndim - 1)))
        dropped[name] = t * gate
    return dropped
|
|
|
|
| |
| |
| |
|
|
def main():
    """CLI entry point: parse args, build train/test datasets and model,
    train with early stopping, and write config.json / model_best.pt /
    results.json into --output_dir.

    Model selection (and early stopping) is driven by composed
    verb_fine+noun top-1 accuracy on the test split ("action_vn_top1").
    """
    ap = argparse.ArgumentParser()
    # ---- task / data selection ----
    ap.add_argument("--model", type=str, default="deepconvlstm",
                    choices=["deepconvlstm", "dailyactformer",
                             "rulstm", "futr", "afft",
                             "handformer", "actionllm"])
    ap.add_argument("--modalities", type=str,
                    default="imu,emg,eyetrack,mocap,pressure")
    ap.add_argument("--t_obs", type=float, default=8.0,
                    help="Anticipation mode only: observation window length (s).")
    ap.add_argument("--t_fut", type=float, default=2.0,
                    help="Anticipation mode only: prediction horizon (s).")
    ap.add_argument("--mode", type=str, default="recognition",
                    choices=["recognition", "anticipation"],
                    help="recognition = classify segment from its own [start,end] sensor "
                         "window (default). anticipation = legacy T10 setup, predict from "
                         "[start-t_fut-t_obs, start-t_fut].")
    ap.add_argument("--downsample", type=int, default=5)

    # ---- optimization hyperparameters ----
    ap.add_argument("--epochs", type=int, default=40)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--lr", type=float, default=3e-4)
    ap.add_argument("--weight_decay", type=float, default=1e-4)
    ap.add_argument("--grad_clip", type=float, default=1.0)
    ap.add_argument("--label_smoothing", type=float, default=0.05)
    ap.add_argument("--dropout", type=float, default=0.1,
                    help="Dropout used inside DAF stems / transformer / pool.")
    ap.add_argument("--use_prev_action", action="store_true",
                    help="Condition DAF on previous-segment (verb_composite, noun) "
                         "labels via embedding concat to pooled features. Only DAF "
                         "uses this; baselines ignore it.")
    ap.add_argument("--modality_dropout", type=float, default=0.0,
                    help="Train-time per-sample per-modality dropout prob "
                         "(0.0=off). At least one modality is always kept.")

    # ---- loss shaping ----
    ap.add_argument("--use_class_weights", action="store_true",
                    help="Weight CE by inverse class frequency (better for tail).")
    ap.add_argument("--lambda_verb_fine", type=float, default=1.0)
    ap.add_argument("--lambda_verb_composite", type=float, default=0.5)
    ap.add_argument("--lambda_noun", type=float, default=1.0)
    ap.add_argument("--lambda_hand", type=float, default=0.5)

    # ---- schedule / bookkeeping ----
    ap.add_argument("--patience", type=int, default=12)
    ap.add_argument("--warmup_epochs", type=int, default=0,
                    help="Linear LR warmup over the first N epochs (0=off).")
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--output_dir", type=str, required=True)
    ap.add_argument("--num_workers", type=int, default=0)
    ap.add_argument("--tag", type=str, default="")
    args = ap.parse_args()

    set_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if args.mode == "anticipation":
        print(f"[cfg] model={args.model} modalities={args.modalities} "
              f"mode={args.mode} T_obs={args.t_obs}s T_fut={args.t_fut}s seed={args.seed}")
    else:
        print(f"[cfg] model={args.model} modalities={args.modalities} "
              f"mode={args.mode} (segment-aligned window) seed={args.seed}")
    print(f"[cfg] device={device} epochs={args.epochs} lr={args.lr} "
          f"batch_size={args.batch_size}")

    mods = tuple(args.modalities.split(","))
    train_ds, test_ds = build_train_test(
        modalities=mods, t_obs_sec=args.t_obs, t_fut_sec=args.t_fut,
        downsample=args.downsample, mode=args.mode,
    )
    print(f"[data] train={len(train_ds)} test={len(test_ds)} "
          f"modality_dims={train_ds.modality_dims}")

    # Optional inverse-frequency CE weights, computed on the train split only.
    counts = train_ds.class_counts()
    weights: Dict[str, torch.Tensor] = {}
    if args.use_class_weights:
        for k in ("verb_fine", "verb_composite", "noun", "hand"):
            weights[k] = build_class_weights(counts[k])

    # drop_last on train keeps batch statistics stable; test keeps all samples.
    train_loader = DataLoader(
        train_ds, batch_size=args.batch_size, shuffle=True,
        collate_fn=collate_triplet, num_workers=args.num_workers, drop_last=True,
    )
    test_loader = DataLoader(
        test_ds, batch_size=args.batch_size, shuffle=False,
        collate_fn=collate_triplet, num_workers=args.num_workers,
    )

    # Extra constructor kwargs are only honored by the DailyActFormer family;
    # NOTE(review): "ours"/"daf" aliases checked here are not in the argparse
    # choices above — confirm whether build_model accepts them directly.
    extra_kwargs = {}
    if args.model in ("dailyactformer", "ours", "daf"):
        extra_kwargs["causal"] = (args.mode == "anticipation")
        extra_kwargs["dropout"] = args.dropout
        extra_kwargs["use_prev_action"] = args.use_prev_action
    model = build_model(args.model, train_ds.modality_dims, **extra_kwargs).to(device)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"[model] {args.model} params={n_params:,}")

    opt = torch.optim.AdamW(
        model.parameters(), lr=args.lr, weight_decay=args.weight_decay,
    )
    # Optional linear warmup, then cosine decay down to 5% of the base LR.
    if args.warmup_epochs > 0:
        warmup = torch.optim.lr_scheduler.LinearLR(
            opt, start_factor=1.0 / max(1, args.warmup_epochs), end_factor=1.0,
            total_iters=args.warmup_epochs,
        )
        cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, T_max=max(1, args.epochs - args.warmup_epochs),
            eta_min=args.lr * 0.05,
        )
        sched = torch.optim.lr_scheduler.SequentialLR(
            opt, schedulers=[warmup, cosine], milestones=[args.warmup_epochs],
        )
    else:
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(
            opt, T_max=args.epochs, eta_min=args.lr * 0.05,
        )

    # Per-head loss weights consumed by triplet_loss.
    lambda_cfg = {
        "verb_fine": args.lambda_verb_fine,
        "verb_composite": args.lambda_verb_composite,
        "noun": args.lambda_noun,
        "hand": args.lambda_hand,
    }

    # Resolve output directory (optionally suffixed by --tag) and persist config.
    out_dir = Path(args.output_dir)
    if args.tag:
        out_dir = out_dir.parent / f"{out_dir.name}_{args.tag}"
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "config.json", "w") as f:
        # dict union (|) requires Python 3.9+.
        json.dump(vars(args) | {"n_params": n_params}, f, indent=2)

    # Best-epoch tracking: selection metric is action_vn_top1 on the test set.
    best = {"action_vn_top1": -1.0, "action_top1": -1.0}
    best_epoch = 0
    best_path = out_dir / "model_best.pt"
    patience = 0
    history = []

    for epoch in range(1, args.epochs + 1):
        t0 = time.time()
        model.train()
        losses_epoch = {k: 0.0 for k in
                        ("verb_fine", "verb_composite", "noun", "hand", "total")}
        n_batches = 0
        for batch in train_loader:
            # Collate may append previous-action labels as a 6th element.
            if len(batch) == 6:
                x, mask, lens, y, meta, prev = batch
            else:
                x, mask, lens, y, meta = batch
                prev = None
            x = {m: t.to(device) for m, t in x.items()}
            mask = mask.to(device)
            y = {k: v.to(device) for k, v in y.items()}

            # Train-time robustness augmentation (off by default).
            if args.modality_dropout > 0.0:
                x = apply_modality_dropout(x, args.modality_dropout)

            kwargs = {}
            if prev is not None and getattr(model, "use_prev_action", False):
                kwargs["prev_v_comp"] = prev["verb_composite"].to(device)
                kwargs["prev_noun"] = prev["noun"].to(device)

            opt.zero_grad()
            logits = model(x, mask, **kwargs)
            l = triplet_loss(logits, y, weights, lambda_cfg,
                             label_smoothing=args.label_smoothing)
            l["total"].backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
            opt.step()

            for k in losses_epoch:
                losses_epoch[k] += float(l[k].detach().item())
            n_batches += 1

        # Average accumulated losses over the epoch; scheduler steps per epoch.
        for k in losses_epoch:
            losses_epoch[k] /= max(1, n_batches)
        sched.step()

        metrics = evaluate(model, test_loader, device)
        dur = time.time() - t0

        print(
            f"  E{epoch:3d} loss={losses_epoch['total']:.3f} "
            f"(vf={losses_epoch['verb_fine']:.2f} "
            f"n={losses_epoch['noun']:.2f} "
            f"h={losses_epoch['hand']:.2f}) | "
            f"act_vn@1={metrics['action_vn_top1']:.3f} "
            f"vf@1={metrics['verb_fine_top1']:.3f} "
            f"n@1={metrics['noun_top1']:.3f} "
            f"h@1={metrics['hand_top1']:.3f} | "
            f"{dur:.1f}s",
            flush=True,
        )

        history.append({"epoch": epoch, **losses_epoch, **metrics})
        if metrics["action_vn_top1"] > best["action_vn_top1"]:
            best = dict(metrics)
            best_epoch = epoch
            patience = 0
            # Clone tensors to CPU so the checkpoint loads on any device.
            torch.save(
                {"state_dict": {k: v.cpu().clone()
                                for k, v in model.state_dict().items()},
                 "epoch": epoch,
                 "metrics": metrics},
                best_path,
            )
        else:
            patience += 1
            if patience >= args.patience:
                print(f"  early stop at epoch {epoch} (best epoch {best_epoch})")
                break

    # Persist full run summary (best metrics + per-epoch history).
    results = {
        "best_epoch": best_epoch,
        "best_test_metrics": best,
        "history": history,
        "n_params": n_params,
        "train_size": len(train_ds),
        "test_size": len(test_ds),
        "train_class_counts": {k: v.tolist() for k, v in counts.items()},
        "modality_dims": train_ds.modality_dims,
        "args": vars(args),
    }
    with open(out_dir / "results.json", "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n[done] best action_vn@1 = {best['action_vn_top1']:.4f} "
          f"(legacy action@1 = {best['action_top1']:.4f}, epoch {best_epoch}) "
          f"saved to {out_dir}")
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|