AbstractPhil committed on
Commit 3aac63e · verified · 1 Parent(s): e30e73e

Update trainer.py

Files changed (1)
  1. trainer.py +705 -135
trainer.py CHANGED
@@ -1,26 +1,46 @@
  """
- MobiusNet - CIFAR-100 (Dynamic Stages)
- ======================================
- Properly handles variable stage counts.
-
- Author: AbstractPhil
- https://huggingface.co/AbstractPhil/mobiusnet
- License: Apache 2.0
  """

  import math
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from torch import Tensor
- from typing import Tuple
  from torchvision import datasets, transforms
  from torch.utils.data import DataLoader
  from tqdm.auto import tqdm

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Device: {device}")


  # ============================================================================
  # MÖBIUS LENS
@@ -36,6 +56,9 @@ class MobiusLens(nn.Module):
  ):
  super().__init__()

  self.t = layer_idx / max(total_layers - 1, 1)

  scale_span = scale_range[1] - scale_range[0]
@@ -45,12 +68,10 @@

  self.register_buffer('scales', torch.tensor([scale_low, scale_high]))

- # TWIST IN
  self.twist_in_angle = nn.Parameter(torch.tensor(self.t * math.pi))
  self.twist_in_proj = nn.Linear(dim, dim, bias=False)
  nn.init.orthogonal_(self.twist_in_proj.weight)

- # CENTER LENS
  self.omega = nn.Parameter(torch.tensor(math.pi))
  self.alpha = nn.Parameter(torch.tensor(1.5))
@@ -64,7 +85,8 @@
  self.accum_weights = nn.Parameter(torch.tensor([0.4, 0.2, 0.4]))
  self.xor_weight = nn.Parameter(torch.tensor(0.7))

- # TWIST OUT
  self.twist_out_angle = nn.Parameter(torch.tensor(-self.t * math.pi))
  self.twist_out_proj = nn.Linear(dim, dim, bias=False)
  nn.init.orthogonal_(self.twist_out_proj.weight)
@@ -99,9 +121,9 @@

  gate = w[0] * L + w[1] * M + w[2] * R
  gate = gate * (0.5 + 0.5 * lr)
- gate = gate / (gate.mean() + 1e-6) * 0.5

- return x * gate.clamp(0, 1)

  def _twist_out(self, x: Tensor) -> Tensor:
  cos_t = torch.cos(self.twist_out_angle)
@@ -110,6 +132,19 @@

  def forward(self, x: Tensor) -> Tensor:
  return self._twist_out(self._center_lens(self._twist_in(x)))


  # ============================================================================
@@ -157,25 +192,24 @@ class MobiusConvBlock(nn.Module):

  rw = torch.sigmoid(self.residual_weight)
  return rw * identity + (1 - rw) * h


  # ============================================================================
- # MÖBIUS NET - DYNAMIC STAGES
  # ============================================================================

  class MobiusNet(nn.Module):
- """
- Pure conv with Möbius topology.
- Dynamic number of stages based on len(depths).
- """
-
  def __init__(
  self,
  in_chans: int = 3,
- num_classes: int = 100,
- channels: Tuple[int, ...] = (64, 64, 128, 128),
- depths: Tuple[int, ...] = (8, 4, 2),
  scale_range: Tuple[float, float] = (0.5, 2.5),
  ):
  super().__init__()
@@ -184,22 +218,22 @@ class MobiusNet(nn.Module):

  self.total_layers = total_layers
  self.scale_range = scale_range
- self.channels = channels
- self.depths = depths
  self.num_stages = num_stages

- # Ensure we have enough channel specs
  channels = list(channels)
  while len(channels) < num_stages:
  channels.append(channels[-1])

- # Stem
  self.stem = nn.Sequential(
- nn.Conv2d(in_chans, channels[0], 3, padding=1, bias=False),
  nn.BatchNorm2d(channels[0]),
  )

- # Build stages dynamically
  layer_idx = 0
  self.stages = nn.ModuleList()
  self.downsamples = nn.ModuleList()
@@ -207,16 +241,12 @@
  for stage_idx in range(num_stages):
  ch = channels[stage_idx]

- # Stage blocks
  stage = nn.ModuleList()
  for _ in range(depths[stage_idx]):
- stage.append(MobiusConvBlock(
- ch, layer_idx, total_layers, scale_range
- ))
  layer_idx += 1
  self.stages.append(stage)

- # Downsample between stages (not after last)
  if stage_idx < num_stages - 1:
  ch_next = channels[stage_idx + 1]
  self.downsamples.append(nn.Sequential(
@@ -224,9 +254,18 @@
  nn.BatchNorm2d(ch_next),
  ))

- # Head
  self.pool = nn.AdaptiveAvgPool2d(1)
- self.head = nn.Linear(channels[num_stages - 1], num_classes)

  def forward(self, x: Tensor) -> Tensor:
  x = self.stem(x)
@@ -237,40 +276,104 @@ class MobiusNet(nn.Module):
  if i < len(self.downsamples):
  x = self.downsamples[i](x)

  return self.head(self.pool(x).flatten(1))

- def get_info(self) -> str:
- return (
- f"MobiusNet: channels={self.channels}, depths={self.depths}, "
- f"total_layers={self.total_layers}, scale_range={self.scale_range}"
- )

- def get_topology_info(self) -> str:
- lines = ["Möbius Ribbon Topology:"]
- lines.append("=" * 60)
-
- scale_span = self.scale_range[1] - self.scale_range[0]
  layer_idx = 0
-
- for stage_idx, depth in enumerate(self.depths):
- ch = self.channels[stage_idx] if stage_idx < len(self.channels) else self.channels[-1]
- for local_idx in range(depth):
- t = layer_idx / max(self.total_layers - 1, 1)
- scale_low = self.scale_range[0] + t * scale_span
- scale_high = scale_low + scale_span / self.total_layers
-
- lines.append(
- f"Layer {layer_idx:2d} (Stage {stage_idx+1}, ch={ch:3d}): "
- f"t={t:.3f}, scales=[{scale_low:.3f}, {scale_high:.3f}]"
- )
  layer_idx += 1

- if stage_idx < self.num_stages - 1:
- ch_next = self.channels[stage_idx + 1] if stage_idx + 1 < len(self.channels) else self.channels[-1]
- lines.append(f" ↓ Downsample {ch} → {ch_next}")
-
- lines.append("=" * 60)
- return "\n".join(lines)


  # ============================================================================
@@ -278,112 +381,520 @@ class MobiusNet(nn.Module):
  # ============================================================================

  PRESETS = {
- 'mobius_xs': {
- 'channels': (64, 64, 128),
- 'depths': (4, 2, 2),
  'scale_range': (0.5, 2.5),
  },
- 'mobius_stretched': {
- 'channels': (32, 64, 96, 128, 192, 256, 320, 384, 448),
- 'depths': (4, 4, 4, 3, 3, 3, 2, 2, 2),
- 'scale_range': (0.2915, 2.85),
  },
- 'mobius_m': {
- 'channels': (64, 128, 256, 256),
- 'depths': (8, 4, 2),
- 'scale_range': (0.5, 3.0),
- },
- 'mobius_deep': {
- 'channels': (64, 64, 128, 128),
- 'depths': (12, 6, 4),
  'scale_range': (0.5, 3.5),
  },
- 'mobius_wide': {
- 'channels': (96, 96, 192, 192),
- 'depths': (8, 4, 2),
- 'scale_range': (0.5, 2.5),
  },
 }


  # ============================================================================
  # TRAINING
  # ============================================================================

- def train_mobius_cifar100(
- preset: str = 'mobius_s',
  epochs: int = 100,
  lr: float = 1e-3,
  batch_size: int = 128,
- use_autoaugment: bool = True,
  ):
  config = PRESETS[preset]

  print("=" * 70)
- print(f"MÖBIUS NET - {preset.upper()} - CIFAR-100")
  print("=" * 70)
  print(f"Device: {device}")
  print(f"Channels: {config['channels']}")
  print(f"Depths: {config['depths']}")
  print(f"Scale range: {config['scale_range']}")
- print(f"AutoAugment: {use_autoaugment}")
  print()

- # CIFAR-100 normalization
- mean = (0.5071, 0.4867, 0.4408)
- std = (0.2675, 0.2565, 0.2761)

- train_transforms = [
- transforms.RandomCrop(32, padding=4),
- transforms.RandomHorizontalFlip(),
- ]
- if use_autoaugment:
- train_transforms.append(transforms.AutoAugment(transforms.AutoAugmentPolicy.CIFAR10))
- train_transforms.extend([
- transforms.ToTensor(),
- transforms.Normalize(mean, std),
- ])
-
- train_tf = transforms.Compose(train_transforms)
- test_tf = transforms.Compose([
- transforms.ToTensor(),
- transforms.Normalize(mean, std),
- ])
-
- train_ds = datasets.CIFAR100('./data', train=True, download=True, transform=train_tf)
- test_ds = datasets.CIFAR100('./data', train=False, download=True, transform=test_tf)
-
- train_loader = DataLoader(
- train_ds, batch_size=batch_size, shuffle=True,
- num_workers=8, pin_memory=True, persistent_workers=True
- )
- test_loader = DataLoader(
- test_ds, batch_size=256, num_workers=2, pin_memory=True, persistent_workers=True,
  )

  model = MobiusNet(
  in_chans=3,
- num_classes=100,
  **config
  ).to(device)

- print(model.get_info())
- print()
- print(model.get_topology_info())
- print()
-
- model.compile(mode='reduce-overhead')
-
  total_params = sum(p.numel() for p in model.parameters())
  print(f"Total params: {total_params:,}")
  print()

  optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)
  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

  best_acc = 0.0

- for epoch in range(1, epochs + 1):
  model.train()
  train_loss, train_correct, train_total = 0, 0, 0
@@ -406,30 +917,76 @@ def train_mobius_cifar100(

  scheduler.step()

  model.eval()
  val_correct, val_total = 0, 0
  with torch.no_grad():
- for x, y in test_loader:
  x, y = x.to(device), y.to(device)
  logits = model(x)
  val_correct += (logits.argmax(1) == y).sum().item()
  val_total += x.size(0)

  train_acc = train_correct / train_total
  val_acc = val_correct / val_total
- best_acc = max(best_acc, val_acc)
- marker = " ★" if val_acc >= best_acc else ""

- print(f"Epoch {epoch:3d} | Loss: {train_loss/train_total:.4f} | "
  f"Train: {train_acc:.4f} | Val: {val_acc:.4f} | Best: {best_acc:.4f}{marker}")

  print()
  print("=" * 70)
  print("FINAL RESULTS")
  print("=" * 70)
- print(model.get_info())
  print(f"Best accuracy: {best_acc:.4f}")
  print(f"Total params: {total_params:,}")
  print("=" * 70)

  return model, best_acc
@@ -440,9 +997,22 @@ def train_mobius_cifar100(
  # ============================================================================

  if __name__ == '__main__':
- model, best_acc = train_mobius_cifar100(
- preset='mobius_stretched', # channels=(64, 64, 128, 128), depths=(8, 4, 2)
- epochs=100,
- lr=1e-3,
- use_autoaugment=True,
  )
 
  """
+ MobiusNet Trainer with TensorBoard, SafeTensors, and HuggingFace Upload
+ =======================================================================
  """

+ import os
+ import re
+ import json
  import math
+ import shutil
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  from torch import Tensor
+ from typing import Tuple, Optional, Dict, Any
  from torchvision import datasets, transforms
  from torch.utils.data import DataLoader
+ from torch.utils.tensorboard import SummaryWriter
  from tqdm.auto import tqdm
+ from datetime import datetime
+ from pathlib import Path
+ from safetensors.torch import save_file as save_safetensors, load_file as load_safetensors
+ from huggingface_hub import HfApi, login
+
+ # Colab HF login
+ try:
+ from google.colab import userdata
+ token = userdata.get('HF_TOKEN')
+ os.environ['HF_TOKEN'] = token
+ login(token=token)
+ print("Logged in to HuggingFace via Colab")
+ except Exception:
+ # Not in Colab or token not set
+ pass

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Device: {device}")

+ # Enable TF32 for faster computation on Ampere+ GPUs
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+ torch.set_float32_matmul_precision('high')
+
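Outside Colab, the same `login` call works with a token taken from the environment; a minimal sketch (not part of this commit), assuming an HF_TOKEN variable is exported in the shell:

    # Hypothetical non-Colab fallback: reuse the HF_TOKEN environment variable.
    import os
    from huggingface_hub import login

    token = os.environ.get('HF_TOKEN')
    if token:
        login(token=token)  # same call the Colab branch above uses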
  # ============================================================================
  # MÖBIUS LENS

  ):
  super().__init__()

+ self.dim = dim
+ self.layer_idx = layer_idx
+ self.total_layers = total_layers
  self.t = layer_idx / max(total_layers - 1, 1)

  scale_span = scale_range[1] - scale_range[0]

  self.register_buffer('scales', torch.tensor([scale_low, scale_high]))

  self.twist_in_angle = nn.Parameter(torch.tensor(self.t * math.pi))
  self.twist_in_proj = nn.Linear(dim, dim, bias=False)
  nn.init.orthogonal_(self.twist_in_proj.weight)

  self.omega = nn.Parameter(torch.tensor(math.pi))
  self.alpha = nn.Parameter(torch.tensor(1.5))

  self.accum_weights = nn.Parameter(torch.tensor([0.4, 0.2, 0.4]))
  self.xor_weight = nn.Parameter(torch.tensor(0.7))

+ self.gate_norm = nn.LayerNorm(dim)
+
  self.twist_out_angle = nn.Parameter(torch.tensor(-self.t * math.pi))
  self.twist_out_proj = nn.Linear(dim, dim, bias=False)
  nn.init.orthogonal_(self.twist_out_proj.weight)

  gate = w[0] * L + w[1] * M + w[2] * R
  gate = gate * (0.5 + 0.5 * lr)
+ gate = torch.sigmoid(self.gate_norm(gate))

+ return x * gate

  def _twist_out(self, x: Tensor) -> Tensor:
  cos_t = torch.cos(self.twist_out_angle)

  def forward(self, x: Tensor) -> Tensor:
  return self._twist_out(self._center_lens(self._twist_in(x)))
+
+ def get_lens_stats(self) -> Dict[str, float]:
+ """Return lens parameters for logging."""
+ return {
+ 'omega': self.omega.item(),
+ 'alpha': self.alpha.item(),
+ 'twist_in_angle': self.twist_in_angle.item(),
+ 'twist_out_angle': self.twist_out_angle.item(),
+ 'xor_weight': torch.sigmoid(self.xor_weight).item(),
+ 'accum_weights_l': torch.softmax(self.accum_weights, dim=0)[0].item(),
+ 'accum_weights_m': torch.softmax(self.accum_weights, dim=0)[1].item(),
+ 'accum_weights_r': torch.softmax(self.accum_weights, dim=0)[2].item(),
+ }

  # ============================================================================
 
  rw = torch.sigmoid(self.residual_weight)
  return rw * identity + (1 - rw) * h
+
+ def get_residual_weight(self) -> float:
+ return torch.sigmoid(self.residual_weight).item()


  # ============================================================================
+ # MÖBIUS NET
  # ============================================================================

  class MobiusNet(nn.Module):
  def __init__(
  self,
  in_chans: int = 3,
+ num_classes: int = 200,
+ channels: Tuple[int, ...] = (64, 128, 256, 512),
+ depths: Tuple[int, ...] = (2, 2, 2, 2),
  scale_range: Tuple[float, float] = (0.5, 2.5),
+ use_integrator: bool = True,
  ):
  super().__init__()

  self.total_layers = total_layers
  self.scale_range = scale_range
+ self.channels = tuple(channels)
+ self.depths = tuple(depths)
  self.num_stages = num_stages
+ self.use_integrator = use_integrator
+ self.num_classes = num_classes
+ self.in_chans = in_chans

  channels = list(channels)
  while len(channels) < num_stages:
  channels.append(channels[-1])

  self.stem = nn.Sequential(
+ nn.Conv2d(in_chans, channels[0], 3, stride=1, padding=1, bias=False),
  nn.BatchNorm2d(channels[0]),
  )

  layer_idx = 0
  self.stages = nn.ModuleList()
  self.downsamples = nn.ModuleList()

  for stage_idx in range(num_stages):
  ch = channels[stage_idx]

  stage = nn.ModuleList()
  for _ in range(depths[stage_idx]):
+ stage.append(MobiusConvBlock(ch, layer_idx, total_layers, scale_range))
  layer_idx += 1
  self.stages.append(stage)

  if stage_idx < num_stages - 1:
  ch_next = channels[stage_idx + 1]
  self.downsamples.append(nn.Sequential(
  nn.BatchNorm2d(ch_next),
  ))

+ final_ch = channels[num_stages - 1]
+ if use_integrator:
+ self.integrator = nn.Sequential(
+ nn.Conv2d(final_ch, final_ch, 3, padding=1, bias=False),
+ nn.BatchNorm2d(final_ch),
+ nn.GELU(),
+ )
+ else:
+ self.integrator = nn.Identity()
+
  self.pool = nn.AdaptiveAvgPool2d(1)
+ self.head = nn.Linear(final_ch, num_classes)

  def forward(self, x: Tensor) -> Tensor:
  x = self.stem(x)

  if i < len(self.downsamples):
  x = self.downsamples[i](x)

+ x = self.integrator(x)
  return self.head(self.pool(x).flatten(1))

+ def get_config(self) -> Dict[str, Any]:
+ """Return model configuration for saving."""
+ return {
+ 'in_chans': self.in_chans,
+ 'num_classes': self.num_classes,
+ 'channels': self.channels,
+ 'depths': self.depths,
+ 'scale_range': self.scale_range,
+ 'use_integrator': self.use_integrator,
+ 'total_layers': self.total_layers,
+ 'num_stages': self.num_stages,
+ }

+ def get_all_lens_stats(self) -> Dict[str, Dict[str, float]]:
+ """Return stats from all lenses for logging."""
+ stats = {}
  layer_idx = 0
+ for stage_idx, stage in enumerate(self.stages):
+ for block_idx, block in enumerate(stage):
+ key = f"stage{stage_idx}_block{block_idx}"
+ stats[key] = block.lens.get_lens_stats()
+ stats[key]['residual_weight'] = block.get_residual_weight()
  layer_idx += 1
+ return stats
+
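A quick sanity check for the updated class (a sketch, not part of the diff): with the new defaults, a Tiny-ImageNet-sized batch should come out as 200 logits, and the adaptive pooling keeps the head independent of input resolution:

    import torch

    # Defaults from the diff: channels=(64, 128, 256, 512), depths=(2, 2, 2, 2), num_classes=200
    model = MobiusNet()
    logits = model(torch.randn(2, 3, 64, 64))
    print(logits.shape)  # expected: torch.Size([2, 200])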
+
+ # ============================================================================
+ # TINY IMAGENET DATASET
+ # ============================================================================
+
+ def get_tiny_imagenet_loaders(data_dir='./data/tiny-imagenet-200', batch_size=128):
+ train_dir = os.path.join(data_dir, 'train')
+ val_dir = os.path.join(data_dir, 'val')
+
+ val_images_dir = os.path.join(val_dir, 'images')
+ if os.path.exists(val_images_dir):
+ print("Reorganizing validation folder...")
+ reorganize_val_folder(val_dir)
+
+ train_transform = transforms.Compose([
+ transforms.RandomCrop(64, padding=8),
+ transforms.RandomHorizontalFlip(),
+ transforms.AutoAugment(transforms.AutoAugmentPolicy.IMAGENET),
+ transforms.ToTensor(),
+ transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+ ])
+
+ val_transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+ ])
+
+ train_dataset = datasets.ImageFolder(train_dir, transform=train_transform)
+ val_dataset = datasets.ImageFolder(val_dir, transform=val_transform)
+
+ train_loader = DataLoader(
+ train_dataset, batch_size=batch_size, shuffle=True,
+ num_workers=8, pin_memory=True, persistent_workers=True
+ )
+ val_loader = DataLoader(
+ val_dataset, batch_size=256, shuffle=False,
+ num_workers=4, pin_memory=True, persistent_workers=True
+ )
+
+ return train_loader, val_loader
+
+
+ def reorganize_val_folder(val_dir):
+ """Reorganize Tiny ImageNet val folder into class subfolders."""
+ val_images_dir = os.path.join(val_dir, 'images')
+ val_annotations = os.path.join(val_dir, 'val_annotations.txt')
+
+ if not os.path.exists(val_images_dir):
+ return
+
+ with open(val_annotations, 'r') as f:
+ for line in f:
+ parts = line.strip().split('\t')
+ img_name, class_id = parts[0], parts[1]

+ class_dir = os.path.join(val_dir, class_id)
+ os.makedirs(class_dir, exist_ok=True)
+
+ src = os.path.join(val_images_dir, img_name)
+ dst = os.path.join(class_dir, img_name)
+
+ if os.path.exists(src):
+ shutil.move(src, dst)
+
+ if os.path.exists(val_images_dir):
+ shutil.rmtree(val_images_dir)
+ if os.path.exists(val_annotations):
+ os.remove(val_annotations)
+
+ print("Validation folder reorganized.")

  # ============================================================================
  # PRESETS
  # ============================================================================

  PRESETS = {
+ 'mobius_tiny_s': {
+ 'channels': (64, 128, 256),
+ 'depths': (2, 2, 2),
  'scale_range': (0.5, 2.5),
  },
+ 'mobius_tiny_m': {
+ 'channels': (64, 128, 256, 512, 768),
+ 'depths': (2, 2, 4, 2, 2),
+ 'scale_range': (0.25, 2.75),
  },
+ 'mobius_tiny_l': {
+ 'channels': (96, 192, 384, 768),
+ 'depths': (3, 3, 3, 3),
  'scale_range': (0.5, 3.5),
  },
+ 'mobius_base': {
+ 'channels': (128, 256, 512, 768, 1024),
+ 'depths': (2, 2, 2, 2, 2),
+ 'scale_range': (0.25, 2.75),
  },
 }
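Each preset is just a kwargs dict for the MobiusNet constructor; this is how train_tiny_imagenet below expands it:

    config = PRESETS['mobius_tiny_s']
    model = MobiusNet(in_chans=3, num_classes=200, use_integrator=True, **config)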

+ # ============================================================================
+ # CHECKPOINT MANAGER
+ # ============================================================================
+
+ class CheckpointManager:
+ def __init__(
+ self,
+ base_dir: str,
+ variant_name: str,
+ dataset_name: str,
+ hf_repo: str = "AbstractPhil/mobiusnet",
+ upload_every_n_epochs: int = 10,
+ save_every_n_epochs: int = 10,
+ timestamp: Optional[str] = None,
+ ):
+ self.timestamp = timestamp or datetime.now().strftime("%Y%m%d_%H%M%S")
+ self.variant_name = variant_name
+ self.dataset_name = dataset_name
+ self.hf_repo = hf_repo
+ self.upload_every_n_epochs = upload_every_n_epochs
+ self.save_every_n_epochs = save_every_n_epochs
+
+ # Directory structure
+ self.run_name = f"{variant_name}_{dataset_name}"
+ self.run_dir = Path(base_dir) / "checkpoints" / self.run_name / self.timestamp
+ self.checkpoints_dir = self.run_dir / "checkpoints"
+ self.tensorboard_dir = self.run_dir / "tensorboard"
+
+ # Create directories
+ self.checkpoints_dir.mkdir(parents=True, exist_ok=True)
+ self.tensorboard_dir.mkdir(parents=True, exist_ok=True)
+
+ # TensorBoard writer
+ self.writer = SummaryWriter(log_dir=str(self.tensorboard_dir))
+
+ # HuggingFace API
+ self.hf_api = HfApi()
+ self.uploaded_files = set()
+
+ # Track best
+ self.best_acc = 0.0
+ self.best_epoch = 0
+ self.best_changed_since_upload = False
+
+ print(f"Checkpoint directory: {self.run_dir}")
+
+ @staticmethod
+ def extract_timestamp(checkpoint_path: str) -> Optional[str]:
+ """Extract timestamp from checkpoint path."""
+ # Match YYYYMMDD_HHMMSS pattern
+ match = re.search(r'(\d{8}_\d{6})', checkpoint_path)
+ if match:
+ return match.group(1)
+ return None
+
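extract_timestamp is what lets a resumed run keep writing into its original directory; for example:

    path = 'outputs/checkpoints/mobius_base_tiny_imagenet/20240101_120000/checkpoints/best_model.pt'
    CheckpointManager.extract_timestamp(path)  # -> '20240101_120000'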
+ def save_config(self, config: Dict[str, Any], training_config: Dict[str, Any]):
+ """Save model and training configuration."""
+ full_config = {
+ 'model': config,
+ 'training': training_config,
+ 'timestamp': self.timestamp,
+ 'variant_name': self.variant_name,
+ 'dataset_name': self.dataset_name,
+ }
+
+ config_path = self.run_dir / "config.json"
+ with open(config_path, 'w') as f:
+ json.dump(full_config, f, indent=2)
+
+ return config_path
+
+ def save_checkpoint(
+ self,
+ model: nn.Module,
+ optimizer: torch.optim.Optimizer,
+ scheduler: Any,
+ epoch: int,
+ train_acc: float,
+ val_acc: float,
+ train_loss: float,
+ is_best: bool = False,
+ ):
+ """Save a checkpoint every N epochs; always save the best (overwriting)."""
+
+ # Unwrap compiled model if necessary
+ raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
+
+ # Checkpoint data
+ checkpoint = {
+ 'epoch': epoch,
+ 'train_acc': train_acc,
+ 'val_acc': val_acc,
+ 'train_loss': train_loss,
+ 'best_acc': self.best_acc,
+ 'optimizer_state_dict': optimizer.state_dict(),
+ 'scheduler_state_dict': scheduler.state_dict(),
+ }
+
+ # Save epoch checkpoint every N epochs
+ if epoch % self.save_every_n_epochs == 0:
+ epoch_pt_path = self.checkpoints_dir / f"checkpoint_epoch_{epoch:04d}.pt"
+ torch.save({**checkpoint, 'model_state_dict': raw_model.state_dict()}, epoch_pt_path)
+
+ epoch_st_path = self.checkpoints_dir / f"checkpoint_epoch_{epoch:04d}.safetensors"
+ save_safetensors(raw_model.state_dict(), str(epoch_st_path))
+
+ # Save best model (overwrites previous best)
+ if is_best:
+ self.best_acc = val_acc
+ self.best_epoch = epoch
+ self.best_changed_since_upload = True
+
+ # PyTorch best
+ best_pt_path = self.checkpoints_dir / "best_model.pt"
+ torch.save({**checkpoint, 'model_state_dict': raw_model.state_dict()}, best_pt_path)
+
+ # SafeTensors best
+ best_st_path = self.checkpoints_dir / "best_model.safetensors"
+ save_safetensors(raw_model.state_dict(), str(best_st_path))
+
+ # Save accuracy info
+ acc_path = self.run_dir / "best_accuracy.json"
+ with open(acc_path, 'w') as f:
+ json.dump({
+ 'best_acc': val_acc,
+ 'best_epoch': epoch,
+ 'train_acc': train_acc,
+ 'train_loss': train_loss,
+ }, f, indent=2)
+
+ def save_final(self, model: nn.Module, final_acc: float, final_epoch: int):
+ """Save final model."""
+ raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
+
+ # SafeTensors final
+ final_st_path = self.checkpoints_dir / "final_model.safetensors"
+ save_safetensors(raw_model.state_dict(), str(final_st_path))
+
+ # PyTorch final
+ final_pt_path = self.checkpoints_dir / "final_model.pt"
+ torch.save({
+ 'model_state_dict': raw_model.state_dict(),
+ 'final_acc': final_acc,
+ 'final_epoch': final_epoch,
+ 'best_acc': self.best_acc,
+ 'best_epoch': self.best_epoch,
+ }, final_pt_path)
+
+ # Final accuracy info
+ acc_path = self.run_dir / "final_accuracy.json"
+ with open(acc_path, 'w') as f:
+ json.dump({
+ 'final_acc': final_acc,
+ 'final_epoch': final_epoch,
+ 'best_acc': self.best_acc,
+ 'best_epoch': self.best_epoch,
+ }, f, indent=2)
+
+ return final_st_path, final_pt_path
+
+ def log_scalars(self, epoch: int, scalars: Dict[str, float], prefix: str = ""):
+ """Log scalars to TensorBoard."""
+ for name, value in scalars.items():
+ tag = f"{prefix}/{name}" if prefix else name
+ self.writer.add_scalar(tag, value, epoch)
+
+ def log_lens_stats(self, epoch: int, model: nn.Module):
+ """Log lens statistics to TensorBoard."""
+ raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
+ stats = raw_model.get_all_lens_stats()
+
+ for block_name, block_stats in stats.items():
+ for stat_name, value in block_stats.items():
+ self.writer.add_scalar(f"lens/{block_name}/{stat_name}", value, epoch)
+
+ def log_histograms(self, epoch: int, model: nn.Module):
+ """Log weight histograms to TensorBoard."""
+ raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
+
+ for name, param in raw_model.named_parameters():
+ if param.requires_grad:
+ self.writer.add_histogram(f"weights/{name}", param.data, epoch)
+ if param.grad is not None:
+ self.writer.add_histogram(f"gradients/{name}", param.grad, epoch)
+
+ def upload_to_hf(self, epoch: int, force: bool = False):
+ """Upload checkpoints every N epochs; the best model uploads only on upload epochs, and only if it changed."""
+ if not force and epoch % self.upload_every_n_epochs != 0:
+ return
+
+ try:
+ hf_base_path = f"checkpoints/{self.run_name}/{self.timestamp}"
+
+ files_to_upload = []
+
+ # Always upload config
+ config_path = self.run_dir / "config.json"
+ if config_path.exists():
+ files_to_upload.append(config_path)
+
+ # Upload checkpoint if saved this epoch
+ if epoch % self.save_every_n_epochs == 0:
+ ckpt_st = self.checkpoints_dir / f"checkpoint_epoch_{epoch:04d}.safetensors"
+ ckpt_pt = self.checkpoints_dir / f"checkpoint_epoch_{epoch:04d}.pt"
+ if ckpt_st.exists():
+ files_to_upload.append(ckpt_st)
+ if ckpt_pt.exists():
+ files_to_upload.append(ckpt_pt)
+
+ # Upload best if it changed since last upload
+ if self.best_changed_since_upload:
+ best_files = [
+ self.checkpoints_dir / "best_model.safetensors",
+ self.checkpoints_dir / "best_model.pt",
+ self.run_dir / "best_accuracy.json",
+ ]
+ for f in best_files:
+ if f.exists():
+ files_to_upload.append(f)
+ self.best_changed_since_upload = False
+
+ # Upload files
+ for local_path in files_to_upload:
+ rel_path = local_path.relative_to(self.run_dir)
+ hf_path = f"{hf_base_path}/{rel_path}"
+
+ try:
+ self.hf_api.upload_file(
+ path_or_fileobj=str(local_path),
+ path_in_repo=hf_path,
+ repo_id=self.hf_repo,
+ repo_type="model",
+ )
+ print(f"Uploaded: {hf_path}")
+ except Exception as e:
+ print(f"Failed to upload {rel_path}: {e}")
+
+ except Exception as e:
+ print(f"HuggingFace upload error: {e}")
+
+ def close(self):
+ """Close TensorBoard writer."""
+ self.writer.close()
+
+ @staticmethod
+ def load_checkpoint(
+ checkpoint_path: str,
+ model: nn.Module,
+ optimizer: Optional[torch.optim.Optimizer] = None,
+ scheduler: Optional[Any] = None,
+ hf_repo: str = "AbstractPhil/mobiusnet",
+ device: torch.device = torch.device('cpu'),
+ ) -> Dict[str, Any]:
+ """
+ Load checkpoint from local path or HuggingFace repo.
+
+ Args:
+ checkpoint_path: Either:
+ - Local file path to .pt checkpoint
+ - Local directory containing checkpoints
+ - HuggingFace path like "checkpoints/variant_dataset/timestamp"
+ model: Model to load weights into
+ optimizer: Optional optimizer to restore state
+ scheduler: Optional scheduler to restore state
+ hf_repo: HuggingFace repo ID
+ device: Device to load tensors to
+
+ Returns:
+ Dict with checkpoint info (epoch, best_acc, etc.)
+ """
+ from huggingface_hub import hf_hub_download, list_repo_files
+
+ checkpoint_file = None
+
+ # Check if it's a local file
+ if os.path.isfile(checkpoint_path):
+ checkpoint_file = checkpoint_path
+
+ # Check if it's a local directory
+ elif os.path.isdir(checkpoint_path):
+ # Look for best_model.pt or latest checkpoint
+ best_path = os.path.join(checkpoint_path, "checkpoints", "best_model.pt")
+ if os.path.exists(best_path):
+ checkpoint_file = best_path
+ else:
+ # Find latest epoch checkpoint
+ ckpt_dir = os.path.join(checkpoint_path, "checkpoints")
+ if os.path.isdir(ckpt_dir):
+ pt_files = sorted([f for f in os.listdir(ckpt_dir) if f.startswith("checkpoint_epoch_") and f.endswith(".pt")])
+ if pt_files:
+ checkpoint_file = os.path.join(ckpt_dir, pt_files[-1])
+
+ # Try HuggingFace download
+ if checkpoint_file is None:
+ print(f"Attempting to download from HuggingFace: {hf_repo}/{checkpoint_path}")
+ try:
+ # If checkpoint_path is a directory path in the repo
+ if not checkpoint_path.endswith(".pt"):
+ # Try to download best_model.pt
+ try:
+ checkpoint_file = hf_hub_download(
+ repo_id=hf_repo,
+ filename=f"{checkpoint_path}/checkpoints/best_model.pt",
+ repo_type="model",
+ )
+ print(f"Downloaded best_model.pt from {hf_repo}")
+ except Exception:
+ # List files and find latest checkpoint
+ files = list_repo_files(repo_id=hf_repo, repo_type="model")
+ ckpt_files = sorted([f for f in files if checkpoint_path in f and f.endswith(".pt") and "checkpoint_epoch_" in f])
+ if ckpt_files:
+ checkpoint_file = hf_hub_download(
+ repo_id=hf_repo,
+ filename=ckpt_files[-1],
+ repo_type="model",
+ )
+ print(f"Downloaded {ckpt_files[-1]} from {hf_repo}")
+ else:
+ # Direct file path
+ checkpoint_file = hf_hub_download(
+ repo_id=hf_repo,
+ filename=checkpoint_path,
+ repo_type="model",
+ )
+ print(f"Downloaded {checkpoint_path} from {hf_repo}")
+ except Exception as e:
+ raise FileNotFoundError(f"Could not find or download checkpoint: {checkpoint_path}. Error: {e}")
+
+ if checkpoint_file is None:
+ raise FileNotFoundError(f"Could not find checkpoint: {checkpoint_path}")
+
+ print(f"Loading checkpoint from: {checkpoint_file}")
+ checkpoint = torch.load(checkpoint_file, map_location=device, weights_only=False)
+
+ # Load model weights
+ raw_model = model._orig_mod if hasattr(model, '_orig_mod') else model
+ raw_model.load_state_dict(checkpoint['model_state_dict'])
+ print("Loaded model weights")
+
+ # Load optimizer state
+ if optimizer is not None and 'optimizer_state_dict' in checkpoint:
+ optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+ print("Loaded optimizer state")
+
+ # Load scheduler state
+ if scheduler is not None and 'scheduler_state_dict' in checkpoint:
+ scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+ print("Loaded scheduler state")
+
+ info = {
+ 'epoch': checkpoint.get('epoch', 0),
+ 'best_acc': checkpoint.get('best_acc', 0.0),
+ 'train_acc': checkpoint.get('train_acc', 0.0),
+ 'val_acc': checkpoint.get('val_acc', 0.0),
+ 'train_loss': checkpoint.get('train_loss', 0.0),
+ }
+
+ print(f"Resuming from epoch {info['epoch']} (best_acc: {info['best_acc']:.4f})")
+
+ return info
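A resume sketch combining the pieces above (paths are illustrative; the bare HF-style path triggers the download branch):

    model = MobiusNet(num_classes=200, **PRESETS['mobius_base']).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.05)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    info = CheckpointManager.load_checkpoint(
        'checkpoints/mobius_base_tiny_imagenet/20240101_120000',
        model, optimizer, scheduler, device=device,
    )
    start_epoch = info['epoch'] + 1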

  # ============================================================================
  # TRAINING
  # ============================================================================

+ def train_tiny_imagenet(
+ preset: str = 'mobius_tiny_m',
  epochs: int = 100,
  lr: float = 1e-3,
  batch_size: int = 128,
+ use_integrator: bool = True,
+ data_dir: str = './data/tiny-imagenet-200',
+ output_dir: str = './outputs',
+ hf_repo: str = "AbstractPhil/mobiusnet",
+ save_every_n_epochs: int = 10,
+ upload_every_n_epochs: int = 10,
+ log_histograms_every: int = 10,
+ use_compile: bool = True,
+ continue_from: Optional[str] = None,
  ):
+ """
+ Train MobiusNet on Tiny ImageNet.
+
+ Args:
+ preset: Model preset name
+ epochs: Total epochs to train
+ lr: Learning rate
+ batch_size: Batch size
+ use_integrator: Whether to use the integrator layer
+ data_dir: Path to Tiny ImageNet data
+ output_dir: Output directory for checkpoints
+ hf_repo: HuggingFace repo for uploads/downloads
+ save_every_n_epochs: Save a checkpoint every N epochs
+ upload_every_n_epochs: Upload to HF every N epochs
+ log_histograms_every: Log weight histograms every N epochs
+ use_compile: Whether to use torch.compile
+ continue_from: Resume from checkpoint. Can be:
+ - Local .pt file path
+ - Local checkpoint directory
+ - HuggingFace path (e.g., "checkpoints/mobius_base_tiny_imagenet/20240101_120000")
+ """
  config = PRESETS[preset]
+ dataset_name = "tiny_imagenet"

  print("=" * 70)
+ print(f"MÖBIUS NET - {preset.upper()} - TINY IMAGENET")
  print("=" * 70)
  print(f"Device: {device}")
  print(f"Channels: {config['channels']}")
  print(f"Depths: {config['depths']}")
  print(f"Scale range: {config['scale_range']}")
+ print(f"Integrator: {use_integrator}")
+ if continue_from:
+ print(f"Continuing from: {continue_from}")
  print()

+ # Extract timestamp from checkpoint path if continuing
+ resume_timestamp = None
+ if continue_from:
+ resume_timestamp = CheckpointManager.extract_timestamp(continue_from)
+ if resume_timestamp:
+ print(f"Using original timestamp: {resume_timestamp}")

+ # Initialize checkpoint manager
+ ckpt_manager = CheckpointManager(
+ base_dir=output_dir,
+ variant_name=preset,
+ dataset_name=dataset_name,
+ hf_repo=hf_repo,
+ upload_every_n_epochs=upload_every_n_epochs,
+ save_every_n_epochs=save_every_n_epochs,
+ timestamp=resume_timestamp,
  )

+ # Data
+ train_loader, val_loader = get_tiny_imagenet_loaders(data_dir, batch_size)
+
+ # Model
  model = MobiusNet(
  in_chans=3,
+ num_classes=200,
+ use_integrator=use_integrator,
  **config
  ).to(device)

  total_params = sum(p.numel() for p in model.parameters())
  print(f"Total params: {total_params:,}")
  print()

+ # Save config
+ training_config = {
+ 'epochs': epochs,
+ 'lr': lr,
+ 'batch_size': batch_size,
+ 'optimizer': 'AdamW',
+ 'weight_decay': 0.05,
+ 'scheduler': 'CosineAnnealingLR',
+ 'total_params': total_params,
+ }
+ ckpt_manager.save_config(model.get_config(), training_config)
+
+ # Compile model
+ if use_compile:
+ model = torch.compile(model, mode='reduce-overhead')
+
+ # Optimizer and scheduler
  optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)
  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

+ # Load checkpoint if continuing
+ start_epoch = 1
  best_acc = 0.0

+ if continue_from:
+ ckpt_info = CheckpointManager.load_checkpoint(
+ checkpoint_path=continue_from,
+ model=model,
+ optimizer=optimizer,
+ scheduler=scheduler,
+ hf_repo=hf_repo,
+ device=device,
+ )
+ start_epoch = ckpt_info['epoch'] + 1
+ best_acc = ckpt_info['best_acc']
+ ckpt_manager.best_acc = best_acc
+ ckpt_manager.best_epoch = ckpt_info['epoch']
+ print(f"Resuming training from epoch {start_epoch}")
+
+ for epoch in range(start_epoch, epochs + 1):
+ # Training
  model.train()
  train_loss, train_correct, train_total = 0, 0, 0

  scheduler.step()

+ # Validation
  model.eval()
  val_correct, val_total = 0, 0
  with torch.no_grad():
+ for x, y in val_loader:
  x, y = x.to(device), y.to(device)
  logits = model(x)
  val_correct += (logits.argmax(1) == y).sum().item()
  val_total += x.size(0)

+ # Metrics
  train_acc = train_correct / train_total
  val_acc = val_correct / val_total
+ avg_loss = train_loss / train_total
+ current_lr = scheduler.get_last_lr()[0]

+ is_best = val_acc > best_acc
+ if is_best:
+ best_acc = val_acc
+
+ marker = " ★" if is_best else ""
+ print(f"Epoch {epoch:3d} | Loss: {avg_loss:.4f} | "
  f"Train: {train_acc:.4f} | Val: {val_acc:.4f} | Best: {best_acc:.4f}{marker}")
+
+ # TensorBoard logging
+ ckpt_manager.log_scalars(epoch, {
+ 'loss': avg_loss,
+ 'train_acc': train_acc,
+ 'val_acc': val_acc,
+ 'best_acc': best_acc,
+ 'learning_rate': current_lr,
+ }, prefix="train")
+
+ # Log lens stats
+ ckpt_manager.log_lens_stats(epoch, model)
+
+ # Log histograms periodically
+ if epoch % log_histograms_every == 0:
+ ckpt_manager.log_histograms(epoch, model)
+
+ # Save checkpoint
+ ckpt_manager.save_checkpoint(
+ model=model,
+ optimizer=optimizer,
+ scheduler=scheduler,
+ epoch=epoch,
+ train_acc=train_acc,
+ val_acc=val_acc,
+ train_loss=avg_loss,
+ is_best=is_best,
+ )
+
+ # Upload to HuggingFace (handles both checkpoint and best)
+ ckpt_manager.upload_to_hf(epoch)
+
+ # Save final model
+ ckpt_manager.save_final(model, val_acc, epochs)
+
+ # Final upload
+ ckpt_manager.upload_to_hf(epochs, force=True)
+ ckpt_manager.close()

  print()
  print("=" * 70)
  print("FINAL RESULTS")
  print("=" * 70)
+ print(f"Preset: {preset}")
  print(f"Best accuracy: {best_acc:.4f}")
  print(f"Total params: {total_params:,}")
+ print(f"Checkpoints: {ckpt_manager.run_dir}")
  print("=" * 70)

  return model, best_acc
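For inference after a run, the safetensors artifact loads back through the load_safetensors alias imported at the top (a sketch; <timestamp> stands for the run's actual directory name):

    state = load_safetensors('./outputs/checkpoints/mobius_base_tiny_imagenet/<timestamp>/checkpoints/best_model.safetensors')
    model = MobiusNet(num_classes=200, **PRESETS['mobius_base'])
    model.load_state_dict(state)
    model.eval()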
 
  # ============================================================================

  if __name__ == '__main__':
+ model, best_acc = train_tiny_imagenet(
+ preset='mobius_base',
+ epochs=200,
+ lr=3e-4,
+ batch_size=128,
+ use_integrator=True,
+ data_dir='./data/tiny-imagenet-200',
+ output_dir='./outputs',
+ hf_repo='AbstractPhil/mobiusnet',
+ save_every_n_epochs=10,
+ upload_every_n_epochs=10,
+ log_histograms_every=10,
+ use_compile=True,
+ continue_from='/content/outputs/checkpoints/mobius_base_tiny_imagenet/20260110_132436/checkpoints/best_model.pt', # Set to path or HF checkpoint to resume
+ # Examples:
+ # continue_from="./outputs/checkpoints/mobius_base_tiny_imagenet/20240101_120000"
+ # continue_from="./outputs/checkpoints/mobius_base_tiny_imagenet/20240101_120000/checkpoints/best_model.pt"
+ # continue_from="checkpoints/mobius_base_tiny_imagenet/20240101_120000" # downloads from HF
  )