theapemachine committed on
Commit
bc1b8eb
·
1 Parent(s): 514b330

Add sparse transformer v19 with Triton-backed KNN scheduler and various backward modes. Includes utilities for synthetic data generation and model training. Implements chunked sparse updates and integrates with existing sparse linear layers.

Browse files
Files changed (34) hide show
  1. e2e_bench.py → experiments/e2e_bench.py +0 -0
  2. e2e_full.py → experiments/e2e_full.py +0 -0
  3. experiments/pyproject.toml +19 -0
  4. experiments/sparse_linear_v10_metal/README.md +62 -0
  5. experiments/sparse_linear_v10_metal/setup.py +45 -0
  6. experiments/sparse_linear_v10_metal/sparse_linear.metal +65 -0
  7. experiments/sparse_linear_v10_metal/sparse_linear_ops.mm +158 -0
  8. experiments/sparse_linear_v10_metal/sparse_transformer_v10.py +1112 -0
  9. experiments/sparse_linear_v11_gather_vs_metal/README.md +62 -0
  10. experiments/sparse_linear_v11_gather_vs_metal/input.txt +0 -0
  11. experiments/sparse_linear_v11_gather_vs_metal/setup.py +15 -0
  12. experiments/sparse_linear_v11_gather_vs_metal/sparse_linear.metal +62 -0
  13. experiments/sparse_linear_v11_gather_vs_metal/sparse_linear_ops.mm +168 -0
  14. experiments/sparse_linear_v11_gather_vs_metal/sparse_transformer_v11.py +406 -0
  15. experiments/sparse_linear_v11_gather_vs_metal/sparse_transformer_v13.py +419 -0
  16. experiments/sparse_linear_v11_gather_vs_metal/tiny.py +441 -0
  17. experiments/sparse_transformer_v15_inactive_prediction.py +729 -0
  18. experiments/sparse_transformer_v16_sensor_scheduler.py +677 -0
  19. experiments/sparse_transformer_v17_radar_scheduler.py +725 -0
  20. experiments/sparse_transformer_v6.py +596 -0
  21. experiments/sparse_transformer_v7.py +780 -0
  22. experiments/sparse_transformer_v8.py +943 -0
  23. experiments/sparse_transformer_v9.py +1042 -0
  24. experiments/surprise_topk_gradient_prototype-v2.py +418 -0
  25. experiments/surprise_topk_gradient_prototype-v3.py +487 -0
  26. experiments/surprise_topk_gradient_prototype-v4.py +571 -0
  27. experiments/surprise_topk_gradient_prototype-v5.py +563 -0
  28. experiments/surprise_topk_gradient_prototype.py +426 -0
  29. triton_sparse.py → experiments/triton_sparse.py +0 -0
  30. triton_v2.py → experiments/triton_v2.py +0 -0
  31. experiments/uv.lock +0 -0
  32. paper/main.tex +307 -0
  33. sparse_transformer_v18_fast_knn.py +459 -0
  34. sparse_transformer_v18_fast_knn_triton.py +1044 -0
e2e_bench.py → experiments/e2e_bench.py RENAMED
File without changes
e2e_full.py → experiments/e2e_full.py RENAMED
File without changes
experiments/pyproject.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Packaging metadata for the surprise Top-K gradient experiment prototype.
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[project]
name = "surprise-topk-gradient"
version = "0.1.0"
description = "Prototype for surprise Top-K gradient training experiments"
requires-python = ">=3.10"
dependencies = [
    "numpy>=1.24",
    "torch>=2.0",
]

[project.scripts]
surprise-topk-gradient = "surprise_topk_gradient_prototype:main"

[tool.setuptools]
py-modules = ["surprise_topk_gradient_prototype"]
experiments/sparse_linear_v10_metal/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sparse Transformer v10: Metal-backed active-row Linear backward benchmark
2
+
3
+ This bundle is the first empirical optimization test rather than only a correctness prototype.
4
+ It compares dense Transformer training against sparse active-row backward variants and reports
5
+ validation loss plus wall-clock metrics (`step_ms`, `tokens_per_s`).
6
+
7
+ ## Files
8
+
9
+ - `sparse_transformer_v10.py` — training + benchmark harness.
10
+ - `sparse_linear.metal` — Metal kernels for active-row `dW/db` and sparse `dX`.
11
+ - `sparse_linear_ops.mm` — PyTorch/MPS extension glue.
12
+ - `setup.py` — builds the extension and compiles the `.metallib`.
13
+
14
+ ## Install the Metal extension
15
+
16
+ From this directory on macOS with PyTorch MPS available:
17
+
18
+ ```bash
19
+ python3 -m pip install -e .
20
+ ```
21
+
22
+ This uses `xcrun metal` and `xcrun metallib`, so Xcode command-line tools are required.
23
+
24
+ ## Correctness/sanity run with PyTorch fallback
25
+
26
+ ```bash
27
+ python3 sparse_transformer_v10.py \
28
+ --device mps \
29
+ --steps 2000 \
30
+ --active_fractions 0.05 0.02 \
31
+ --warmup_steps_list 5 \
32
+ --policies predicted_magnitude random \
33
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
34
+ --audit_every 0 \
35
+ --kernel_backend torch \
36
+ --benchmark_sync
37
+ ```
38
+
39
+ ## Metal benchmark run
40
+
41
+ ```bash
42
+ python3 sparse_transformer_v10.py \
43
+ --device mps \
44
+ --steps 2000 \
45
+ --active_fractions 0.05 0.02 \
46
+ --warmup_steps_list 5 \
47
+ --policies predicted_magnitude random \
48
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
49
+ --audit_every 0 \
50
+ --kernel_backend metal \
51
+ --benchmark_sync
52
+ ```
53
+
54
+ ## Empirical pass/fail criteria
55
+
56
+ A genuine optimization would show:
57
+
58
+ 1. `predicted_magnitude` validation loss close to the PyTorch fallback v9/v10 result.
59
+ 2. `predicted_magnitude` much better than `random` at the same active fraction.
60
+ 3. `--kernel_backend metal` has lower `step_ms` or higher `tokens_per_s` than `--kernel_backend torch` and ideally dense baseline.
61
+
62
+ The first Metal kernels are intentionally simple and fp32-only. They are meant to prove or disprove the acceleration path before investing in tiled half-precision kernels.
experiments/sparse_linear_v10_metal/setup.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build script for the sparse_linear_metal extension.

Compiles the Metal shaders to a .metallib via xcrun, then builds the
Objective-C++ PyTorch extension that loads that library at runtime.
"""
import shutil
import subprocess
from pathlib import Path

import torch
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension

ROOT = Path(__file__).parent.resolve()
# PyTorch dylibs (libc10, libtorch, ...) are @rpath-linked; embed torch/lib so import works from any cwd.
_TORCH_LIB = Path(torch.__file__).resolve().parent / "lib"
METAL_SRC = ROOT / "sparse_linear.metal"
AIR = ROOT / "sparse_linear.air"
METALLIB = ROOT / "sparse_linear_ops.metallib"


class MetalBuildExt(BuildExtension):
    """BuildExtension that compiles the Metal kernels before the C++ sources."""

    def run(self):
        if shutil.which("xcrun") is None:
            raise RuntimeError("xcrun not found. Install Xcode command line tools.")
        # .metal -> .air -> .metallib (requires Xcode command-line tools).
        subprocess.check_call(["xcrun", "-sdk", "macosx", "metal", "-c", str(METAL_SRC), "-o", str(AIR)])
        subprocess.check_call(["xcrun", "-sdk", "macosx", "metallib", str(AIR), "-o", str(METALLIB)])
        super().run()
        # Ship the metallib next to the built extension .so; the extension
        # locates it relative to its own path at import time.
        build_lib = Path(self.build_lib)
        for so in build_lib.rglob("sparse_linear_metal*.so"):
            shutil.copy2(METALLIB, so.parent / METALLIB.name)


setup(
    name="sparse_linear_metal",
    version="0.1.0",
    ext_modules=[
        CppExtension(
            name="sparse_linear_metal",
            sources=["sparse_linear_ops.mm"],
            extra_compile_args={"cxx": ["-std=c++17", "-ObjC++", "-fobjc-arc"]},
            extra_link_args=[
                "-framework",
                "Metal",
                "-framework",
                "Foundation",
                f"-Wl,-rpath,{_TORCH_LIB}",
            ],
        )
    ],
    cmdclass={"build_ext": MetalBuildExt},
)
experiments/sparse_linear_v10_metal/sparse_linear.metal ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <metal_stdlib>
using namespace metal;

// Launch parameters; must mirror the host-side struct in sparse_linear_ops.mm
// byte-for-byte.
struct SparseLinearParams {
    uint32_t N;      // flattened batch rows (batch * seq)
    uint32_t In;     // in_features
    uint32_t Out;    // out_features
    uint32_t dummy;  // alignment padding
};

// dW/db restricted to active output rows. grad_w and grad_b must be zeroed by
// the caller; inactive rows are simply never written.
kernel void sparse_linear_grad_w_float(
    device const float* x [[buffer(0)]],           // [N, In]
    device const float* gy [[buffer(1)]],          // [N, Out]
    device const bool* active_mask [[buffer(2)]],  // [Out] boolean mask
    device float* grad_w [[buffer(3)]],            // [Out, In], zeroed by caller
    device float* grad_b [[buffer(4)]],            // [Out], zeroed by caller
    constant SparseLinearParams& p [[buffer(5)]],
    uint2 tid [[thread_position_in_grid]])
{
    const uint col = tid.x;
    const uint row = tid.y;

    // Bounds check: the grid may be padded past In/Out.
    if (row >= p.Out || col >= p.In) return;

    // The magic: threads on inactive rows exit instantly, so no CPU-side
    // sync or index compaction is required.
    if (!active_mask[row]) return;

    float acc = 0.0f;
    for (uint n = 0; n < p.N; ++n) {
        acc += gy[n * p.Out + row] * x[n * p.In + col];
    }
    grad_w[row * p.In + col] = acc;

    // One thread per row (col == 0) also reduces the bias gradient.
    // (Could be optimized further, but fine for now.)
    if (col == 0) {
        float bias_acc = 0.0f;
        for (uint n = 0; n < p.N; ++n) {
            bias_acc += gy[n * p.Out + row];
        }
        grad_b[row] = bias_acc;
    }
}

// Sparse dX: grad_x = gy[:, active] @ W[active, :], skipping inactive rows.
kernel void sparse_linear_grad_x_float(
    device const float* gy [[buffer(0)]],          // [N, Out]
    device const float* weight [[buffer(1)]],      // [Out, In]
    device const bool* active_mask [[buffer(2)]],  // [Out] boolean mask
    device float* grad_x [[buffer(3)]],            // [N, In], zeroed by caller
    constant SparseLinearParams& p [[buffer(4)]],
    uint2 tid [[thread_position_in_grid]])
{
    const uint col = tid.x;
    const uint n = tid.y;
    if (n >= p.N || col >= p.In) return;

    float acc = 0.0f;
    for (uint row = 0; row < p.Out; ++row) {
        if (active_mask[row]) {
            acc += gy[n * p.Out + row] * weight[row * p.In + col];
        }
    }
    grad_x[n * p.In + col] = acc;
}
experiments/sparse_linear_v10_metal/sparse_linear_ops.mm ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/extension.h>
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/mps/MPSStream.h>
4
+ #include <ATen/mps/MPSDevice.h>
5
+ #include <c10/util/Exception.h>
6
+ #include <filesystem>
7
+ #include <dlfcn.h>
8
+ #import <Metal/Metal.h>
9
+ #import <Foundation/Foundation.h>
10
+ #include <mutex>
11
+
12
+ namespace fs = std::filesystem;
13
+
14
+ namespace {
15
+ struct SparseLinearParams {
16
+ uint32_t N;
17
+ uint32_t In;
18
+ uint32_t Out;
19
+ uint32_t dummy;
20
+ };
21
+
22
+ static id<MTLLibrary> g_lib = nil;
23
+ static id<MTLComputePipelineState> g_pipeline_grad_w = nil;
24
+ static id<MTLComputePipelineState> g_pipeline_grad_x = nil;
25
+ static std::mutex g_mutex;
26
+
27
+ static std::string metallib_path_for_this_module() {
28
+ Dl_info info;
29
+ if (dladdr((void*)&metallib_path_for_this_module, &info) == 0 || info.dli_fname == nullptr) return std::string();
30
+ fs::path so_path(info.dli_fname);
31
+ return (so_path.parent_path() / "sparse_linear_ops.metallib").string();
32
+ }
33
+
34
+ static void ensure_library_locked(id<MTLDevice> device) {
35
+ if (g_lib != nil) return;
36
+ std::string path = metallib_path_for_this_module();
37
+ TORCH_CHECK(!path.empty(), "sparse_linear_ops: failed to locate extension path via dladdr");
38
+ NSString* ns_path = [NSString stringWithUTF8String:path.c_str()];
39
+ NSURL* url = [NSURL fileURLWithPath:ns_path];
40
+ NSError* err = nil;
41
+ g_lib = [device newLibraryWithURL:url error:&err];
42
+ if (g_lib == nil) {
43
+ const char* msg = err ? [[err localizedDescription] UTF8String] : "unknown error";
44
+ TORCH_CHECK(false, "sparse_linear_ops: failed to load metallib at ", path, ": ", msg);
45
+ }
46
+ }
47
+
48
+ static id<MTLComputePipelineState> ensure_pipeline(id<MTLDevice> device, id<MTLComputePipelineState>* pipeline, const char* fn_name) {
49
+ std::lock_guard<std::mutex> lock(g_mutex);
50
+ ensure_library_locked(device);
51
+ if (*pipeline != nil) return *pipeline;
52
+ NSString* ns_fn = [NSString stringWithUTF8String:fn_name];
53
+ id<MTLFunction> fn =[g_lib newFunctionWithName:ns_fn];
54
+ TORCH_CHECK(fn != nil, "sparse_linear_ops: function `", fn_name, "` not found in metallib");
55
+ NSError* err = nil;
56
+ *pipeline = [device newComputePipelineStateWithFunction:fn error:&err];
57
+ if (*pipeline == nil) {
58
+ const char* msg = err ? [[err localizedDescription] UTF8String] : "unknown error";
59
+ TORCH_CHECK(false, "sparse_linear_ops: failed to create pipeline for ", fn_name, ": ", msg);
60
+ }
61
+ return *pipeline;
62
+ }
63
+
64
+ static inline id<MTLBuffer> storage_as_mtlbuffer(const at::Tensor& t) {
65
+ void* ctx = t.storage().data_ptr().get_context();
66
+ TORCH_CHECK(ctx != nullptr, "sparse_linear_ops: expected MPS tensor storage with MTLBuffer context");
67
+ return (__bridge id<MTLBuffer>)ctx;
68
+ }
69
+
70
+ static inline NSUInteger storage_offset_bytes(const at::Tensor& t) {
71
+ return (NSUInteger)(t.storage_offset() * (int64_t)t.element_size());
72
+ }
73
+
74
+ static void check_mps_float_contig(const at::Tensor& t, const char* name) {
75
+ TORCH_CHECK(t.device().is_mps(), name, " must be on MPS");
76
+ TORCH_CHECK(t.dtype() == at::kFloat, name, " must be float32 for v12 kernel");
77
+ TORCH_CHECK(t.is_contiguous(), name, " must be contiguous");
78
+ }
79
+ } // namespace
80
+
81
+ std::vector<at::Tensor> sparse_linear_grad_wb(at::Tensor x2d, at::Tensor gy2d, at::Tensor active_mask) {
82
+ check_mps_float_contig(x2d, "x2d");
83
+ check_mps_float_contig(gy2d, "gy2d");
84
+ TORCH_CHECK(active_mask.device().is_mps(), "active_mask must be on MPS");
85
+ TORCH_CHECK(active_mask.dtype() == at::kBool, "active_mask must be bool");
86
+ TORCH_CHECK(active_mask.is_contiguous(), "active_mask must be contiguous");
87
+
88
+ int64_t N = x2d.size(0);
89
+ int64_t In = x2d.size(1);
90
+ int64_t Out = active_mask.size(0);
91
+ TORCH_CHECK(gy2d.size(1) == Out, "gy2d width must equal active_mask size");
92
+
93
+ auto grad_w = at::zeros({Out, In}, x2d.options());
94
+ auto grad_b = at::zeros({Out}, x2d.options());
95
+
96
+ id<MTLDevice> device = (id<MTLDevice>)at::mps::MPSDevice::getInstance()->device();
97
+ id<MTLComputePipelineState> pipeline = ensure_pipeline(device, &g_pipeline_grad_w, "sparse_linear_grad_w_float");
98
+ at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
99
+
100
+ id<MTLComputeCommandEncoder> encoder = (id<MTLComputeCommandEncoder>)stream->commandEncoder();[encoder setComputePipelineState:pipeline];
101
+
102
+ auto set_tensor = [&](const at::Tensor& t, int idx) {[encoder setBuffer:storage_as_mtlbuffer(t) offset:storage_offset_bytes(t) atIndex:(NSUInteger)idx];
103
+ };
104
+ set_tensor(x2d, 0);
105
+ set_tensor(gy2d, 1);
106
+ set_tensor(active_mask, 2);
107
+ set_tensor(grad_w, 3);
108
+ set_tensor(grad_b, 4);
109
+
110
+ SparseLinearParams prm{(uint32_t)N, (uint32_t)In, (uint32_t)Out, 0};
111
+ [encoder setBytes:&prm length:sizeof(SparseLinearParams) atIndex:5];
112
+
113
+ MTLSize tg = MTLSizeMake(16, 16, 1);
114
+ MTLSize grid = MTLSizeMake((NSUInteger)((In + 15) / 16), (NSUInteger)((Out + 15) / 16), 1);[encoder dispatchThreadgroups:grid threadsPerThreadgroup:tg];
115
+
116
+ return {grad_w, grad_b};
117
+ }
118
+
119
+ at::Tensor sparse_linear_grad_x(at::Tensor gy2d, at::Tensor weight, at::Tensor active_mask) {
120
+ check_mps_float_contig(gy2d, "gy2d");
121
+ check_mps_float_contig(weight, "weight");
122
+ TORCH_CHECK(active_mask.device().is_mps(), "active_mask must be on MPS");
123
+ TORCH_CHECK(active_mask.dtype() == at::kBool, "active_mask must be bool");
124
+ TORCH_CHECK(active_mask.is_contiguous(), "active_mask must be contiguous");
125
+
126
+ int64_t N = gy2d.size(0);
127
+ int64_t Out = gy2d.size(1);
128
+ int64_t In = weight.size(1);
129
+ TORCH_CHECK(weight.size(0) == Out, "weight out_features must match gy2d width");
130
+
131
+ auto grad_x = at::zeros({N, In}, gy2d.options());
132
+
133
+ id<MTLDevice> device = (id<MTLDevice>)at::mps::MPSDevice::getInstance()->device();
134
+ id<MTLComputePipelineState> pipeline = ensure_pipeline(device, &g_pipeline_grad_x, "sparse_linear_grad_x_float");
135
+ at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
136
+
137
+ id<MTLComputeCommandEncoder> encoder = (id<MTLComputeCommandEncoder>)stream->commandEncoder();[encoder setComputePipelineState:pipeline];
138
+
139
+ auto set_tensor = [&](const at::Tensor& t, int idx) {[encoder setBuffer:storage_as_mtlbuffer(t) offset:storage_offset_bytes(t) atIndex:(NSUInteger)idx];
140
+ };
141
+ set_tensor(gy2d, 0);
142
+ set_tensor(weight, 1);
143
+ set_tensor(active_mask, 2);
144
+ set_tensor(grad_x, 3);
145
+
146
+ SparseLinearParams prm{(uint32_t)N, (uint32_t)In, (uint32_t)Out, 0};[encoder setBytes:&prm length:sizeof(SparseLinearParams) atIndex:4];
147
+
148
+ MTLSize tg = MTLSizeMake(16, 16, 1);
149
+ MTLSize grid = MTLSizeMake((NSUInteger)((In + 15) / 16), (NSUInteger)((N + 15) / 16), 1);
150
+ [encoder dispatchThreadgroups:grid threadsPerThreadgroup:tg];
151
+
152
+ return grad_x;
153
+ }
154
+
155
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
156
+ m.def("sparse_linear_grad_wb", &sparse_linear_grad_wb, "Sparse active-row Linear dW/db (Metal/MPS, fp32)");
157
+ m.def("sparse_linear_grad_x", &sparse_linear_grad_x, "Sparse active-row Linear dX (Metal/MPS, fp32)");
158
+ }
experiments/sparse_linear_v10_metal/sparse_transformer_v10.py ADDED
@@ -0,0 +1,1112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v10: empirical sparse-backward optimization benchmark with optional Metal kernels.
3
+
4
+ v8 proved that the row-sparse mask can be moved into a custom Linear backward.
5
+ v10 keeps the no-audit training path and adds wall-clock benchmarking plus an optional Metal active-row Linear backward kernel.
6
+
7
+ Default behavior
8
+ ----------------
9
+ 1. Run a short dense warmup, usually 5 steps.
10
+ 2. Initialize the EMA row-importance predictor from those dense warmup gradients.
11
+ 3. After warmup, choose active rows from the predictor.
12
+ 4. Train using sparse backward.
13
+ 5. Update EMA statistics only from rows that were actually active/observed.
14
+ 6. Do not compute dense gradients unless --audit_every > 0.
15
+
16
+ Audit behavior
17
+ --------------
18
+ --audit_every 0
19
+ No dense audit after warmup. Cosine/Jaccard/top20 are unavailable and show as nan.
20
+
21
+ --audit_every N
22
+ Every N steps, run an extra dense backward pass on the same batch only to
23
+ measure cosine/top20/Jaccard. The audit is NOT used to update the selector,
24
+ except for oracle_current, which is explicitly an upper-bound control.
25
+
26
+ This is still not a wall-clock benchmark on vanilla PyTorch/MPS/CPU. The custom
27
+ backward uses indexing and ordinary PyTorch matmuls. The goal is to verify that
28
+ the method survives without dense information after warmup.
29
+
30
+ Examples
31
+ --------
32
+ No-audit practical run:
33
+ python3 sparse_transformer_v10.py \
34
+ --device mps \
35
+ --steps 2000 \
36
+ --active_fractions 0.05 0.02 \
37
+ --warmup_steps_list 5 \
38
+ --policies predicted_magnitude random \
39
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
40
+ --audit_every 0
41
+
42
+ Occasional audit for measurement only:
43
+ python3 sparse_transformer_v10.py \
44
+ --steps 2000 \
45
+ --active_fractions 0.05 0.02 \
46
+ --warmup_steps_list 5 \
47
+ --policies predicted_magnitude random \
48
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
49
+ --audit_every 100
50
+ """
51
+
52
+
53
+ from __future__ import annotations
54
+
55
+ import argparse
56
+ import math
57
+ import random
58
+ import time
59
+ from typing import Dict, List, Literal, Optional, Tuple
60
+
61
+ import torch
62
+
63
+ torch.set_num_threads(1)
64
+ import torch.nn as nn
65
+ import torch.nn.functional as F
66
+
67
+ try:
68
+ import sparse_linear_metal # built by setup.py in this folder
69
+ except Exception:
70
+ sparse_linear_metal = None
71
+
72
+ USE_METAL_KERNEL = False
73
+
74
def sync_device(device: str) -> None:
    """Wait for all queued work on ``device`` to finish.

    Safe to call unconditionally around timing sections: a no-op on CPU
    and when the requested backend is unavailable.
    """
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
79
+
80
+ Policy = Literal["predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"]
81
+ BackwardMode = Literal["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
82
+
83
+
84
+ # -----------------------------
85
+ # Reproducibility and device
86
+ # -----------------------------
87
+
88
def set_seed(seed: int) -> None:
    """Seed the stdlib and torch RNGs (and every CUDA device when present)."""
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
93
+
94
+
95
def default_device() -> str:
    """Pick the best available backend, preferring cuda, then mps, then cpu."""
    if torch.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"
101
+
102
+
103
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a CPU-resident torch.Generator seeded with ``seed``."""
    generator = torch.Generator(device="cpu")
    generator.manual_seed(seed)
    return generator
107
+
108
+
109
+ # -----------------------------
110
+ # Data
111
+ # -----------------------------
112
+
113
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Generate a deterministic synthetic text corpus.

    Produces ``n_sentences`` newline-terminated sentences drawn from six
    templates using a private ``random.Random(seed)``, so the output is
    fully reproducible and independent of the global RNG state.
    """
    rng = random.Random(seed)
    names = ["ada", "turing", "grace", "lovelace", "noether", "shannon", "hopper", "gauss"]
    verbs = ["builds", "tests", "traces", "compresses", "predicts", "routes", "writes", "measures"]
    objects = ["signals", "gradients", "tokens", "circuits", "features", "masks", "errors", "states"]
    adverbs = ["quietly", "boldly", "slowly", "quickly", "cleanly", "strangely", "carefully"]
    clauses = [
        "when the loss falls",
        "after the mask shifts",
        "before the model answers",
        "while the signal drifts",
        "if the pattern repeats",
        "because the tail is noisy",
    ]
    symbols = ["alpha", "beta", "gamma", "delta", "omega", "sigma"]

    sentences: List[str] = []
    for _ in range(n_sentences):
        # NOTE: the template index and the per-template rng calls must stay in
        # this exact order to keep the corpus reproducible per seed.
        template = rng.randrange(6)
        if template == 0:
            sentence = f"{rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} {rng.choice(adverbs)}."
        elif template == 1:
            sentence = f"{rng.choice(clauses)}, {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)}."
        elif template == 2:
            a, b = rng.sample(symbols, 2)
            sentence = f"rule {a}: {rng.choice(objects)} -> {rng.choice(objects)}; rule {b}: {rng.choice(objects)} -> {rng.choice(objects)}."
        elif template == 3:
            sentence = f"the {rng.choice(objects)} {rng.choice(verbs)} the {rng.choice(objects)} {rng.choice(adverbs)}."
        elif template == 4:
            seq = " ".join(rng.choice(symbols) for _ in range(rng.randint(2, 7)))
            sentence = f"sequence {seq} ends when {rng.choice(names)} {rng.choice(verbs)}."
        else:
            sentence = f"if {rng.choice(objects)} rise then {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} else wait."
        sentences.append(sentence)
    return "\n".join(sentences) + "\n"
148
+
149
+
150
class CharCorpus:
    """Character-level corpus with a 90/10 train/val split.

    Builds a char<->index vocabulary from ``text`` and serves random
    (input, next-char target) windows of length ``block_size``.
    """

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device

        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        boundary = int(0.9 * len(encoded))
        self.train_data = encoded[:boundary]
        self.val_data = encoded[boundary:]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample ``batch_size`` windows; targets are inputs shifted by one."""
        source = self.train_data if split == "train" else self.val_data
        max_start = len(source) - self.block_size - 1
        if max_start <= 0:
            raise ValueError("Corpus too small for block_size")
        starts = torch.randint(max_start, (batch_size,), generator=generator)
        x = torch.stack([source[s : s + self.block_size] for s in starts])
        y = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return x.to(self.device), y.to(self.device)
178
+
179
+
180
def load_text(args: argparse.Namespace) -> str:
    """Return the training text: the file at --text_path if set, else a synthetic corpus."""
    if not args.text_path:
        return make_synthetic_corpus(args.synthetic_sentences, args.seed)
    with open(args.text_path, "r", encoding="utf-8") as f:
        return f.read()
185
+
186
+
187
+ # -----------------------------
188
+ # Sparse Linear autograd
189
+ # -----------------------------
190
+
191
class MaskedLinearFunction(torch.autograd.Function):
    """F.linear forward with a row-sparse backward.

    ``active_rows`` is treated as a per-output-row boolean indicator of shape
    [out_features] (its use with ``torch.nonzero`` implies this — confirm at
    call sites). Only active rows receive dW/db; dX can optionally be
    restricted to active rows as well (``sparse_dx``). The backward dispatches
    to the Metal extension when eligible (fp32, MPS, contiguous), otherwise it
    uses a mathematically identical indexed-PyTorch fallback.
    """

    @staticmethod
    def forward(  # type: ignore[override]
        ctx,
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor],
        active_rows: torch.Tensor,
        sparse_dx: bool,
    ) -> torch.Tensor:
        ctx.save_for_backward(x, weight, active_rows)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = bool(sparse_dx)
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):  # type: ignore[override]
        x, weight, active_rows = ctx.saved_tensors
        sparse_dx = bool(ctx.sparse_dx)
        has_bias = bool(ctx.has_bias)

        x_shape = x.shape
        x_flat = x.reshape(-1, x.shape[-1]).contiguous()
        gy_flat = grad_y.reshape(-1, grad_y.shape[-1]).contiguous()

        active_idx = torch.nonzero(active_rows, as_tuple=False).flatten().to(dtype=torch.long).contiguous()

        can_use_metal = (
            USE_METAL_KERNEL
            and sparse_linear_metal is not None
            and x.device.type == "mps"
            and weight.device.type == "mps"
            and x.dtype == torch.float32
            and weight.dtype == torch.float32
            and gy_flat.dtype == torch.float32
            and x_flat.is_contiguous()
            and gy_flat.is_contiguous()
            and weight.is_contiguous()
        )

        if can_use_metal:
            # BUG FIX: the extension's pybind signatures (sparse_linear_ops.mm) are
            #   sparse_linear_grad_wb(x2d, gy2d, active_mask) -> (grad_w, grad_b)
            #   sparse_linear_grad_x(gy2d, weight, active_mask)
            # taking a bool [Out] mask. The previous code passed an index tensor
            # plus an extra out_features argument, which fails at runtime
            # (wrong arity and dtype check).
            active_mask = active_rows.reshape(-1).to(device=gy_flat.device, dtype=torch.bool).contiguous()
            grad_weight, grad_bias_full = sparse_linear_metal.sparse_linear_grad_wb(x_flat, gy_flat, active_mask)
            grad_bias = grad_bias_full if has_bias else None
            if sparse_dx:
                grad_x_flat = sparse_linear_metal.sparse_linear_grad_x(gy_flat, weight.contiguous(), active_mask)
            else:
                grad_x_flat = gy_flat @ weight
            return grad_x_flat.reshape(x_shape), grad_weight, grad_bias, None, None

        # Portable PyTorch fallback. Mathematically identical, but usually not
        # an actual wall-clock optimization on MPS/CPU because indexing and
        # general matmul overhead dominate.
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype) if has_bias else None

        if active_idx.numel() > 0:
            gy_active = gy_flat[:, active_idx]
            grad_weight[active_idx] = gy_active.transpose(0, 1) @ x_flat
            if grad_bias is not None:
                grad_bias[active_idx] = gy_active.sum(dim=0)
            grad_x_flat = gy_active @ weight[active_idx] if sparse_dx else gy_flat @ weight
        else:
            # No active rows: dW/db stay zero; dX is zero only in sparse mode.
            grad_x_flat = torch.zeros_like(x_flat) if sparse_dx else gy_flat @ weight

        return grad_x_flat.reshape(x_shape), grad_weight, grad_bias, None, None
267
+
268
+
269
class SparseLinear(nn.Linear):
    """nn.Linear whose backward can be restricted to a subset of output rows."""

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        # Sparse backward is disarmed until set_sparse_backward() enables it.
        self.sparse_enabled = False
        self.sparse_dx = False
        self.active_rows: Optional[torch.Tensor] = None

    def set_sparse_backward(self, enabled: bool, active_rows: Optional[torch.Tensor], sparse_dx: bool) -> None:
        """Arm/disarm the row-sparse backward for subsequent forward calls."""
        self.sparse_enabled = bool(enabled)
        self.sparse_dx = bool(sparse_dx)
        self.active_rows = active_rows

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.sparse_enabled and self.active_rows is not None:
            return MaskedLinearFunction.apply(x, self.weight, self.bias, self.active_rows, self.sparse_dx)
        # Dense path: plain linear, ordinary autograd.
        return F.linear(x, self.weight, self.bias)
287
+
288
+
289
+ # -----------------------------
290
+ # Mini GPT
291
+ # -----------------------------
292
+
293
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built on SparseLinear projections."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)  # fused q,k,v projection
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape

        def _heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(C, dim=2)
        q, k, v = _heads(q), _heads(k), _heads(v)

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
318
+
319
+
320
class FeedForward(nn.Module):
    """GELU MLP (n_embd -> 4*n_embd -> n_embd) with dropout on the output."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
329
+
330
+
331
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
343
+
344
+
345
class MiniGPT(nn.Module):
    """Minimal GPT: token + position embeddings, N blocks, SparseLinear LM head."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.Sequential(*(Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)))
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = SparseLinear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when targets is None."""
        _, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.drop(self.tok_emb(idx) + self.pos_emb(positions)[None, :, :])
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
368
+
369
+
370
def named_sparse_linear_modules(model: nn.Module) -> List[Tuple[str, SparseLinear]]:
    """Return every (qualified_name, module) pair for SparseLinear layers in model."""
    found: List[Tuple[str, SparseLinear]] = []
    for name, module in model.named_modules():
        if isinstance(module, SparseLinear):
            found.append((name, module))
    return found
372
+
373
+
374
def parameter_fractions(model: nn.Module) -> Tuple[int, int, float]:
    """Return (total_params, sparse_linear_params, fraction held by SparseLinear)."""
    total = sum(p.numel() for p in model.parameters())
    linear = sum(
        m.weight.numel() + (m.bias.numel() if m.bias is not None else 0)
        for _, m in named_sparse_linear_modules(model)
    )
    # max(1, total) guards the degenerate parameter-free model.
    return total, linear, linear / max(1, total)
382
+
383
+
384
def configure_sparse_linears(
    model: nn.Module,
    masker: Optional["RowMasker"],
    enabled: bool,
    backward_mode: Optional[str],
) -> None:
    """Push the masker's current row masks and backward mode into every SparseLinear."""
    sparse_dx = backward_mode == "sparse_dW_sparse_dX"
    for _, module in named_sparse_linear_modules(model):
        rows = None if masker is None else masker.row_mask_for(module)
        module.set_sparse_backward(enabled=enabled, active_rows=rows, sparse_dx=sparse_dx)
394
+
395
+
396
+ # -----------------------------
397
+ # Mask selector
398
+ # -----------------------------
399
+
400
class RowMasker:
    """Selects the 'active' output rows of every SparseLinear for each step.

    Keeps per-row selector state (EMA gradient mass, observation counts,
    active/previous-active masks) over a flat "block id" space that
    concatenates the output rows of all SparseLinear modules, and turns the
    configured policy into boolean row masks consumed by the sparse backward
    passes and the masked optimizer.

    Fixes over the previous revision: the duplicated ``@torch.no_grad()``
    decorator on ``update_predictor_from_observed_mass`` is removed, and the
    dead ``ids`` accumulator in ``__init__`` (built but never read) is gone.
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy,
        active_fraction: float,
        explore_fraction: float,
        mass_beta: float,
        unobserved_decay: float,
        warmup_steps: int,
        ucb_alpha: float,
        mass_init: float,
        device: str,
    ):
        self.model = model
        self.policy = policy
        self.active_fraction = active_fraction
        self.explore_fraction = explore_fraction
        self.mass_beta = mass_beta
        self.unobserved_decay = unobserved_decay
        self.warmup_steps = warmup_steps
        self.ucb_alpha = ucb_alpha
        self.mass_init = mass_init
        self.device = device
        self.step_index = 0

        # Assign each SparseLinear output row a unique global block id.
        self.linear_modules = [m for _, m in named_sparse_linear_modules(model)]
        self.module_to_ids: Dict[SparseLinear, torch.Tensor] = {}
        offset = 0
        for m in self.linear_modules:
            n = m.weight.shape[0]
            self.module_to_ids[m] = torch.arange(offset, offset + n, device=device)
            offset += n
        self.n_blocks = offset

        # Per-row selector statistics.
        self.predicted_mass = torch.full((self.n_blocks,), mass_init, device=device)
        self.last_full_mass = torch.full((self.n_blocks,), mass_init, device=device)
        self.observed_count = torch.zeros(self.n_blocks, device=device)
        self.global_mass_ema = torch.tensor(max(mass_init, 1e-6), device=device)

        self.prev_active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.row_masks: Dict[SparseLinear, torch.Tensor] = {
            m: torch.zeros(m.weight.shape[0], dtype=torch.bool, device=device) for m in self.linear_modules
        }

    def _topk_mask(self, values: torch.Tensor, fraction: float) -> torch.Tensor:
        """Boolean mask marking the top-``fraction`` entries of ``values`` (at least one)."""
        k = max(1, int(fraction * values.numel()))
        mask = torch.zeros_like(values, dtype=torch.bool)
        # Tiny noise breaks ties so equal values don't always resolve the same way.
        noisy = values + 1e-9 * torch.rand_like(values)
        mask[torch.topk(noisy, k=k).indices] = True
        return mask

    @staticmethod
    def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
        """Jaccard similarity |a∩b| / |a∪b| of two boolean masks (0.0 when both empty)."""
        inter = (a & b).sum().float()
        union = (a | b).sum().float()
        return float((inter / torch.clamp(union, min=1.0)).item())

    def _set_active(self, active: torch.Tensor) -> None:
        """Install ``active`` and slice it into per-module row masks."""
        self.active = active
        self.row_masks = {}
        for m, ids in self.module_to_ids.items():
            self.row_masks[m] = active[ids]

    def _sample_exploit_explore(self, scores: torch.Tensor) -> torch.Tensor:
        """Pick top-scoring rows plus a random exploration complement."""
        n = self.n_blocks
        k_total = max(1, int(self.active_fraction * n))
        k_explore = min(k_total, max(0, int(self.explore_fraction * k_total)))
        k_exploit = k_total - k_explore
        active = torch.zeros(n, dtype=torch.bool, device=self.device)

        if k_exploit > 0:
            active[torch.topk(scores + 1e-9 * torch.rand_like(scores), k=k_exploit).indices] = True
        if k_explore > 0:
            # Explore uniformly among the not-yet-selected rows.
            remaining = torch.nonzero(~active, as_tuple=False).flatten()
            pick = remaining[torch.randperm(remaining.numel(), device=self.device)[:k_explore]]
            active[pick] = True
        return active

    def choose_pre_backward(self, step: int) -> None:
        """Choose the active set for ``step`` before the backward pass runs."""
        self.step_index = step
        if step < self.warmup_steps:
            # Dense bootstrap: every row active.
            self._set_active(torch.ones(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "oracle_current":
            # Oracle cannot choose until the dense audit gradient is known.
            self._set_active(torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "random":
            self._set_active(self._sample_exploit_explore(torch.rand(self.n_blocks, device=self.device)))
            return

        if self.policy == "stale_current":
            self._set_active(self._topk_mask(self.last_full_mass, self.active_fraction))
            return

        if self.policy == "predicted_magnitude":
            self._set_active(self._sample_exploit_explore(self.predicted_mass))
            return

        if self.policy == "ucb_magnitude":
            # Exploit EMA mass plus a count-based exploration bonus (UCB style).
            t = max(1, step - self.warmup_steps + 1)
            log_term = torch.log(torch.tensor(float(t + 2), device=self.device))
            bonus_scale = torch.clamp(self.global_mass_ema, min=1e-8)
            bonus = self.ucb_alpha * bonus_scale * torch.sqrt(log_term / (self.observed_count + 1.0))
            self._set_active(self._sample_exploit_explore(self.predicted_mass + bonus))
            return

        raise ValueError(f"Unknown policy: {self.policy}")

    @torch.no_grad()
    def current_gradient_mass_from_grads(self) -> torch.Tensor:
        """Per-row L2 gradient mass read from the currently stored .grad tensors."""
        mass = torch.zeros(self.n_blocks, device=self.device)
        for m, ids in self.module_to_ids.items():
            if m.weight.grad is None:
                continue
            row_sq = m.weight.grad.square().sum(dim=1)
            if m.bias is not None and m.bias.grad is not None:
                row_sq = row_sq + m.bias.grad.square()
            mass[ids] = torch.sqrt(row_sq + 1e-30)
        return mass

    @torch.no_grad()
    def update_predictor_from_observed_mass(self, mass: torch.Tensor, observed: Optional[torch.Tensor] = None) -> Dict[str, float]:
        """Update EMA statistics only for observed rows.

        After warmup, sparse backward only gives trustworthy gradients for active
        rows, so only those rows are allowed to update predicted_mass.
        """
        if observed is None:
            observed = self.active

        new_active = observed & (self.observed_count == 0)
        self.predicted_mass.mul_(self.unobserved_decay)

        if bool(observed.any().item()):
            obs_mass = mass[observed]
            # Rows seen for the first time take the observation directly; the
            # rest blend it into the EMA.
            first_seen = self.observed_count[observed] == 0
            ema_mass = self.mass_beta * self.predicted_mass[observed] + (1.0 - self.mass_beta) * obs_mass
            self.predicted_mass[observed] = torch.where(first_seen, obs_mass, ema_mass)
            self.observed_count[observed] += 1.0
            self.global_mass_ema = self.mass_beta * self.global_mass_ema + (1.0 - self.mass_beta) * obs_mass.mean()

        stability = self._jaccard(self.active, self.prev_active)
        self.prev_active = self.active.clone()

        return {
            "stability": stability,
            "active_fraction_real": float(self.active.float().mean().item()),
            "coverage": float((self.observed_count > 0).float().mean().item()),
            "avg_obs_count": float(self.observed_count.mean().item()),
            "new_active_fraction": float(new_active.float().mean().item()),
        }

    @torch.no_grad()
    def audit_metrics_from_mass(self, mass: torch.Tensor) -> Dict[str, float]:
        """Compute dense-audit metrics without updating the practical selector."""
        active = self.active
        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)

        # Concentration: fraction of total mass held by the top-20% rows.
        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        return {
            "cosine": cosine,
            "norm_ratio": cosine,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
        }

    def audit_and_update_from_mass(self, step: int, mass: torch.Tensor) -> Dict[str, float]:
        """Audit against a dense gradient mass and update the practical selector."""
        if step < self.warmup_steps:
            active = torch.ones(self.n_blocks, dtype=torch.bool, device=self.device)
            self._set_active(active)
        elif self.policy == "oracle_current":
            active = self._topk_mask(mass, self.active_fraction)
            self._set_active(active)
        else:
            active = self.active

        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)
        stability = self._jaccard(active, self.prev_active)
        self.prev_active = active.clone()

        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        new_active = active & (self.observed_count == 0)

        # Practical rule: update predicted statistics only for active/observed rows.
        self.predicted_mass.mul_(self.unobserved_decay)
        observed = active
        if bool(observed.any().item()):
            obs_mass = mass[observed]
            first_seen = self.observed_count[observed] == 0
            ema_mass = self.mass_beta * self.predicted_mass[observed] + (1.0 - self.mass_beta) * obs_mass
            self.predicted_mass[observed] = torch.where(first_seen, obs_mass, ema_mass)
            self.observed_count[observed] += 1.0
            self.global_mass_ema = self.mass_beta * self.global_mass_ema + (1.0 - self.mass_beta) * obs_mass.mean()

        # Dense audit signal; only stale_current is allowed to use this for selection.
        self.last_full_mass = mass.detach().clone()

        return {
            "cosine": cosine,
            "norm_ratio": cosine,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
            "stability": stability,
            "active_fraction_real": float(active.float().mean().item()),
            "coverage": float((self.observed_count > 0).float().mean().item()),
            "avg_obs_count": float(self.observed_count.mean().item()),
            "new_active_fraction": float(new_active.float().mean().item()),
        }

    def row_mask_for(self, module: SparseLinear) -> Optional[torch.Tensor]:
        """Boolean row mask for ``module``, or None if it is unknown to this masker."""
        return self.row_masks.get(module)
636
+
637
+
638
+ # -----------------------------
639
+ # Masked Adam
640
+ # -----------------------------
641
+
642
class MaskedAdam:
    """Hand-rolled Adam-style optimizer with optional row masking.

    For parameters that belong to a SparseLinear module, the update (and the
    moment buffers) can be restricted to the masker's currently-active rows.
    Note: the update is m / (sqrt(v) + eps) with no bias-correction terms.
    """

    def __init__(
        self,
        model: nn.Module,
        masker: Optional[RowMasker],
        lr: float,
        betas=(0.9, 0.95),
        eps=1e-8,
        weight_decay=0.0,
        freeze_non_linear_when_sparse: bool = False,
    ):
        self.model = model
        self.masker = masker
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.freeze_non_linear_when_sparse = freeze_non_linear_when_sparse
        # Lazily-created per-parameter first/second moment buffers.
        self.state: Dict[nn.Parameter, Dict[str, torch.Tensor]] = {}
        # Map each SparseLinear weight/bias parameter back to (module, kind)
        # so step() can look up its row mask.
        self.linear_param: Dict[nn.Parameter, Tuple[SparseLinear, str]] = {}
        for _, m in named_sparse_linear_modules(model):
            self.linear_param[m.weight] = (m, "weight")
            if m.bias is not None:
                self.linear_param[m.bias] = (m, "bias")

    def zero_grad(self) -> None:
        """Drop all gradients (sets .grad to None rather than zeroing in place)."""
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self) -> None:
        """Apply one optimizer update, row-masked where a masker provides a mask."""
        for p in self.model.parameters():
            if p.grad is None:
                continue
            # Optionally freeze everything that is not a SparseLinear parameter.
            if self.masker is not None and self.freeze_non_linear_when_sparse and p not in self.linear_param:
                continue

            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            m = self.state[p]["m"]
            v = self.state[p]["v"]
            g = p.grad
            if self.weight_decay:
                # L2-style decay folded into the gradient (not decoupled AdamW).
                g = g.add(p, alpha=self.weight_decay)

            row_mask = None
            if self.masker is not None and p in self.linear_param:
                module, kind = self.linear_param[p]
                base = self.masker.row_mask_for(module)
                if base is not None:
                    # Weight matrices broadcast the row mask over trailing dims.
                    row_mask = base.view(-1, *([1] * (p.ndim - 1))) if kind == "weight" else base

            if row_mask is None:
                # Dense path: in-place moment updates, no bias correction.
                m.mul_(self.beta1).add_(g, alpha=1.0 - self.beta1)
                v.mul_(self.beta2).addcmul_(g, g, value=1.0 - self.beta2)
                p.add_(m / (torch.sqrt(v) + self.eps), alpha=-self.lr)
            else:
                # MPS can mis-handle expanded boolean masks for row-wise assignment
                # (e.g. reporting nonsense out-of-bounds indices). Use explicit
                # row indices and index_copy_ instead. This also avoids materializing
                # a full expanded mask for weight matrices.
                active_rows = row_mask.reshape(-1).nonzero(as_tuple=False).flatten()
                if active_rows.numel() == 0:
                    continue

                m_rows = m.index_select(0, active_rows)
                v_rows = v.index_select(0, active_rows)
                g_rows = g.index_select(0, active_rows)

                new_m_rows = self.beta1 * m_rows + (1.0 - self.beta1) * g_rows
                new_v_rows = self.beta2 * v_rows + (1.0 - self.beta2) * g_rows * g_rows
                update_rows = new_m_rows / (torch.sqrt(new_v_rows) + self.eps)
                p_rows = p.index_select(0, active_rows) - self.lr * update_rows

                # Write the updated rows back; inactive rows keep stale moments.
                m.index_copy_(0, active_rows, new_m_rows)
                v.index_copy_(0, active_rows, new_v_rows)
                p.index_copy_(0, active_rows, p_rows)
719
+
720
+
721
+ # -----------------------------
722
+ # Training utilities
723
+ # -----------------------------
724
+
725
@torch.no_grad()
def estimate_loss(model: nn.Module, corpus: CharCorpus, batch_size: int, eval_iters: int, seed: int) -> Dict[str, float]:
    """Mean train/val loss over ``eval_iters`` batches with sparse backward disabled."""
    model.eval()
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    results: Dict[str, float] = {}
    # Fixed per-split seed offsets keep evaluation batches reproducible.
    for split_name, seed_shift in (("train", 0), ("val", 100000)):
        gen = make_cpu_generator(seed + seed_shift)
        total = 0.0
        for _ in range(eval_iters):
            xb, yb = corpus.get_batch(split_name, batch_size, generator=gen)
            _, loss = model(xb, yb)
            total += float(loss.item())
        results[split_name] = total / eval_iters
    model.train()
    return results
740
+
741
+
742
def dense_audit_pass(model: nn.Module, corpus_batch: Tuple[torch.Tensor, torch.Tensor], opt: MaskedAdam, masker: RowMasker) -> torch.Tensor:
    """Run one fully dense backward on the batch and return per-row gradient mass.

    Gradients are cleared both before and after, so this audit never leaks
    into the optimizer step.
    """
    inputs, targets = corpus_batch
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    opt.zero_grad()
    _, audit_loss = model(inputs, targets)
    audit_loss.backward()
    mass = masker.current_gradient_mass_from_grads()
    opt.zero_grad()
    return mass
751
+
752
+
753
def sparse_training_backward(
    model: nn.Module,
    corpus_batch: Tuple[torch.Tensor, torch.Tensor],
    opt: MaskedAdam,
    masker: Optional[RowMasker],
    backward_mode: Optional[BackwardMode],
) -> float:
    """Run one forward/backward on the batch and return the scalar loss.

    "masked_optimizer" (or no masker/mode) uses a fully dense backward; the
    other modes enable the custom sparse backward for the duration of the
    pass. Sparse mode is always switched off again before returning.
    """
    inputs, targets = corpus_batch
    opt.zero_grad()

    dense = masker is None or backward_mode is None or backward_mode == "masked_optimizer"
    if dense:
        configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    else:
        configure_sparse_linears(model, masker=masker, enabled=True, backward_mode=backward_mode)

    _, loss = model(inputs, targets)
    loss.backward()
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    return float(loss.item())
772
+
773
+
774
def train_run(
    corpus: CharCorpus,
    args: argparse.Namespace,
    policy: Optional[Policy],
    backward_mode: Optional[BackwardMode],
    active_fraction: float,
    warmup_steps: int,
    explore_fraction: float,
    seed_offset: int,
) -> Dict[str, float | str]:
    """Train one model configuration and return a metrics row for the summary.

    policy=None runs the dense baseline; otherwise a RowMasker drives the
    chosen sparse backward_mode. Returns losses, wall-clock stats, and the
    averaged selector-quality metrics accumulated during training.
    """
    # Same model initialization and same minibatch sequence for every run by default.
    set_seed(args.seed + (seed_offset if args.unpaired_seeds else 0))
    data_gen = make_cpu_generator(args.seed + 12345)

    dev = corpus.device
    model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)

    masker = None
    if policy is not None:
        masker = RowMasker(
            model=model,
            policy=policy,
            active_fraction=active_fraction,
            explore_fraction=explore_fraction,
            mass_beta=args.mass_beta,
            unobserved_decay=args.unobserved_decay,
            warmup_steps=warmup_steps,
            ucb_alpha=args.ucb_alpha,
            mass_init=args.mass_init,
            device=dev,
        )
    opt = MaskedAdam(
        model,
        masker,
        lr=args.lr,
        weight_decay=args.weight_decay,
        freeze_non_linear_when_sparse=args.freeze_non_linear_when_sparse,
    )

    # Running sums/counts for the selector-quality metrics; averaged at the end.
    sums = {
        "cosine": 0.0,
        "norm_ratio": 0.0,
        "top20_mass": 0.0,
        "jacc_oracle": 0.0,
        "stability": 0.0,
        "active_fraction_real": 0.0,
        "coverage": 0.0,
        "avg_obs_count": 0.0,
        "new_active_fraction": 0.0,
    }
    counts = {k: 0 for k in sums}

    def add_metrics(metrics: Dict[str, float]) -> None:
        # Accumulate only metrics we track; counts let partial reporting average correctly.
        for k, v in metrics.items():
            if k in sums:
                sums[k] += float(v)
                counts[k] += 1

    if args.benchmark_sync:
        sync_device(dev)
    t0 = time.perf_counter()

    for step in range(args.steps):
        batch = corpus.get_batch("train", args.batch_size, generator=data_gen)

        if masker is None:
            # Dense baseline: plain backward + full optimizer step.
            loss_value = sparse_training_backward(model, batch, opt, masker=None, backward_mode=None)
            opt.step()
        else:
            if step < warmup_steps:
                # Dense bootstrap. Every row is active and every row updates the predictor.
                masker._set_active(torch.ones(masker.n_blocks, dtype=torch.bool, device=dev))
                loss_value = sparse_training_backward(model, batch, opt, masker=masker, backward_mode="masked_optimizer")
                full_mass = masker.current_gradient_mass_from_grads()
                masker.last_full_mass = full_mass.detach().clone()
                add_metrics(masker.audit_metrics_from_mass(full_mass))
                add_metrics(masker.update_predictor_from_observed_mass(full_mass, observed=masker.active))
                opt.step()
            else:
                masker.choose_pre_backward(step)

                if policy == "oracle_current":
                    # Explicit upper bound. Oracle necessarily computes dense gradients to choose rows.
                    full_mass = dense_audit_pass(model, batch, opt, masker)
                    masker._set_active(masker._topk_mask(full_mass, active_fraction))
                    masker.last_full_mass = full_mass.detach().clone()
                    add_metrics(masker.audit_metrics_from_mass(full_mass))
                elif args.audit_every > 0 and ((step - warmup_steps) % args.audit_every == 0):
                    # Measurement only. Do not update predicted_magnitude/ucb/random with this dense mass.
                    full_mass = dense_audit_pass(model, batch, opt, masker)
                    add_metrics(masker.audit_metrics_from_mass(full_mass))
                    if policy == "stale_current":
                        masker.last_full_mass = full_mass.detach().clone()

                loss_value = sparse_training_backward(model, batch, opt, masker=masker, backward_mode=backward_mode)

                # Practical selector update: only active rows were observed by the training backward pass.
                observed_mass = masker.current_gradient_mass_from_grads()
                add_metrics(masker.update_predictor_from_observed_mass(observed_mass, observed=masker.active))
                opt.step()

        if args.benchmark_sync:
            sync_device(dev)

        if args.verbose and (step % args.eval_interval == 0 or step == args.steps - 1):
            losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters, seed=args.seed + 555)
            name = "dense" if policy is None else f"{policy}/{backward_mode}"
            print(
                f"{name:38s} step={step:5d} warm={warmup_steps:4d} explore={explore_fraction:.2f} "
                f"loss={loss_value:.4f} train={losses['train']:.4f} val={losses['val']:.4f}"
            )

    if args.benchmark_sync:
        sync_device(dev)
    elapsed_s = time.perf_counter() - t0
    train_tokens = float(args.steps * args.batch_size * args.block_size)
    step_ms = 1000.0 * elapsed_s / max(1, args.steps)
    tokens_per_s = train_tokens / max(elapsed_s, 1e-9)

    losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters, seed=args.seed + 999)
    row: Dict[str, float | str] = {
        "run": "dense_baseline" if policy is None else policy,
        "mode": "dense" if backward_mode is None else backward_mode,
        "target_active": 1.0 if policy is None else active_fraction,
        "warmup": warmup_steps,
        "explore": explore_fraction if policy is not None else 0.0,
        "train_loss": losses["train"],
        "val_loss": losses["val"],
        "elapsed_s": elapsed_s,
        "step_ms": step_ms,
        "tokens_per_s": tokens_per_s,
    }
    if masker is None:
        # Selector metrics are undefined for the dense baseline.
        row.update({
            "cosine": float("nan"),
            "norm_ratio": float("nan"),
            "top20_mass": float("nan"),
            "jacc_oracle": float("nan"),
            "stability": float("nan"),
            "active_fraction_real": 1.0,
            "coverage": float("nan"),
            "avg_obs_count": float("nan"),
            "new_active_fraction": float("nan"),
        })
    else:
        for k in sums:
            row[k] = (sums[k] / counts[k]) if counts[k] > 0 else float("nan")
    return row
922
+
923
def print_summary(rows: List[Dict[str, float | str]]) -> None:
    """Print the fixed-width results table for all completed runs."""
    print("\nSummary")
    header = (
        f"{'run':>22s} {'mode':>19s} {'target':>7s} {'actual':>7s} {'warm':>5s} {'expl':>5s} "
        f"{'val':>8s} {'train':>8s} {'ms':>8s} {'tok/s':>9s} {'cos':>7s} {'top20':>7s} {'jacc':>7s} "
        f"{'stable':>7s} {'cover':>7s} {'new':>7s}"
    )
    print(header)
    print("-" * len(header))
    for r in rows:
        # Column widths mirror the header above; missing timing keys print as NaN.
        print(
            f"{str(r['run']):>22s} "
            f"{str(r['mode']):>19s} "
            f"{float(r['target_active']):7.3f} "
            f"{float(r['active_fraction_real']):7.3f} "
            f"{int(float(r['warmup'])):5d} "
            f"{float(r['explore']):5.2f} "
            f"{float(r['val_loss']):8.4f} "
            f"{float(r['train_loss']):8.4f} "
            f"{float(r.get('step_ms', float('nan'))):8.2f} "
            f"{float(r.get('tokens_per_s', float('nan'))):9.0f} "
            f"{float(r['cosine']):7.3f} "
            f"{float(r['top20_mass']):7.3f} "
            f"{float(r['jacc_oracle']):7.3f} "
            f"{float(r['stability']):7.3f} "
            f"{float(r['coverage']):7.3f} "
            f"{float(r['new_active_fraction']):7.3f}"
        )
951
+
952
+
953
def parse_args() -> argparse.Namespace:
    """Build and parse the CLI for the sparse-backward benchmark harness."""
    p = argparse.ArgumentParser()
    p.add_argument("--text_path", type=str, default=None)
    p.add_argument("--synthetic_sentences", type=int, default=12000)
    p.add_argument("--steps", type=int, default=1000)
    p.add_argument("--quick", action="store_true")
    p.add_argument("--batch_size", type=int, default=32)
    p.add_argument("--block_size", type=int, default=64)
    p.add_argument("--n_layer", type=int, default=2)
    p.add_argument("--n_head", type=int, default=4)
    p.add_argument("--n_embd", type=int, default=64)
    p.add_argument("--dropout", type=float, default=0.0)
    p.add_argument("--lr", type=float, default=3e-4)
    p.add_argument("--weight_decay", type=float, default=0.0)
    p.add_argument("--active_fractions", type=float, nargs="+", default=[0.05, 0.02])
    p.add_argument("--policies", type=str, nargs="+", default=["oracle_current", "predicted_magnitude", "random"])
    p.add_argument(
        "--backward_modes",
        type=str,
        nargs="+",
        default=["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"],
    )
    p.add_argument("--explore_fractions", type=float, nargs="+", default=[0.0])
    p.add_argument("--warmup_steps_list", type=int, nargs="+", default=[5])
    p.add_argument("--mass_beta", type=float, default=0.95)
    p.add_argument("--unobserved_decay", type=float, default=1.0)
    p.add_argument("--mass_init", type=float, default=0.0)
    p.add_argument("--ucb_alpha", type=float, default=1.0)
    p.add_argument("--freeze_non_linear_when_sparse", action="store_true")
    p.add_argument("--eval_interval", type=int, default=200)
    p.add_argument("--eval_iters", type=int, default=20)
    p.add_argument("--seed", type=int, default=7)
    p.add_argument("--device", type=str, default="auto", choices=["auto", "cpu", "cuda", "mps"])
    p.add_argument("--audit_every", type=int, default=0, help="Dense audit interval after warmup. 0 disables audits except oracle_current.")
    p.add_argument("--kernel_backend", type=str, default="torch", choices=["torch", "metal"], help="Use PyTorch fallback or the optional Metal active-row Linear backward kernel.")
    p.add_argument("--benchmark_sync", action="store_true", help="Synchronize after every train step for fair wall-clock benchmarking on async devices.")
    p.add_argument("--unpaired_seeds", action="store_true", help="Use different init seeds per run instead of paired seeds.")
    p.add_argument("--verbose", action="store_true")
    return p.parse_args()
992
+
993
+
994
def main() -> None:
    """Entry point: validate CLI choices, run the dense baseline, then every
    (backward_mode, active_fraction, policy, warmup, explore) combination, and
    print the summary table."""
    global USE_METAL_KERNEL
    args = parse_args()
    USE_METAL_KERNEL = args.kernel_backend == "metal"
    if USE_METAL_KERNEL and sparse_linear_metal is None:
        raise RuntimeError(
            "--kernel_backend metal requested, but sparse_linear_metal is not importable. "
            "Build in the repo venv (PyTorch must be importable during setup): "
            "cd sparse_linear_v10_metal && python3 -m pip install -e . --no-build-isolation"
        )
    if args.quick:
        # Tiny smoke-test configuration that overrides most CLI settings.
        args.steps = 40
        args.eval_iters = 2
        args.batch_size = 8
        args.block_size = 32
        args.n_layer = 1
        args.n_embd = 32
        args.n_head = 4
        args.synthetic_sentences = 1200
        args.active_fractions = [0.05]
        args.policies = ["predicted_magnitude", "random"]
        args.backward_modes = ["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
        args.explore_fractions = [0.0]
        args.warmup_steps_list = [5]
        args.audit_every = 10

    # Fail fast on typos before spending time training.
    valid_policies = {"predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"}
    valid_modes = {"masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"}
    for pol in args.policies:
        if pol not in valid_policies:
            raise ValueError(f"Unknown policy {pol!r}. Valid policies: {sorted(valid_policies)}")
    for mode in args.backward_modes:
        if mode not in valid_modes:
            raise ValueError(f"Unknown backward mode {mode!r}. Valid modes: {sorted(valid_modes)}")

    set_seed(args.seed)
    dev = args.device if args.device != "auto" else default_device()
    print(f"device={dev}")
    corpus = CharCorpus(load_text(args), args.block_size, dev)
    print(f"vocab_size={corpus.vocab_size} train_tokens={len(corpus.train_data)} val_tokens={len(corpus.val_data)}")
    print(f"policies={args.policies}")
    print(f"backward_modes={args.backward_modes}")
    print(f"active_fractions={args.active_fractions}")
    print(f"warmup_steps_list={args.warmup_steps_list} explore_fractions={args.explore_fractions}")
    print(f"mass_init={args.mass_init} mass_beta={args.mass_beta} ucb_alpha={args.ucb_alpha}")
    print(f"paired_seeds={not args.unpaired_seeds}")
    print(f"kernel_backend={args.kernel_backend} benchmark_sync={args.benchmark_sync} metal_available={sparse_linear_metal is not None}")
    print(f"audit_every={args.audit_every} (0 means no dense audit after warmup, except oracle_current)")

    # Throwaway model only used to report parameter counts.
    tmp_model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)
    total_params, linear_params, linear_frac = parameter_fractions(tmp_model)
    del tmp_model
    print(f"params total={total_params} linear={linear_params} linear_fraction={linear_frac:.3f}")
    if args.freeze_non_linear_when_sparse:
        print("freeze_non_linear_when_sparse=True: embeddings/layernorm/etc. are frozen in sparse runs")
    else:
        print("freeze_non_linear_when_sparse=False: non-Linear params are still updated densely")

    if args.dropout != 0.0:
        print("warning: dropout is nonzero; dense audit and sparse training passes may see different dropout masks")

    rows: List[Dict[str, float | str]] = []
    print("\nRunning dense baseline")
    rows.append(
        train_run(
            corpus,
            args,
            policy=None,
            backward_mode=None,
            active_fraction=1.0,
            warmup_steps=0,
            explore_fraction=0.0,
            seed_offset=0,
        )
    )

    # Sweep the full configuration grid; each run gets a distinct seed offset
    # (only used when --unpaired_seeds is set).
    seed_offset = 100
    for mode in args.backward_modes:
        for af in args.active_fractions:
            for pol in args.policies:
                explore_values = args.explore_fractions if pol in {"predicted_magnitude", "ucb_magnitude"} else [0.0]
                for warmup in args.warmup_steps_list:
                    for explore in explore_values:
                        print(
                            f"\nRunning mode={mode}, policy={pol}, "
                            f"active_fraction={af:.3f}, warmup={warmup}, explore={explore:.2f}"
                        )
                        rows.append(
                            train_run(
                                corpus,
                                args,
                                policy=pol,  # type: ignore[arg-type]
                                backward_mode=mode,  # type: ignore[arg-type]
                                active_fraction=af,
                                warmup_steps=warmup,
                                explore_fraction=explore,
                                seed_offset=seed_offset,
                            )
                        )
                        seed_offset += 1

    print_summary(rows)

    print("\nNotes")
    print("  masked_optimizer is the v7-style dense-backward simulation control.")
    print("  sparse_dW_full_dX uses custom Linear backward: sparse weight/bias grads, full input gradient.")
    print("  sparse_dW_sparse_dX uses custom Linear backward: sparse weight/bias grads and sparse input gradient.")
    print("  oracle_current uses dense audit gradients to choose rows; it is an upper bound.")
    print("  predicted_magnitude uses EMA mass from active/observed rows only.")
    print("  random is the sparse-support control.")
    print("  v10 does not compute dense audit gradients after warmup unless --audit_every > 0, except oracle_current.")
    print("  predicted_magnitude updates EMA statistics only from active rows observed by the training backward pass.")
    print("  kernel_backend=torch uses the portable PyTorch fallback; kernel_backend=metal uses sparse_linear_metal if installed.")
    print("  Use --benchmark_sync on MPS/CUDA for honest step_ms/tokens_per_s comparisons.")
    print("  cosine/top20/jacc require --audit_every > 0, otherwise there is no dense reference gradient.")


if __name__ == "__main__":
    main()
experiments/sparse_linear_v11_gather_vs_metal/README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Sparse Transformer v10: Metal-backed active-row Linear backward benchmark
2
+
3
+ This bundle is the first empirical optimization test rather than only a correctness prototype.
4
+ It compares dense Transformer training against sparse active-row backward variants and reports
5
+ validation loss plus wall-clock metrics (`step_ms`, `tokens_per_s`).
6
+
7
+ ## Files
8
+
9
+ - `sparse_transformer_v10.py` — training + benchmark harness.
10
+ - `sparse_linear.metal` — Metal kernels for active-row `dW/db` and sparse `dX`.
11
+ - `sparse_linear_ops.mm` — PyTorch/MPS extension glue.
12
+ - `setup.py` — builds the extension and compiles the `.metallib`.
13
+
14
+ ## Install the Metal extension
15
+
16
+ From this directory on macOS with PyTorch MPS available:
17
+
18
+ ```bash
19
+ python3 -m pip install -e .
20
+ ```
21
+
22
+ This uses `xcrun metal` and `xcrun metallib`, so Xcode command-line tools are required.
23
+
24
+ ## Correctness/sanity run with PyTorch fallback
25
+
26
+ ```bash
27
+ python3 sparse_transformer_v10.py \
28
+ --device mps \
29
+ --steps 2000 \
30
+ --active_fractions 0.05 0.02 \
31
+ --warmup_steps_list 5 \
32
+ --policies predicted_magnitude random \
33
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
34
+ --audit_every 0 \
35
+ --kernel_backend torch \
36
+ --benchmark_sync
37
+ ```
38
+
39
+ ## Metal benchmark run
40
+
41
+ ```bash
42
+ python3 sparse_transformer_v10.py \
43
+ --device mps \
44
+ --steps 2000 \
45
+ --active_fractions 0.05 0.02 \
46
+ --warmup_steps_list 5 \
47
+ --policies predicted_magnitude random \
48
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
49
+ --audit_every 0 \
50
+ --kernel_backend metal \
51
+ --benchmark_sync
52
+ ```
53
+
54
+ ## Empirical pass/fail criteria
55
+
56
+ A genuine optimization would show:
57
+
58
+ 1. `predicted_magnitude` validation loss close to the PyTorch fallback v9/v10 result.
59
+ 2. `predicted_magnitude` much better than `random` at the same active fraction.
60
+ 3. `--kernel_backend metal` has lower `step_ms` or higher `tokens_per_s` than `--kernel_backend torch` and ideally dense baseline.
61
+
62
+ The first Metal kernels are intentionally simple and fp32-only. They are meant to prove or disprove the acceleration path before investing in tiled half-precision kernels.
experiments/sparse_linear_v11_gather_vs_metal/input.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/sparse_linear_v11_gather_vs_metal/setup.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build script for the `sparse_linear` PyTorch extension.
# Compiles the Objective-C++ Metal glue (sparse_linear_ops.mm) and links the
# Metal/Foundation frameworks, so it only builds on macOS with Xcode tools.
from setuptools import setup
from torch.utils.cpp_extension import CppExtension, BuildExtension

setup(
    name='sparse_linear',
    ext_modules=[
        CppExtension(
            name='sparse_linear',
            sources=['sparse_linear_ops.mm'],
            # Force Objective-C++ mode so the #import / Metal syntax compiles.
            extra_compile_args=['-ObjC++'],
            extra_link_args=['-framework', 'Metal', '-framework', 'Foundation']
        )
    ],
    # BuildExtension supplies the torch include/library paths automatically.
    cmdclass={'build_ext': BuildExtension}
)
experiments/sparse_linear_v11_gather_vs_metal/sparse_linear.metal ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <metal_stdlib>
2
+ using namespace metal;
3
+
4
// Launch parameters shared with the host-side extension; the field order and
// widths must match the C++ SparseLinearParams struct in sparse_linear_ops.mm.
struct SparseLinearParams {
    uint32_t N;   // flattened batch/token rows
    uint32_t In;  // in_features
    uint32_t Out; // out_features
    uint32_t K;   // active rows count
};
10
+
11
// Computes dW and db restricted to the active output rows.
// One thread handles one (active row k, input column c) cell of dW; the
// thread with c == 0 additionally reduces the bias gradient for its row.
// NOTE(review): assumes `active` contains unique row indices — duplicates
// would make two threads write the same grad_w/grad_b rows. Confirm the
// caller guarantees uniqueness.
kernel void sparse_linear_grad_w_float(
    device const float* x [[buffer(0)]], // [N, In]
    device const float* gy [[buffer(1)]], // [N, Out]
    device const int64_t* active [[buffer(2)]], // [K]
    device float* grad_w [[buffer(3)]], // [Out, In], zeroed by caller
    device float* grad_b [[buffer(4)]], // [Out], zeroed by caller
    constant SparseLinearParams& p [[buffer(5)]],
    uint2 tid [[thread_position_in_grid]]) {
    uint k = tid.y;
    uint c = tid.x;
    // Guard the padded remainder of the grid (dispatch rounds up to 16x16).
    if (k >= p.K || c >= p.In) return;

    int64_t row64 = active[k];
    // Skip out-of-range indices rather than corrupt memory.
    if (row64 < 0 || row64 >= (int64_t)p.Out) return;
    uint row = (uint)row64;

    // dW[row, c] = sum_n gy[n, row] * x[n, c]
    float acc = 0.0f;
    for (uint n = 0; n < p.N; ++n) {
        acc += gy[n * p.Out + row] * x[n * p.In + c];
    }
    grad_w[row * p.In + c] = acc;

    // One thread per active row computes bias: db[row] = sum_n gy[n, row].
    if (c == 0) {
        float bacc = 0.0f;
        for (uint n = 0; n < p.N; ++n) {
            bacc += gy[n * p.Out + row];
        }
        grad_b[row] = bacc;
    }
}
42
+
43
// Computes dX = gy[:, active] @ W[active, :] without materializing gathered
// slices: each thread owns one (row n, column c) output cell and loops over
// the K active rows.
kernel void sparse_linear_grad_x_float(
    device const float* gy [[buffer(0)]], // [N, Out]
    device const float* weight [[buffer(1)]], // [Out, In]
    device const int64_t* active [[buffer(2)]], // [K]
    device float* grad_x [[buffer(3)]], // [N, In]
    constant SparseLinearParams& p [[buffer(4)]],
    uint2 tid [[thread_position_in_grid]]) {
    uint c = tid.x;
    uint n = tid.y;
    // Guard the padded remainder of the dispatched grid.
    if (n >= p.N || c >= p.In) return;

    float acc = 0.0f;
    for (uint k = 0; k < p.K; ++k) {
        int64_t row64 = active[k];
        // Skip invalid indices instead of reading out of bounds.
        if (row64 < 0 || row64 >= (int64_t)p.Out) continue;
        uint row = (uint)row64;
        acc += gy[n * p.Out + row] * weight[row * p.In + c];
    }
    grad_x[n * p.In + c] = acc;
}
experiments/sparse_linear_v11_gather_vs_metal/sparse_linear_ops.mm ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/extension.h>
2
+ #include <ATen/ATen.h>
3
+ #include <ATen/mps/MPSStream.h>
4
+ #include <ATen/mps/MPSDevice.h>
5
+ #include <c10/util/Exception.h>
6
+ #include <filesystem>
7
+ #include <dlfcn.h>
8
+ #import <Metal/Metal.h>
9
+ #import <Foundation/Foundation.h>
10
+ #include <mutex>
11
+
12
+ namespace fs = std::filesystem;
13
+
14
namespace {
// Host-side mirror of the Metal SparseLinearParams struct; the field order
// and widths must match sparse_linear.metal exactly.
struct SparseLinearParams {
    uint32_t N;
    uint32_t In;
    uint32_t Out;
    uint32_t K;
};

// Process-wide, lazily-initialized Metal objects, guarded by g_mutex.
static id<MTLLibrary> g_lib = nil;
static id<MTLComputePipelineState> g_pipeline_grad_w = nil;
static id<MTLComputePipelineState> g_pipeline_grad_x = nil;
static std::mutex g_mutex;

// Resolve the path of the compiled metallib, assumed to be installed next to
// this extension's shared object (dladdr on our own symbol finds that dir).
static std::string metallib_path_for_this_module() {
    Dl_info info;
    if (dladdr((void*)&metallib_path_for_this_module, &info) == 0 || info.dli_fname == nullptr) return std::string();
    fs::path so_path(info.dli_fname);
    return (so_path.parent_path() / "sparse_linear_ops.metallib").string();
}

// Load the metallib exactly once. Caller must already hold g_mutex.
static void ensure_library_locked(id<MTLDevice> device) {
    if (g_lib != nil) return;
    std::string path = metallib_path_for_this_module();
    TORCH_CHECK(!path.empty(), "sparse_linear_ops: failed to locate extension path via dladdr");
    NSString* ns_path = [NSString stringWithUTF8String:path.c_str()];
    NSURL* url = [NSURL fileURLWithPath:ns_path];
    NSError* err = nil;
    g_lib = [device newLibraryWithURL:url error:&err];
    if (g_lib == nil) {
        const char* msg = err ? [[err localizedDescription] UTF8String] : "unknown error";
        TORCH_CHECK(false, "sparse_linear_ops: failed to load metallib at ", path, ": ", msg);
    }
}

// Thread-safe get-or-create of the compute pipeline for `fn_name`.
static id<MTLComputePipelineState> ensure_pipeline(id<MTLDevice> device, id<MTLComputePipelineState>* pipeline, const char* fn_name) {
    std::lock_guard<std::mutex> lock(g_mutex);
    ensure_library_locked(device);
    if (*pipeline != nil) return *pipeline;
    NSString* ns_fn = [NSString stringWithUTF8String:fn_name];
    id<MTLFunction> fn = [g_lib newFunctionWithName:ns_fn];
    TORCH_CHECK(fn != nil, "sparse_linear_ops: function `", fn_name, "` not found in metallib");
    NSError* err = nil;
    *pipeline = [device newComputePipelineStateWithFunction:fn error:&err];
    if (*pipeline == nil) {
        const char* msg = err ? [[err localizedDescription] UTF8String] : "unknown error";
        TORCH_CHECK(false, "sparse_linear_ops: failed to create pipeline for ", fn_name, ": ", msg);
    }
    return *pipeline;
}

// Reinterpret an MPS tensor's storage context as its backing MTLBuffer.
static inline id<MTLBuffer> storage_as_mtlbuffer(const at::Tensor& t) {
    void* ctx = t.storage().data_ptr().get_context();
    TORCH_CHECK(ctx != nullptr, "sparse_linear_ops: expected MPS tensor storage with MTLBuffer context");
    return (__bridge id<MTLBuffer>)ctx;
}

// Byte offset of the tensor's first element within its storage buffer.
static inline NSUInteger storage_offset_bytes(const at::Tensor& t) {
    return (NSUInteger)(t.storage_offset() * (int64_t)t.element_size());
}

// Shared argument validation: MPS device, fp32 dtype, contiguous layout.
static void check_mps_float_contig(const at::Tensor& t, const char* name) {
    TORCH_CHECK(t.device().is_mps(), name, " must be on MPS");
    TORCH_CHECK(t.dtype() == at::kFloat, name, " must be float32 for v10 kernel");
    TORCH_CHECK(t.is_contiguous(), name, " must be contiguous");
}
} // namespace
80
+
81
// Compute dW/db for the active output rows only (fp32, MPS backend).
// Returns {grad_w [Out, In], grad_b [Out]}: dense tensors that stay zero
// outside the active rows (and for any out-of-range entries in active_idx,
// which the kernel skips).
std::vector<at::Tensor> sparse_linear_grad_wb(at::Tensor x2d, at::Tensor gy2d, at::Tensor active_idx, int64_t out_features) {
    check_mps_float_contig(x2d, "x2d");
    check_mps_float_contig(gy2d, "gy2d");
    TORCH_CHECK(active_idx.device().is_mps(), "active_idx must be on MPS");
    TORCH_CHECK(active_idx.dtype() == at::kLong, "active_idx must be int64");
    TORCH_CHECK(active_idx.is_contiguous(), "active_idx must be contiguous");
    TORCH_CHECK(x2d.dim() == 2 && gy2d.dim() == 2 && active_idx.dim() == 1, "bad dims");
    TORCH_CHECK(x2d.size(0) == gy2d.size(0), "x2d and gy2d N mismatch");
    TORCH_CHECK(gy2d.size(1) == out_features, "gy2d width must equal out_features");

    int64_t N = x2d.size(0);
    int64_t In = x2d.size(1);
    int64_t Out = out_features;
    int64_t K = active_idx.numel();
    // The kernel only writes active rows, so outputs must start zeroed.
    auto grad_w = at::zeros({Out, In}, x2d.options());
    auto grad_b = at::zeros({Out}, x2d.options());
    if (K == 0) return {grad_w, grad_b};

    id<MTLDevice> device = (id<MTLDevice>)at::mps::MPSDevice::getInstance()->device();
    id<MTLComputePipelineState> pipeline = ensure_pipeline(device, &g_pipeline_grad_w, "sparse_linear_grad_w_float");
    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
    TORCH_CHECK(stream != nullptr, "failed to get current MPS stream");
    // Flush any coalesced ATen kernels so our encoder sees current buffers.
    stream->endKernelCoalescing();
    id<MTLComputeCommandEncoder> encoder = (id<MTLComputeCommandEncoder>)stream->commandEncoder();
    TORCH_CHECK(encoder != nil, "failed to get MTLComputeCommandEncoder");
    [encoder setComputePipelineState:pipeline];

    auto set_tensor = [&](const at::Tensor& t, int idx) {
        [encoder setBuffer:storage_as_mtlbuffer(t) offset:storage_offset_bytes(t) atIndex:(NSUInteger)idx];
    };
    set_tensor(x2d, 0);
    set_tensor(gy2d, 1);
    set_tensor(active_idx, 2);
    set_tensor(grad_w, 3);
    set_tensor(grad_b, 4);
    SparseLinearParams prm{(uint32_t)N, (uint32_t)In, (uint32_t)Out, (uint32_t)K};
    [encoder setBytes:&prm length:sizeof(SparseLinearParams) atIndex:5];

    // Grid is expressed in threadgroups: ceil(In/16) x ceil(K/16) groups of
    // 16x16 threads; the kernel bounds-checks the padded remainder.
    MTLSize tg = MTLSizeMake(16, 16, 1);
    MTLSize grid = MTLSizeMake((NSUInteger)((In + 15) / 16), (NSUInteger)((K + 15) / 16), 1);
    [encoder dispatchThreadgroups:grid threadsPerThreadgroup:tg];
    stream->endKernelCoalescing();
    return {grad_w, grad_b};
}
125
+
126
// Compute the input gradient restricted to the active output rows:
// grad_x = gy[:, active] @ weight[active, :], done on-device without
// materializing the gathered slices. Returns a dense [N, In] tensor.
at::Tensor sparse_linear_grad_x(at::Tensor gy2d, at::Tensor weight, at::Tensor active_idx) {
    check_mps_float_contig(gy2d, "gy2d");
    check_mps_float_contig(weight, "weight");
    TORCH_CHECK(active_idx.device().is_mps(), "active_idx must be on MPS");
    TORCH_CHECK(active_idx.dtype() == at::kLong, "active_idx must be int64");
    TORCH_CHECK(active_idx.is_contiguous(), "active_idx must be contiguous");
    TORCH_CHECK(gy2d.dim() == 2 && weight.dim() == 2 && active_idx.dim() == 1, "bad dims");
    int64_t N = gy2d.size(0);
    int64_t Out = gy2d.size(1);
    int64_t In = weight.size(1);
    int64_t K = active_idx.numel();
    TORCH_CHECK(weight.size(0) == Out, "weight out_features must match gy2d width");
    auto grad_x = at::zeros({N, In}, gy2d.options());
    if (K == 0) return grad_x;

    id<MTLDevice> device = (id<MTLDevice>)at::mps::MPSDevice::getInstance()->device();
    id<MTLComputePipelineState> pipeline = ensure_pipeline(device, &g_pipeline_grad_x, "sparse_linear_grad_x_float");
    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
    TORCH_CHECK(stream != nullptr, "failed to get current MPS stream");
    // Flush any coalesced ATen kernels before encoding our own.
    stream->endKernelCoalescing();
    id<MTLComputeCommandEncoder> encoder = (id<MTLComputeCommandEncoder>)stream->commandEncoder();
    TORCH_CHECK(encoder != nil, "failed to get MTLComputeCommandEncoder");
    [encoder setComputePipelineState:pipeline];
    auto set_tensor = [&](const at::Tensor& t, int idx) {
        [encoder setBuffer:storage_as_mtlbuffer(t) offset:storage_offset_bytes(t) atIndex:(NSUInteger)idx];
    };
    set_tensor(gy2d, 0);
    set_tensor(weight, 1);
    set_tensor(active_idx, 2);
    set_tensor(grad_x, 3);
    SparseLinearParams prm{(uint32_t)N, (uint32_t)In, (uint32_t)Out, (uint32_t)K};
    [encoder setBytes:&prm length:sizeof(SparseLinearParams) atIndex:4];
    // ceil(In/16) x ceil(N/16) threadgroups of 16x16 threads.
    MTLSize tg = MTLSizeMake(16, 16, 1);
    MTLSize grid = MTLSizeMake((NSUInteger)((In + 15) / 16), (NSUInteger)((N + 15) / 16), 1);
    [encoder dispatchThreadgroups:grid threadsPerThreadgroup:tg];
    stream->endKernelCoalescing();
    return grad_x;
}
164
+
165
// Python bindings: importable as `sparse_linear` with the two ops below.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("sparse_linear_grad_wb", &sparse_linear_grad_wb, "Sparse active-row Linear dW/db (Metal/MPS, fp32)");
    m.def("sparse_linear_grad_x", &sparse_linear_grad_x, "Sparse active-row Linear dX (Metal/MPS, fp32)");
}
experiments/sparse_linear_v11_gather_vs_metal/sparse_transformer_v11.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v12: Hardware-Sympathetic Chunked Sparsity.
3
+
4
+ This version groups rows into hardware-friendly "Chunks" (e.g., 64 rows per chunk).
5
+ By selecting entire chunks, PyTorch can use zero-copy Strided Views. This completely
6
+ bypasses the slow index_select/gather memory copying and feeds data directly into
7
+ the AMX / Tensor Cores at native dense speeds.
8
+
9
+ Run:
10
+ python3 sparse_transformer_v11.py --device mps --benchmark_sync
11
+ """
12
+
13
+ import argparse
14
+ import math
15
+ import random
16
+ import time
17
+ from typing import Dict, List, Literal, Optional, Tuple
18
+
19
+ import torch
20
+
21
+ torch.set_num_threads(1)
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+
25
def sync_device(device: str) -> None:
    """Drain pending GPU work for `device`; CPU/unknown devices are a no-op."""
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps":
        if hasattr(torch, "mps"):
            torch.mps.synchronize()
30
+
31
+ Policy = Literal["predicted_magnitude", "oracle_current", "random"]
32
+ BackwardMode = Literal["dense_baseline", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
33
+
34
def set_seed(seed: int) -> None:
    """Seed both the stdlib and torch RNGs for reproducible runs."""
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
37
+
38
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a CPU torch.Generator pre-seeded with `seed`."""
    # Generator.manual_seed returns the generator itself, so this chains.
    return torch.Generator(device="cpu").manual_seed(seed)
42
+
43
+ # -----------------------------
44
+ # Data
45
+ # -----------------------------
46
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Build a deterministic toy corpus of newline-separated sentences.

    Each sentence is 4-10 random words from a tiny fixed vocabulary,
    terminated with a period. Same seed -> identical corpus.
    """
    rng = random.Random(seed)
    vocab = ["ada", "turing", "grace", "lovelace", "gradients", "tokens", "circuits", "features", "boldly", "strangely"]
    sentences = []
    for _ in range(n_sentences):
        # randint must be drawn before choices to preserve the RNG call order.
        length = rng.randint(4, 10)
        sentences.append(" ".join(rng.choices(vocab, k=length)) + ".")
    return "\n".join(sentences)
50
+
51
class CharCorpus:
    """Character-level dataset with a 90/10 train/validation split.

    Builds char<->id vocabularies from `text`, encodes the full text as a
    LongTensor, and serves random fixed-length windows plus their
    one-step-shifted targets.
    """

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device
        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        split_at = int(0.9 * len(encoded))
        self.train_data = encoded[:split_at]
        self.val_data = encoded[split_at:]

    def get_batch(self, split: str, batch_size: int, generator: Optional[torch.Generator] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample `batch_size` random (input, next-char target) windows."""
        source = self.train_data if split == "train" else self.val_data
        starts = torch.randint(len(source) - self.block_size - 1, (batch_size,), generator=generator)
        inputs = torch.stack([source[s : s + self.block_size] for s in starts])
        # Targets are the same windows shifted one character to the right.
        targets = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return inputs.to(self.device), targets.to(self.device)
69
+
70
+ # -----------------------------
71
+ # Chunked Sparse Autograd
72
+ # -----------------------------
73
+
74
class ChunkedMaskedLinear(torch.autograd.Function):
    """Dense-forward Linear whose backward only fills gradients for the
    selected output-row chunks.

    The forward pass is a plain ``F.linear``; all sparsity lives in
    ``backward``, where contiguous chunk slices (zero-copy strided views)
    keep every matmul dense and hardware friendly.
    """

    @staticmethod
    def forward(ctx, x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], active_chunks: torch.Tensor, chunk_size: int, sparse_dx: bool) -> torch.Tensor:
        ctx.save_for_backward(x, weight, active_chunks)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = sparse_dx
        ctx.chunk_size = chunk_size
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):
        x, weight, active_chunks = ctx.saved_tensors
        cs = ctx.chunk_size

        in_flat = x.reshape(-1, x.shape[-1])
        out_grad = grad_y.reshape(-1, grad_y.shape[-1])

        # Weight/bias gradients start fully zeroed; only active chunks fill in.
        grad_w = torch.zeros_like(weight)
        grad_b = None
        if ctx.has_bias:
            grad_b = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype)

        # Full dX is one dense matmul; sparse dX accumulates per chunk below.
        grad_in = torch.zeros_like(in_flat) if ctx.sparse_dx else out_grad @ weight

        for chunk in active_chunks.tolist():
            lo = chunk * cs
            hi = lo + cs

            # Contiguous slices are zero-copy views — no gather, no copies.
            g_out = out_grad[:, lo:hi]

            # Dense hardware matmul on the chunk slice.
            grad_w[lo:hi, :] = g_out.t() @ in_flat

            if grad_b is not None:
                grad_b[lo:hi] = g_out.sum(dim=0)

            if ctx.sparse_dx:
                grad_in += g_out @ weight[lo:hi, :]

        return grad_in.reshape(x.shape), grad_w, grad_b, None, None, None
119
+
120
class SparseLinear(nn.Linear):
    """nn.Linear that can route its backward pass through ChunkedMaskedLinear.

    The training loop toggles behavior per step via mutable attributes:
      sparse_enabled -- when False (default) behaves exactly like nn.Linear.
      sparse_dx      -- when True, the input gradient is also restricted to
                        the active chunks; otherwise dX stays dense.
      active_chunks  -- 1-D tensor of local chunk indices, or None (dense).
      chunk_size     -- rows per chunk; must match the masker's chunk size.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True, chunk_size: int = 64):
        super().__init__(in_features, out_features, bias=bias)
        self.sparse_enabled = False
        self.sparse_dx = False
        self.active_chunks: Optional[torch.Tensor] = None
        # Fix: previously forward() used getattr(self, 'chunk_size', 64) with a
        # hidden fallback; an explicit attribute (still assignable from outside,
        # default unchanged at 64) makes the effective chunk size visible.
        self.chunk_size = chunk_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Dense fast path when sparsity is disabled or no chunks were chosen.
        if not self.sparse_enabled or self.active_chunks is None:
            return F.linear(x, self.weight, self.bias)
        return ChunkedMaskedLinear.apply(x, self.weight, self.bias, self.active_chunks, self.chunk_size, self.sparse_dx)
131
+
132
+ # -----------------------------
133
+ # Mini GPT
134
+ # -----------------------------
135
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built on SparseLinear projections."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular causal mask, broadcastable over (batch, head).
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        q, k, v = self.c_attn(x).split(C, dim=2)
        head_shape = (B, T, self.n_head, self.head_dim)
        q = q.view(*head_shape).transpose(1, 2)
        k = k.view(*head_shape).transpose(1, 2)
        v = v.view(*head_shape).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
160
+
161
class FeedForward(nn.Module):
    """Position-wise MLP: expand 4x, GELU, project back, then dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
170
+
171
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
183
+
184
class MiniGPT(nn.Module):
    """Small GPT: token + position embeddings, N pre-norm blocks, dense head."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        layers = [Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = nn.LayerNorm(n_embd)

        # Standard nn.Linear head: not chunk-restricted, so the vocab
        # projection always receives dense gradients.
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)[None, :, :]
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
208
+
209
def get_sparse_linears(model):
    """Collect every SparseLinear submodule of `model`, in modules() order."""
    return [module for module in model.modules() if isinstance(module, SparseLinear)]
211
+
212
+ # -----------------------------
213
+ # Chunk Masker
214
+ # -----------------------------
215
class ChunkMasker:
    """Selects which weight-row chunks receive gradients on each step.

    Chunks are numbered globally across all SparseLinear modules; each
    module is handed the *local* indices of its active chunks. The
    predicted_magnitude policy ranks chunks by an EMA of the gradient norms
    observed on previously-active chunks.
    """

    def __init__(self, model: nn.Module, policy: Policy, active_fraction: float, chunk_size: int, device: str):
        self.policy = policy
        self.active_fraction = active_fraction
        self.chunk_size = chunk_size
        self.device = device

        self.linears = get_sparse_linears(model)
        self.module_to_chunk_ids = {}
        next_id = 0
        for lin in self.linears:
            assert lin.out_features % chunk_size == 0, f"out_features {lin.out_features} not divisible by chunk size {chunk_size}"
            count = lin.out_features // chunk_size
            self.module_to_chunk_ids[lin] = torch.arange(next_id, next_id + count, device=device)
            next_id += count

        self.n_chunks = next_id
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        self.active_chunks = torch.zeros(self.n_chunks, dtype=torch.bool, device=device)

    def choose_active(self, step: int, warmup_steps: int):
        """Pick this step's active chunk set and push it into each module."""
        if step < warmup_steps:
            # Warmup: everything active so the EMA observes every chunk.
            self.active_chunks.fill_(True)
            for lin, ids in self.module_to_chunk_ids.items():
                lin.active_chunks = torch.arange(len(ids), device=self.device)
            return

        budget = max(1, int(self.active_fraction * self.n_chunks))
        self.active_chunks.fill_(False)

        if self.policy == "random":
            picks = torch.randperm(self.n_chunks, device=self.device)[:budget]
            self.active_chunks[picks] = True
        elif self.policy == "predicted_magnitude":
            # Tiny noise breaks ties between equal-mass chunks.
            noisy = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            self.active_chunks[torch.topk(noisy, k=budget).indices] = True

        # Translate the global mask back to per-module local chunk indices.
        for lin, ids in self.module_to_chunk_ids.items():
            selected = self.active_chunks[ids]
            lin.active_chunks = torch.arange(len(ids), device=self.device)[selected]

    @torch.no_grad()
    def update_predictor(self, mass_beta=0.95):
        """EMA-update predicted gradient mass for chunks seen this step."""
        observed_mass = torch.zeros_like(self.predicted_mass)
        for lin, ids in self.module_to_chunk_ids.items():
            if lin.weight.grad is None:
                continue
            # Per-chunk squared mass: [Out, In] -> [n_chunks, chunk_size, In].
            sq = lin.weight.grad.square().view(len(ids), self.chunk_size, -1).sum(dim=(1, 2))
            if lin.bias is not None and lin.bias.grad is not None:
                sq += lin.bias.grad.square().view(len(ids), self.chunk_size).sum(dim=1)
            observed_mass[ids] = torch.sqrt(sq + 1e-30)

        # Only chunks that were active this step contribute observations.
        seen = self.active_chunks
        self.predicted_mass[seen] = mass_beta * self.predicted_mass[seen] + (1.0 - mass_beta) * observed_mass[seen]
272
+
273
+ # -----------------------------
274
+ # Chunked Adam
275
+ # -----------------------------
276
class ChunkedAdam:
    """Adam variant that only updates moments and weights on active chunks.

    Non-sparse parameters (embeddings, layernorms, LM head) get ordinary
    dense Adam updates; SparseLinear parameters are updated chunk by chunk,
    so inactive rows keep stale weights AND stale optimizer state. No bias
    correction is applied (matches the dense path for a fair benchmark).
    """

    def __init__(self, model, lr=3e-4, chunk_size=64):
        self.model = model
        self.lr = lr
        self.chunk_size = chunk_size
        self.state = {}

        # Map each sparse parameter back to its owning module so step() can
        # look up that module's currently-active chunks.
        self.param_to_sparse_module = {}
        for mod in get_sparse_linears(model):
            if mod.weight is not None:
                self.param_to_sparse_module[mod.weight] = mod
            if mod.bias is not None:
                self.param_to_sparse_module[mod.bias] = mod

    def zero_grad(self):
        """Drop all gradients (sets .grad to None rather than zeroing)."""
        for param in self.model.parameters():
            param.grad = None

    @torch.no_grad()
    def step(self):
        """Apply one (bias-uncorrected) Adam update to all gradded params."""
        for param in self.model.parameters():
            grad = param.grad
            if grad is None:
                continue
            if param not in self.state:
                self.state[param] = {"m": torch.zeros_like(param), "v": torch.zeros_like(param)}
            moments = self.state[param]
            m, v = moments["m"], moments["v"]

            owner = self.param_to_sparse_module.get(param)
            active = getattr(owner, "active_chunks", None) if owner else None

            if active is None:
                # Dense path: embeddings, layernorms, LM head, or baseline.
                m.mul_(0.9).add_(grad, alpha=0.1)
                v.mul_(0.999).addcmul_(grad, grad, value=0.001)
                param.sub_(m / (torch.sqrt(v) + 1e-8), alpha=self.lr)
                continue

            # Sparse path: touch ONLY the active chunks (local indices).
            for local in active.tolist():
                lo = local * self.chunk_size
                hi = (local + 1) * self.chunk_size

                g_chunk = grad[lo:hi]
                m_chunk = m[lo:hi]
                v_chunk = v[lo:hi]

                m_chunk.mul_(0.9).add_(g_chunk, alpha=0.1)
                v_chunk.mul_(0.999).addcmul_(g_chunk, g_chunk, value=0.001)

                param[lo:hi].sub_(m_chunk / (torch.sqrt(v_chunk) + 1e-8), alpha=self.lr)
327
+ # -----------------------------
328
+ # Training
329
+ # -----------------------------
330
def main():
    """Benchmark dense training against chunked-sparse backward variants.

    Trains the same MiniGPT on a synthetic corpus in three configurations
    (dense baseline, sparse dW with full dX, sparse dW with sparse dX) and
    prints wall-clock time per run/step plus one validation-batch loss.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=50)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=16)
    parser.add_argument("--n_embd", type=int, default=1024)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.05)
    parser.add_argument("--device", type=str, default="mps")
    # Synchronize the accelerator around the timed region for honest timings.
    parser.add_argument("--benchmark_sync", action="store_true")
    args = parser.parse_args()

    corpus = CharCorpus(make_synthetic_corpus(), args.block_size, args.device)

    # (masking policy, backward mode) pairs to benchmark.
    modes =[
        ("dense_baseline", "dense_baseline"),
        ("predicted_magnitude", "sparse_dW_full_dX"),
        ("predicted_magnitude", "sparse_dW_sparse_dX")
    ]

    print(f"\nModel: {args.n_layer} layers, {args.n_embd} d_model, {args.chunk_size} chunk_size")
    print(f"Batch: {args.batch_size}, Block: {args.block_size}. Active Fraction: {args.active_fraction}\n")
    print(f"{'Run':>20s} | {'Time (s)':>10s} | {'Step (ms)':>10s} | {'Val Loss':>8s}")
    print("-" * 55)

    for policy, bwd_mode in modes:
        # Re-seed per run so every configuration starts from identical weights.
        set_seed(42)
        model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, 0.0).to(args.device)

        for m in get_sparse_linears(model):
            m.chunk_size = args.chunk_size

        masker = ChunkMasker(model, policy, args.active_fraction, args.chunk_size, args.device) if policy != "dense_baseline" else None
        opt = ChunkedAdam(model, chunk_size=args.chunk_size)

        if args.benchmark_sync: sync_device(args.device)
        t0 = time.perf_counter()

        for step in range(args.steps):
            # Fixed per-step generator -> identical batch order across runs.
            x, y = corpus.get_batch("train", args.batch_size, generator=make_cpu_generator(step))

            if masker:
                masker.choose_active(step, warmup_steps=5)
                for m in get_sparse_linears(model):
                    m.sparse_enabled = True
                    m.sparse_dx = (bwd_mode == "sparse_dW_sparse_dX")
            else:
                # Dense baseline: disable sparsity entirely.
                for m in get_sparse_linears(model):
                    m.sparse_enabled = False
                    m.active_chunks = None

            opt.zero_grad()
            _, loss = model(x, y)
            loss.backward()

            # Feed the observed chunk gradient norms back into the predictor
            # before the optimizer consumes (and the next step clears) them.
            if masker:
                masker.update_predictor()

            opt.step()

        if args.benchmark_sync: sync_device(args.device)
        t_elapsed = time.perf_counter() - t0

        # Eval loss
        model.eval()
        with torch.no_grad():
            # Same fixed generator for all runs -> comparable val batches.
            x, y = corpus.get_batch("val", args.batch_size, generator=make_cpu_generator(999))
            _, val_loss = model(x, y)

        # Format the mode strictly for the printout width
        bwd_str = bwd_mode if bwd_mode == "dense_baseline" else ("sparse_full_dX" if "full_dX" in bwd_mode else "sparse_sparse_dX")
        print(f"{bwd_str:>20s} | {t_elapsed:10.2f} | {1000*t_elapsed/args.steps:10.2f} | {val_loss.item():8.4f}")

if __name__ == "__main__":
    main()
experiments/sparse_linear_v11_gather_vs_metal/sparse_transformer_v13.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v14: The Final Architecture.
3
+
4
+ Combines the Hardware-Sympathetic Chunked Sparse backward pass with Cosine Annealing,
5
+ but restores the Chunked Optimizer to prevent Dense memory-bandwidth bottlenecks.
6
+ Benchmarks are isolated to the steady-state phase (after annealing) for accurate timings.
7
+
8
+ Run:
9
+ python3 sparse_transformer_v14.py --device mps --benchmark_sync --n_embd 1024
10
+ """
11
+
12
+ import argparse
13
+ import math
14
+ import random
15
+ import time
16
+ from typing import Dict, List, Literal, Optional, Tuple
17
+
18
+ import torch
19
+
20
+ torch.set_num_threads(1)
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
def sync_device(device: str) -> None:
    """Block until all queued kernels on *device* finish (no-op for CPU)."""
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
30
+ Policy = Literal["predicted_magnitude", "oracle_current", "random"]
31
+ BackwardMode = Literal["dense_baseline", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
32
+
33
def set_seed(seed: int) -> None:
    """Seed both torch's and Python's ``random`` RNGs for reproducible runs."""
    torch.manual_seed(seed)
    random.seed(seed)
36
+
37
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a CPU ``torch.Generator`` already seeded with *seed*."""
    # manual_seed() returns the generator itself, so this can be one expression.
    return torch.Generator(device="cpu").manual_seed(seed)
41
+
42
+ # -----------------------------
43
+ # Data
44
+ # -----------------------------
45
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Build a deterministic corpus of *n_sentences* random-word sentences."""
    rng = random.Random(seed)
    vocab = ["ada", "turing", "grace", "lovelace", "gradients", "tokens", "circuits", "features", "boldly", "strangely"]
    sentences = []
    for _ in range(n_sentences):
        # Keep the RNG call order identical: length first, then word choices.
        length = rng.randint(4, 10)
        sentences.append(" ".join(rng.choices(vocab, k=length)) + ".")
    return "\n".join(sentences)
49
+
50
class CharCorpus:
    """Character-level corpus with a deterministic 90/10 train/val split."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {c: i for i, c in enumerate(vocab)}
        self.itos = {i: c for c, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device
        encoded = torch.tensor([self.stoi[c] for c in text], dtype=torch.long)
        split = int(0.9 * len(encoded))
        self.train_data = encoded[:split]
        self.val_data = encoded[split:]

    def get_batch(self, split: str, batch_size: int, generator: Optional[torch.Generator] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample a random batch of (input window, next-char target window)."""
        source = self.train_data if split == "train" else self.val_data
        starts = torch.randint(len(source) - self.block_size - 1, (batch_size,), generator=generator)
        x = torch.stack([source[s : s + self.block_size] for s in starts])
        y = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return x.to(self.device), y.to(self.device)
68
+
69
+ # -----------------------------
70
+ # Chunked Sparse Autograd
71
+ # -----------------------------
72
+
73
class ChunkedMaskedLinear(torch.autograd.Function):
    """Linear op whose backward only materialises gradients for active chunks.

    The forward pass is a plain dense ``F.linear``; sparsity applies only in
    backward, where dW/db (and optionally dX) are computed just for the rows
    of W listed in ``active_chunks``. Inactive chunks receive zero gradient.
    """

    @staticmethod
    def forward(ctx, x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], active_chunks: torch.Tensor, chunk_size: int, sparse_dx: bool) -> torch.Tensor:
        ctx.save_for_backward(x, weight, active_chunks)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = sparse_dx
        ctx.chunk_size = chunk_size
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):
        x, weight, active_chunks = ctx.saved_tensors
        cs = ctx.chunk_size

        x2d = x.reshape(-1, x.shape[-1])
        gy2d = grad_y.reshape(-1, grad_y.shape[-1])

        grad_w = torch.zeros_like(weight)
        grad_b = None
        if ctx.has_bias:
            grad_b = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype)

        # Full dX is one dense matmul; sparse dX is accumulated chunk by chunk.
        grad_x2d = torch.zeros_like(x2d) if ctx.sparse_dx else gy2d @ weight

        # Contiguous row slices feed directly into dense hardware matmuls.
        for chunk in active_chunks.tolist():
            lo = chunk * cs
            hi = lo + cs
            gy_c = gy2d[:, lo:hi]
            grad_w[lo:hi, :] = gy_c.t() @ x2d
            if grad_b is not None:
                grad_b[lo:hi] = gy_c.sum(dim=0)
            if ctx.sparse_dx:
                grad_x2d += gy_c @ weight[lo:hi, :]

        return grad_x2d.reshape(x.shape), grad_w, grad_b, None, None, None
115
+
116
class SparseLinear(nn.Linear):
    """``nn.Linear`` that can route through the chunked-sparse backward pass.

    With ``sparse_enabled`` off (the default) it behaves exactly like a plain
    linear layer; once a masker assigns ``active_chunks`` and the flag is set,
    the forward is dispatched through ``ChunkedMaskedLinear``.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        self.sparse_enabled = False   # toggled each step by the training loop
        self.sparse_dx = False        # also sparsify dX (vs. always-dense dX)
        self.active_chunks: Optional[torch.Tensor] = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        use_sparse = self.sparse_enabled and self.active_chunks is not None
        if not use_sparse:
            return F.linear(x, self.weight, self.bias)
        chunk = getattr(self, 'chunk_size', 64)
        return ChunkedMaskedLinear.apply(x, self.weight, self.bias, self.active_chunks, chunk, self.sparse_dx)
127
+
128
+ # -----------------------------
129
+ # Mini GPT
130
+ # -----------------------------
131
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built from sparse-capable projections."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)  # fused q, k, v projection
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = (split_heads(t) for t in self.c_attn(x).split(C, dim=2))
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
156
+
157
class FeedForward(nn.Module):
    """Standard 4x-expansion GELU MLP built from sparse-capable linears."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
166
+
167
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
179
+
180
class MiniGPT(nn.Module):
    """Small GPT: token+position embeddings, N blocks, final norm, dense LM head."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*(Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)))
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when no targets are given."""
        _, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)[None, :, :]
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
201
+
202
def get_sparse_linears(model):
    """Collect every SparseLinear submodule of *model* in traversal order."""
    found = []
    for module in model.modules():
        if isinstance(module, SparseLinear):
            found.append(module)
    return found
204
+
205
+ # -----------------------------
206
+ # Chunk Masker with Annealing
207
+ # -----------------------------
208
class ChunkMasker:
    """Chooses which output chunks are trained each step, with cosine annealing.

    All SparseLinear layers share one global chunk index space. An EMA of
    observed per-chunk gradient mass (``predicted_mass``) drives the
    "predicted_magnitude" policy; only chunks that were active (and thus
    actually received gradients) get their prediction refreshed.
    """

    def __init__(self, model: nn.Module, policy: Policy, target_fraction: float, chunk_size: int, device: str):
        self.policy = policy
        self.target_fraction = target_fraction
        self.chunk_size = chunk_size
        self.device = device

        self.linears = get_sparse_linears(model)
        self.module_to_chunk_ids = {}
        offset = 0
        for m in self.linears:
            assert m.out_features % chunk_size == 0, f"out_features {m.out_features} not divisible by chunk size {chunk_size}"
            n_chunks = m.out_features // chunk_size
            self.module_to_chunk_ids[m] = torch.arange(offset, offset + n_chunks, device=device)
            offset += n_chunks

        self.n_chunks = offset
        # EMA of per-chunk gradient magnitude; scores for the topk selection.
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        # Global boolean mask of the currently active chunks.
        self.active_chunks = torch.zeros(self.n_chunks, dtype=torch.bool, device=device)

    def choose_active(self, step: int, warmup_steps: int, anneal_steps: int) -> None:
        """Pick this step's active chunk set and push local ids to each layer.

        Schedule: dense warmup -> cosine anneal -> steady ``target_fraction``.

        Raises:
            ValueError: for a policy with no selection rule (previously such
                policies silently selected ZERO chunks, freezing all layers).
        """
        if step < warmup_steps:
            current_fraction = 1.0
        elif step < warmup_steps + anneal_steps:
            progress = (step - warmup_steps) / anneal_steps
            cosine_mult = 0.5 * (1.0 + math.cos(math.pi * progress))
            current_fraction = self.target_fraction + (1.0 - self.target_fraction) * cosine_mult
        else:
            current_fraction = self.target_fraction

        if current_fraction >= 0.999:
            # Effectively dense: activate everything without a top-k pass.
            self.active_chunks.fill_(True)
            for m, ids in self.module_to_chunk_ids.items():
                m.active_chunks = torch.arange(len(ids), device=self.device)
            return

        k = max(1, int(current_fraction * self.n_chunks))
        self.active_chunks.fill_(False)

        if self.policy == "random":
            self.active_chunks[torch.randperm(self.n_chunks, device=self.device)[:k]] = True
        elif self.policy == "predicted_magnitude":
            # Tiny noise breaks ties between equal (e.g. still-zero) scores.
            scores = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            self.active_chunks[torch.topk(scores, k=k).indices] = True
        else:
            # BUG FIX: unknown policies (e.g. "oracle_current", declared in the
            # Policy Literal but never implemented) used to fall through here
            # with zero active chunks, silently freezing every sparse layer.
            raise ValueError(f"unsupported chunk-selection policy: {self.policy!r}")

        # Translate the global mask into per-module local chunk indices.
        for m, ids in self.module_to_chunk_ids.items():
            global_active = self.active_chunks[ids]
            local_ids = torch.arange(len(ids), device=self.device)
            m.active_chunks = local_ids[global_active]

    @torch.no_grad()
    def update_predictor(self, mass_beta: float = 0.95) -> None:
        """EMA-update the predicted gradient mass for chunks observed this step."""
        current_mass = torch.zeros_like(self.predicted_mass)
        for m, ids in self.module_to_chunk_ids.items():
            if m.weight.grad is None:
                continue
            w_sq = m.weight.grad.square().view(len(ids), self.chunk_size, -1).sum(dim=(1, 2))
            if m.bias is not None and m.bias.grad is not None:
                w_sq += m.bias.grad.square().view(len(ids), self.chunk_size).sum(dim=1)
            current_mass[ids] = torch.sqrt(w_sq + 1e-30)

        # Only chunks that actually received gradients get refreshed scores.
        observed = self.active_chunks
        self.predicted_mass[observed] = mass_beta * self.predicted_mass[observed] + (1.0 - mass_beta) * current_mass[observed]
271
+
272
+
273
+ # -----------------------------
274
+ # Chunked Adam (Restored)
275
+ # -----------------------------
276
class ChunkedAdam:
    """Adam-style optimizer that updates only the active chunks of sparse params.

    Parameters not owned by a SparseLinear — or whose module has no
    ``active_chunks`` mask — get a dense update. NOTE: deliberately omits
    Adam's bias correction, matching the benchmark baseline.
    """

    def __init__(self, model, lr=3e-4, chunk_size=64):
        self.model = model
        self.lr = lr
        self.chunk_size = chunk_size
        self.state = {}

        # Map each sparse-layer parameter back to its owning module so step()
        # can look up that module's current active-chunk mask.
        self.param_to_sparse_module = {}
        for m in get_sparse_linears(model):
            if m.weight is not None:
                self.param_to_sparse_module[m.weight] = m
            if m.bias is not None:
                self.param_to_sparse_module[m.bias] = m

    def zero_grad(self):
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self):
        for p in self.model.parameters():
            if p.grad is None:
                continue
            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}

            exp_avg = self.state[p]["m"]
            exp_avg_sq = self.state[p]["v"]

            owner = self.param_to_sparse_module.get(p)
            active_chunks = getattr(owner, 'active_chunks', None) if owner else None

            if active_chunks is None:
                # Dense update: embeddings, layernorms, LM head, or baseline mode.
                exp_avg.mul_(0.9).add_(p.grad, alpha=0.1)
                exp_avg_sq.mul_(0.999).addcmul_(p.grad, p.grad, value=0.001)
                p.sub_(exp_avg / (torch.sqrt(exp_avg_sq) + 1e-8), alpha=self.lr)
                continue

            # Sparse update: touch only the rows belonging to active chunks.
            for local_c in active_chunks.tolist():
                lo = local_c * self.chunk_size
                hi = lo + self.chunk_size

                g = p.grad[lo:hi]
                m_c = exp_avg[lo:hi]
                v_c = exp_avg_sq[lo:hi]

                m_c.mul_(0.9).add_(g, alpha=0.1)
                v_c.mul_(0.999).addcmul_(g, g, value=0.001)
                p[lo:hi].sub_(m_c / (torch.sqrt(v_c) + 1e-8), alpha=self.lr)
326
+
327
+ # -----------------------------
328
+ # Training
329
+ # -----------------------------
330
def _parse_args():
    """CLI options for the v14 dense-vs-sparse benchmark."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=16)
    parser.add_argument("--n_embd", type=int, default=1024)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.05)
    parser.add_argument("--warmup_steps", type=int, default=10)
    parser.add_argument("--anneal_steps", type=int, default=150)
    parser.add_argument("--device", type=str, default="mps")
    parser.add_argument("--benchmark_sync", action="store_true")
    return parser.parse_args()

def main():
    """Train the same model under dense and chunked-sparse modes and report timings."""
    args = _parse_args()
    corpus = CharCorpus(make_synthetic_corpus(), args.block_size, args.device)

    modes = [
        ("dense_baseline", "dense_baseline"),
        ("predicted_magnitude", "sparse_dW_full_dX"),
        ("predicted_magnitude", "sparse_dW_sparse_dX"),
    ]

    print(f"\nModel: {args.n_layer} layers, {args.n_embd} d_model, {args.chunk_size} chunk_size")
    print(f"Batch: {args.batch_size}, Block: {args.block_size}. Target Active Fraction: {args.active_fraction}")
    print(f"Annealing: {args.warmup_steps} warmup steps, {args.anneal_steps} anneal steps.\n")
    print(f"{'Run':>20s} | {'Time (s)':>10s} | {'Step (ms)':>10s} | {'Val Loss':>8s}")
    print("-" * 55)

    for policy, bwd_mode in modes:
        set_seed(42)
        model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, 0.0).to(args.device)
        for layer in get_sparse_linears(model):
            layer.chunk_size = args.chunk_size

        masker = None
        if policy != "dense_baseline":
            masker = ChunkMasker(model, policy, args.active_fraction, args.chunk_size, args.device)

        # Chunked optimizer: skips Adam state updates for inactive chunks.
        opt = ChunkedAdam(model, chunk_size=args.chunk_size)

        if args.benchmark_sync:
            sync_device(args.device)

        # The timer restarts once annealing finishes, so the reported numbers
        # reflect steady-state sparse speed rather than the dense warmup.
        t0 = time.perf_counter()
        measured_steps = args.steps

        for step in range(args.steps):
            if step == args.warmup_steps + args.anneal_steps:
                if args.benchmark_sync:
                    sync_device(args.device)
                t0 = time.perf_counter()
                measured_steps = args.steps - step

            x, y = corpus.get_batch("train", args.batch_size, generator=make_cpu_generator(step))

            if masker:
                masker.choose_active(step, warmup_steps=args.warmup_steps, anneal_steps=args.anneal_steps)
                for layer in get_sparse_linears(model):
                    layer.sparse_enabled = True
                    layer.sparse_dx = (bwd_mode == "sparse_dW_sparse_dX")
            else:
                for layer in get_sparse_linears(model):
                    layer.sparse_enabled = False
                    layer.active_chunks = None

            opt.zero_grad()
            _, loss = model(x, y)
            loss.backward()

            if masker:
                masker.update_predictor()

            opt.step()

        if args.benchmark_sync:
            sync_device(args.device)
        t_elapsed = time.perf_counter() - t0

        # Validation loss on a fixed batch.
        model.eval()
        with torch.no_grad():
            x, y = corpus.get_batch("val", args.batch_size, generator=make_cpu_generator(999))
            _, val_loss = model(x, y)

        bwd_str = bwd_mode if bwd_mode == "dense_baseline" else ("sparse_full_dX" if "full_dX" in bwd_mode else "sparse_sparse_dX")
        print(f"{bwd_str:>20s} | {t_elapsed:10.2f} | {1000*t_elapsed/max(1, measured_steps):10.2f} | {val_loss.item():8.4f}")

if __name__ == "__main__":
    main()
experiments/sparse_linear_v11_gather_vs_metal/tiny.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer: Real-World Benchmark on Tiny Shakespeare using GPT-2 BPE.
3
+
4
+ This script scales the architecture to a 6-layer, 512-dim GPT and trains on
5
+ real natural language. It applies our Hardware-Sympathetic Chunked Sparse
6
+ backward pass, Cosine Annealing, and Chunked Adam optimizer.
7
+
8
+ Run:
9
+ python3 sparse_transformer_shakespeare.py --device mps --benchmark_sync
10
+ """
11
+
12
+ import argparse
13
+ import math
14
+ import os
15
+ import random
16
+ import time
17
+ import urllib.request
18
+ from typing import Dict, List, Literal, Optional, Tuple
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+
24
+ try:
25
+ import tiktoken
26
+ except ImportError:
27
+ raise ImportError("Please install tiktoken: pip install tiktoken")
28
+
29
+ torch.set_num_threads(1)
30
+
31
def sync_device(device: str) -> None:
    """Block until all queued kernels on *device* finish (no-op for CPU)."""
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
36
+
37
+ Policy = Literal["predicted_magnitude", "oracle_current", "random"]
38
+ BackwardMode = Literal["dense_baseline", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
39
+
40
def set_seed(seed: int) -> None:
    """Seed both torch's and Python's ``random`` RNGs for reproducible runs."""
    torch.manual_seed(seed)
    random.seed(seed)
43
+
44
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a CPU ``torch.Generator`` already seeded with *seed*."""
    # manual_seed() returns the generator itself, so this can be one expression.
    return torch.Generator(device="cpu").manual_seed(seed)
48
+
49
+ # -----------------------------
50
+ # Real-World Data Pipeline
51
+ # -----------------------------
52
class ShakespeareCorpus:
    """Tiny Shakespeare tokenized with GPT-2 BPE; deterministic 90/10 split.

    Downloads the raw text to ``input.txt`` on first use, then tokenizes the
    whole corpus with tiktoken's "gpt2" encoding.
    """

    def __init__(self, block_size: int, device: str):
        self.block_size = block_size
        self.device = device

        # Fetch the raw text once and cache it next to the script.
        data_path = "input.txt"
        if not os.path.exists(data_path):
            print("Downloading Tiny Shakespeare...")
            url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
            urllib.request.urlretrieve(url, data_path)

        print("Tokenizing data...")
        with open(data_path, "r", encoding="utf-8") as f:
            text = f.read()

        enc = tiktoken.get_encoding("gpt2")
        data = torch.tensor(enc.encode(text), dtype=torch.long)
        self.vocab_size = enc.n_vocab

        split_idx = int(0.9 * len(data))
        self.train_data = data[:split_idx]
        self.val_data = data[split_idx:]

        print(f"Dataset loaded. Vocab size: {self.vocab_size:,}. Train tokens: {len(self.train_data):,}")

    def get_batch(self, split: str, batch_size: int, generator: Optional[torch.Generator] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample a random batch of (input window, next-token target window)."""
        source = self.train_data if split == "train" else self.val_data
        ix = torch.randint(len(source) - self.block_size - 1, (batch_size,), generator=generator)
        x = torch.stack([source[i : i + self.block_size] for i in ix])
        y = torch.stack([source[i + 1 : i + self.block_size + 1] for i in ix])
        return x.to(self.device), y.to(self.device)
87
+
88
+ # -----------------------------
89
+ # Chunked Sparse Autograd
90
+ # -----------------------------
91
class ChunkedMaskedLinear(torch.autograd.Function):
    """Linear op whose backward only materialises gradients for active chunks.

    The forward pass is a plain dense ``F.linear``; sparsity applies only in
    backward, where dW/db (and optionally dX) are computed just for the rows
    of W listed in ``active_chunks``. Inactive chunks receive zero gradient.
    """

    @staticmethod
    def forward(ctx, x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], active_chunks: torch.Tensor, chunk_size: int, sparse_dx: bool) -> torch.Tensor:
        ctx.save_for_backward(x, weight, active_chunks)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = sparse_dx
        ctx.chunk_size = chunk_size
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):
        x, weight, active_chunks = ctx.saved_tensors
        cs = ctx.chunk_size

        x2d = x.reshape(-1, x.shape[-1])
        gy2d = grad_y.reshape(-1, grad_y.shape[-1])

        grad_w = torch.zeros_like(weight)
        grad_b = None
        if ctx.has_bias:
            grad_b = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype)

        # Full dX is one dense matmul; sparse dX is accumulated chunk by chunk.
        grad_x2d = torch.zeros_like(x2d) if ctx.sparse_dx else gy2d @ weight

        # Contiguous row slices feed directly into dense hardware matmuls.
        for chunk in active_chunks.tolist():
            lo = chunk * cs
            hi = lo + cs
            gy_c = gy2d[:, lo:hi]
            grad_w[lo:hi, :] = gy_c.t() @ x2d
            if grad_b is not None:
                grad_b[lo:hi] = gy_c.sum(dim=0)
            if ctx.sparse_dx:
                grad_x2d += gy_c @ weight[lo:hi, :]

        return grad_x2d.reshape(x.shape), grad_w, grad_b, None, None, None
133
+
134
class SparseLinear(nn.Linear):
    """``nn.Linear`` that can route through the chunked-sparse backward pass.

    With ``sparse_enabled`` off (the default) it behaves exactly like a plain
    linear layer; once a masker assigns ``active_chunks`` and the flag is set,
    the forward is dispatched through ``ChunkedMaskedLinear``.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        self.sparse_enabled = False   # toggled each step by the training loop
        self.sparse_dx = False        # also sparsify dX (vs. always-dense dX)
        self.active_chunks: Optional[torch.Tensor] = None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        use_sparse = self.sparse_enabled and self.active_chunks is not None
        if not use_sparse:
            return F.linear(x, self.weight, self.bias)
        chunk = getattr(self, 'chunk_size', 64)
        return ChunkedMaskedLinear.apply(x, self.weight, self.bias, self.active_chunks, chunk, self.sparse_dx)
145
+
146
+ # -----------------------------
147
+ # GPT Architecture
148
+ # -----------------------------
149
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built from sparse-capable projections."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)  # fused q, k, v projection
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = (split_heads(t) for t in self.c_attn(x).split(C, dim=2))
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
174
+
175
class FeedForward(nn.Module):
    """Standard 4x-expansion GELU MLP built from sparse-capable linears."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
184
+
185
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
197
+
198
class GPT(nn.Module):
    """GPT for Tiny Shakespeare: embeddings, N blocks, final norm, LM head.

    The LM head stays a plain dense ``nn.Linear`` because cross-entropy needs
    the full output distribution over the vocabulary.
    """

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*(Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)))
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when no targets are given."""
        _, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)[None, :, :]
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
220
+
221
def get_sparse_linears(model):
    """Collect every SparseLinear submodule of *model* in traversal order."""
    found = []
    for module in model.modules():
        if isinstance(module, SparseLinear):
            found.append(module)
    return found
223
+
224
+ # -----------------------------
225
+ # Chunk Masker with Annealing
226
+ # -----------------------------
227
class ChunkMasker:
    """Chooses which output chunks are trained each step, with cosine annealing.

    All SparseLinear layers share one global chunk index space. An EMA of
    observed per-chunk gradient mass (``predicted_mass``) drives the
    "predicted_magnitude" policy; only chunks that were active (and thus
    actually received gradients) get their prediction refreshed.
    """

    def __init__(self, model: nn.Module, policy: Policy, target_fraction: float, chunk_size: int, device: str):
        self.policy = policy
        self.target_fraction = target_fraction
        self.chunk_size = chunk_size
        self.device = device

        self.linears = get_sparse_linears(model)
        self.module_to_chunk_ids = {}
        offset = 0
        for m in self.linears:
            assert m.out_features % chunk_size == 0, f"out_features {m.out_features} not divisible by chunk size {chunk_size}"
            n_chunks = m.out_features // chunk_size
            self.module_to_chunk_ids[m] = torch.arange(offset, offset + n_chunks, device=device)
            offset += n_chunks

        self.n_chunks = offset
        # EMA of per-chunk gradient magnitude; scores for the topk selection.
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        # Global boolean mask of the currently active chunks.
        self.active_chunks = torch.zeros(self.n_chunks, dtype=torch.bool, device=device)

    def choose_active(self, step: int, warmup_steps: int, anneal_steps: int) -> None:
        """Pick this step's active chunk set and push local ids to each layer.

        Schedule: dense warmup -> cosine anneal -> steady ``target_fraction``.

        Raises:
            ValueError: for a policy with no selection rule (previously such
                policies silently selected ZERO chunks, freezing all layers).
        """
        if step < warmup_steps:
            current_fraction = 1.0
        elif step < warmup_steps + anneal_steps:
            progress = (step - warmup_steps) / anneal_steps
            cosine_mult = 0.5 * (1.0 + math.cos(math.pi * progress))
            current_fraction = self.target_fraction + (1.0 - self.target_fraction) * cosine_mult
        else:
            current_fraction = self.target_fraction

        if current_fraction >= 0.999:
            # Effectively dense: activate everything without a top-k pass.
            self.active_chunks.fill_(True)
            for m, ids in self.module_to_chunk_ids.items():
                m.active_chunks = torch.arange(len(ids), device=self.device)
            return

        k = max(1, int(current_fraction * self.n_chunks))
        self.active_chunks.fill_(False)

        if self.policy == "random":
            self.active_chunks[torch.randperm(self.n_chunks, device=self.device)[:k]] = True
        elif self.policy == "predicted_magnitude":
            # Tiny noise breaks ties between equal (e.g. still-zero) scores.
            scores = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            self.active_chunks[torch.topk(scores, k=k).indices] = True
        else:
            # BUG FIX: unknown policies (e.g. "oracle_current", declared in the
            # Policy Literal but never implemented) used to fall through here
            # with zero active chunks, silently freezing every sparse layer.
            raise ValueError(f"unsupported chunk-selection policy: {self.policy!r}")

        # Translate the global mask into per-module local chunk indices.
        for m, ids in self.module_to_chunk_ids.items():
            global_active = self.active_chunks[ids]
            local_ids = torch.arange(len(ids), device=self.device)
            m.active_chunks = local_ids[global_active]

    @torch.no_grad()
    def update_predictor(self, mass_beta: float = 0.95) -> None:
        """EMA-update the predicted gradient mass for chunks observed this step."""
        current_mass = torch.zeros_like(self.predicted_mass)
        for m, ids in self.module_to_chunk_ids.items():
            if m.weight.grad is None:
                continue
            w_sq = m.weight.grad.square().view(len(ids), self.chunk_size, -1).sum(dim=(1, 2))
            if m.bias is not None and m.bias.grad is not None:
                w_sq += m.bias.grad.square().view(len(ids), self.chunk_size).sum(dim=1)
            current_mass[ids] = torch.sqrt(w_sq + 1e-30)

        # Only chunks that actually received gradients get refreshed scores.
        observed = self.active_chunks
        self.predicted_mass[observed] = mass_beta * self.predicted_mass[observed] + (1.0 - mass_beta) * current_mass[observed]
289
+
290
+ # -----------------------------
291
+ # Chunked Adam
292
+ # -----------------------------
293
class ChunkedAdam:
    """Adam-style optimizer that updates only the active chunks of sparse params.

    Parameters not owned by a SparseLinear — or whose module has no
    ``active_chunks`` mask — get a dense update. NOTE: deliberately omits
    Adam's bias correction, matching the benchmark baseline.
    """

    def __init__(self, model, lr=5e-4, chunk_size=64):
        self.model = model
        self.lr = lr
        self.chunk_size = chunk_size
        self.state = {}

        # Map each sparse-layer parameter back to its owning module so step()
        # can look up that module's current active-chunk mask.
        self.param_to_sparse_module = {}
        for m in get_sparse_linears(model):
            if m.weight is not None:
                self.param_to_sparse_module[m.weight] = m
            if m.bias is not None:
                self.param_to_sparse_module[m.bias] = m

    def zero_grad(self):
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self):
        for p in self.model.parameters():
            if p.grad is None:
                continue
            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}

            exp_avg = self.state[p]["m"]
            exp_avg_sq = self.state[p]["v"]

            owner = self.param_to_sparse_module.get(p)
            active_chunks = getattr(owner, 'active_chunks', None) if owner else None

            if active_chunks is None:
                # Dense update: embeddings, layernorms, LM head, or baseline mode.
                exp_avg.mul_(0.9).add_(p.grad, alpha=0.1)
                exp_avg_sq.mul_(0.999).addcmul_(p.grad, p.grad, value=0.001)
                p.sub_(exp_avg / (torch.sqrt(exp_avg_sq) + 1e-8), alpha=self.lr)
                continue

            # Sparse update: touch only the rows belonging to active chunks.
            for local_c in active_chunks.tolist():
                lo = local_c * self.chunk_size
                hi = lo + self.chunk_size

                g = p.grad[lo:hi]
                m_c = exp_avg[lo:hi]
                v_c = exp_avg_sq[lo:hi]

                m_c.mul_(0.9).add_(g, alpha=0.1)
                v_c.mul_(0.999).addcmul_(g, g, value=0.001)
                p[lo:hi].sub_(m_c / (torch.sqrt(v_c) + 1e-8), alpha=self.lr)
342
+
343
+ # -----------------------------
344
+ # Training
345
+ # -----------------------------
346
def main():
    """Benchmark dense vs. sparse backward modes on the Shakespeare corpus.

    For each (policy, backward-mode) pair a fresh model is trained for
    ``--steps`` steps; timing restarts after warmup+anneal so the reported
    ms/step reflects steady-state sparse training. Prints a summary table of
    wall time, per-step time, and final validation loss.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=1000)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--n_layer", type=int, default=6)
    parser.add_argument("--n_head", type=int, default=8)
    parser.add_argument("--n_embd", type=int, default=512)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.10)
    parser.add_argument("--warmup_steps", type=int, default=50)
    parser.add_argument("--anneal_steps", type=int, default=200)
    parser.add_argument("--device", type=str, default="mps")
    parser.add_argument("--benchmark_sync", action="store_true")
    args = parser.parse_args()

    corpus = ShakespeareCorpus(args.block_size, args.device)

    modes = [
        ("dense_baseline", "dense_baseline"),
        ("predicted_magnitude", "sparse_dW_full_dX"),
        ("predicted_magnitude", "sparse_dW_sparse_dX"),
    ]

    print(f"\nModel: {args.n_layer} layers, {args.n_embd} d_model, {args.chunk_size} chunk_size")
    print(f"Batch: {args.batch_size}, Block: {args.block_size}. Target Active: {args.active_fraction*100}%")
    print(f"Annealing: {args.warmup_steps} warmup steps, {args.anneal_steps} anneal steps.\n")
    print(f"{'Run':>20s} | {'Time (s)':>10s} | {'Step (ms)':>10s} | {'Val Loss':>8s}")
    print("-" * 55)

    for policy, bwd_mode in modes:
        set_seed(42)
        model = GPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, 0.1).to(args.device)

        for m in get_sparse_linears(model):
            m.chunk_size = args.chunk_size

        masker = ChunkMasker(model, policy, args.active_fraction, args.chunk_size, args.device) if policy != "dense_baseline" else None
        opt = ChunkedAdam(model, lr=5e-4, chunk_size=args.chunk_size)

        if args.benchmark_sync: sync_device(args.device)

        t0 = time.perf_counter()
        measured_steps = args.steps

        for step in range(args.steps):
            # Restart the clock once annealing finishes so timing reflects
            # the steady-state sparse regime only.
            if step == args.warmup_steps + args.anneal_steps:
                if args.benchmark_sync: sync_device(args.device)
                t0 = time.perf_counter()
                measured_steps = args.steps - step

            x, y = corpus.get_batch("train", args.batch_size, generator=make_cpu_generator(step))

            if masker:
                masker.choose_active(step, warmup_steps=args.warmup_steps, anneal_steps=args.anneal_steps)
                for m in get_sparse_linears(model):
                    m.sparse_enabled = True
                    m.sparse_dx = (bwd_mode == "sparse_dW_sparse_dX")
            else:
                for m in get_sparse_linears(model):
                    m.sparse_enabled = False
                    m.active_chunks = None

            opt.zero_grad()
            _, loss = model(x, y)
            loss.backward()

            if masker:
                masker.update_predictor()

            opt.step()

            # Print progress every 200 steps (carriage return keeps one line).
            if step % 200 == 0:
                print(f" [Progress] {bwd_mode} step {step}/{args.steps} | Loss: {loss.item():.4f}", end="\r")

        if args.benchmark_sync: sync_device(args.device)
        t_elapsed = time.perf_counter() - t0

        # Eval loss.
        # Fix: this block was previously duplicated and nested inside itself
        # (model.eval()/no_grad appeared twice); one pass is sufficient.
        model.eval()
        with torch.no_grad():
            val_x, val_y = corpus.get_batch("val", args.batch_size, generator=make_cpu_generator(999))
            _, val_loss = model(val_x, val_y)

        # Clear the progress line
        print(" " * 60, end="\r")

        bwd_str = bwd_mode if bwd_mode == "dense_baseline" else ("sparse_full_dX" if "full_dX" in bwd_mode else "sparse_sparse_dX")
        print(f"{bwd_str:>20s} | {t_elapsed:10.2f} | {1000*t_elapsed/max(1, measured_steps):10.2f} | {val_loss.item():8.4f}")
439
+
440
# Script entry point: run the benchmark when executed directly.
if __name__ == "__main__":
    main()
experiments/sparse_transformer_v15_inactive_prediction.py ADDED
@@ -0,0 +1,729 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v15: Inactive-Update Prediction Diagnostics.
3
+
4
+ Tests two simple ideas:
5
+
6
+ 1. Correlated-neighbor prediction:
7
+ Use active chunks as sensors. For each inactive chunk, find historically
8
+ correlated active chunks and predict its update magnitude from them.
9
+
10
+ 2. Graph / boundary interpolation:
11
+ Treat chunks as nodes in a learned similarity graph. Active chunks are
12
+ boundary values. Inactive chunk magnitudes are filled in by diffusion.
13
+
14
+ This is intentionally a diagnostic script, not a speed benchmark.
15
+ It computes dense gradients every step so we can measure whether inactive
16
+ updates are predictable.
17
+
18
+ Run:
19
+ python3 sparse_transformer_v15_inactive_prediction.py --device mps --benchmark_sync
20
+
21
+ Good first runs:
22
+ python3 sparse_transformer_v15_inactive_prediction.py --device mps --steps 300 --n_embd 512
23
+ python3 sparse_transformer_v15_inactive_prediction.py --device mps --steps 300 --n_embd 1024
24
+ """
25
+
26
+ import argparse
27
+ import math
28
+ import random
29
+ import time
30
+ from typing import Dict, List, Literal, Optional, Tuple
31
+
32
+ import torch
33
+
34
+ torch.set_num_threads(1)
35
+ import torch.nn as nn
36
+ import torch.nn.functional as F
37
+
38
+
39
+ Policy = Literal["predicted_magnitude", "random"]
40
+
41
+
42
def sync_device(device: str) -> None:
    """Block until all queued kernels on *device* have finished.

    No-op for CPU or for accelerator names that are not actually available.
    Used so wall-clock timings are meaningful on asynchronous backends.
    """
    if device == "cuda" and torch.cuda.is_available():
        torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps") and torch.backends.mps.is_available():
        # hasattr alone is insufficient: torch.mps exists on builds without a
        # usable MPS device, where synchronize() would raise at runtime.
        torch.mps.synchronize()
47
+
48
+
49
def set_seed(seed: int) -> None:
    """Seed both Python's and torch's global RNGs for reproducible runs."""
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
52
+
53
+
54
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a freshly seeded CPU ``torch.Generator``."""
    generator = torch.Generator(device="cpu")
    # manual_seed returns the generator itself, so this is a single expression.
    return generator.manual_seed(seed)
58
+
59
+
60
+ # -----------------------------
61
+ # Data
62
+ # -----------------------------
63
+
64
+ def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
65
+ rng = random.Random(seed)
66
+ words = [
67
+ "ada", "turing", "grace", "lovelace", "gradients",
68
+ "tokens", "circuits", "features", "boldly", "strangely",
69
+ "matrix", "attention", "kernel", "entropy", "signal",
70
+ ]
71
+ return "\n".join(
72
+ " ".join(rng.choices(words, k=rng.randint(4, 10))) + "."
73
+ for _ in range(n_sentences)
74
+ )
75
+
76
+
77
class CharCorpus:
    """Character-level corpus with a 90/10 train/validation split."""

    def __init__(self, text: str, block_size: int, device: str):
        chars = sorted(set(text))
        # char -> id and id -> char lookup tables over the observed alphabet.
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(chars)
        self.block_size = block_size
        self.device = device

        data = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        self.train_data = data[: int(0.9 * len(data))]
        self.val_data = data[int(0.9 * len(data)) :]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample (input, next-char target) windows from *split*.

        Offsets are drawn on CPU with the supplied generator so batches are
        reproducible; results are moved to ``self.device`` at the end.
        """
        data = self.train_data if split == "train" else self.val_data
        ix = torch.randint(len(data) - self.block_size - 1, (batch_size,), generator=generator)
        x = torch.stack([data[i : i + self.block_size] for i in ix])
        # Targets are the same windows shifted right by one character.
        y = torch.stack([data[i + 1 : i + self.block_size + 1] for i in ix])
        return x.to(self.device), y.to(self.device)
101
+
102
+
103
+ # -----------------------------
104
+ # Model
105
+ # -----------------------------
106
+
107
class SparseLinear(nn.Linear):
    """Name retained for compatibility with earlier experiments.

    In this diagnostic script, backward is dense. We only use chunk masks
    analytically after gradients are computed. The subclass adds no behavior
    over ``nn.Linear``; it exists so chunk bookkeeping can find these layers
    by type.
    """
113
+
114
+
115
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with an explicit (non-fused) softmax."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        # Fused q,k,v projection followed by the output projection.
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular mask, shaped to broadcast over (batch, head, T, T).
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        qkv = self.c_attn(x)
        q, k, v = qkv.split(C, dim=2)

        # Reshape to (B, n_head, T, head_dim) for per-head attention.
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        # Scaled dot-product scores with causal masking before softmax.
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)

        y = att @ v
        # Merge heads back into the embedding dimension.
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)
146
+
147
+
148
class FeedForward(nn.Module):
    """GPT-style MLP: expand 4x, GELU, project back, then dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
157
+
158
+
159
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
171
+
172
+
173
class MiniGPT(nn.Module):
    """Small GPT: token + learned positional embeddings, N blocks, LM head."""

    def __init__(
        self,
        vocab_size: int,
        block_size: int,
        n_layer: int,
        n_head: int,
        n_embd: int,
        dropout: float,
    ):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embd)
        # Final LM head is a plain nn.Linear, so it is excluded from the
        # SparseLinear chunk bookkeeping.
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when *targets* is omitted."""
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        x = self.tok_emb(idx) + self.pos_emb(pos)[None, :, :]
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
205
+
206
+
207
def get_sparse_linears(model: nn.Module) -> List[SparseLinear]:
    """Collect every SparseLinear submodule of *model*, in traversal order."""
    found: List[SparseLinear] = []
    for module in model.modules():
        if isinstance(module, SparseLinear):
            found.append(module)
    return found
209
+
210
+
211
+ # -----------------------------
212
+ # Chunk geometry and diagnostics
213
+ # -----------------------------
214
+
215
class ChunkMap:
    """Global indexing and statistics over output chunks of all SparseLinears.

    Assigns every ``chunk_size`` consecutive output rows of every sparse
    linear a global chunk id, and maintains per-chunk predictors: a
    gradient-mass EMA, a unit-direction EMA, and a bounded mass history used
    to build similarity graphs.
    """

    def __init__(self, model: nn.Module, chunk_size: int, device: str):
        self.model = model
        self.chunk_size = chunk_size
        self.device = device
        self.linears = get_sparse_linears(model)

        # module -> its global chunk ids; global id -> (module, local index).
        self.module_to_chunk_ids: Dict[nn.Module, torch.Tensor] = {}
        self.chunk_to_module_local: List[Tuple[nn.Module, int]] = []

        offset = 0
        for m in self.linears:
            assert m.out_features % chunk_size == 0, (
                f"out_features {m.out_features} not divisible by chunk_size {chunk_size}"
            )
            n_chunks = m.out_features // chunk_size
            ids = torch.arange(offset, offset + n_chunks, device=device)
            self.module_to_chunk_ids[m] = ids
            for local_c in range(n_chunks):
                self.chunk_to_module_local.append((m, local_c))
            offset += n_chunks

        self.n_chunks = offset
        # EMA of observed gradient mass per chunk (0 means "never observed").
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        # Per-chunk unit-direction EMA; None until first observation.
        self.direction_ema: List[Optional[torch.Tensor]] = [None for _ in range(self.n_chunks)]

        # Histories for correlation and graph similarities.
        self.mass_history: List[torch.Tensor] = []

    def choose_active(
        self,
        step: int,
        warmup_steps: int,
        active_fraction: float,
        policy: Policy,
    ) -> torch.Tensor:
        """Return a boolean mask of active chunks for this step.

        During warmup all chunks are active. Afterwards, either a random
        subset or the top-k by predicted mass (with a tiny random tiebreaker)
        is selected.
        """
        if step < warmup_steps:
            return torch.ones(self.n_chunks, dtype=torch.bool, device=self.device)

        k = max(1, int(active_fraction * self.n_chunks))
        mask = torch.zeros(self.n_chunks, dtype=torch.bool, device=self.device)

        if policy == "random":
            idx = torch.randperm(self.n_chunks, device=self.device)[:k]
        else:
            # Tiny noise breaks ties among equal (e.g. zero) predicted masses.
            scores = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            idx = torch.topk(scores, k=k).indices

        mask[idx] = True
        return mask

    @torch.no_grad()
    def chunk_gradient_vectors(self) -> List[torch.Tensor]:
        """Flatten each chunk's weight grad (plus bias grad) into one vector.

        Missing gradients are replaced with zeros so every chunk yields a
        vector of consistent size.
        """
        vecs: List[torch.Tensor] = []
        for m, local_c in self.chunk_to_module_local:
            start = local_c * self.chunk_size
            end = (local_c + 1) * self.chunk_size

            parts = []
            if m.weight.grad is None:
                parts.append(torch.zeros_like(m.weight[start:end]).flatten())
            else:
                parts.append(m.weight.grad[start:end].detach().flatten())

            if m.bias is not None:
                if m.bias.grad is None:
                    parts.append(torch.zeros_like(m.bias[start:end]).flatten())
                else:
                    parts.append(m.bias.grad[start:end].detach().flatten())

            vecs.append(torch.cat(parts))
        return vecs

    @torch.no_grad()
    def chunk_masses_from_vecs(self, vecs: List[torch.Tensor]) -> torch.Tensor:
        """Return the L2 norm of every chunk vector as one tensor on device."""
        return torch.stack([v.norm() for v in vecs]).to(self.device)

    @torch.no_grad()
    def update_predictor(
        self,
        active_mask: torch.Tensor,
        vecs: List[torch.Tensor],
        mass_beta: float = 0.95,
        dir_beta: float = 0.95,
        store_history: bool = True,
    ) -> torch.Tensor:
        """Update mass/direction EMAs from observed (active) chunks only.

        Returns the full per-chunk mass tensor computed from *vecs*.
        """
        masses = self.chunk_masses_from_vecs(vecs)

        observed = active_mask
        # First observation should initialize directly, not get shrunk by beta.
        never_seen = observed & (self.predicted_mass == 0)
        already_seen = observed & ~never_seen
        self.predicted_mass[never_seen] = masses[never_seen]
        self.predicted_mass[already_seen] = (
            mass_beta * self.predicted_mass[already_seen]
            + (1.0 - mass_beta) * masses[already_seen]
        )

        for i, is_active in enumerate(observed.tolist()):
            if not is_active:
                continue
            v = vecs[i]
            n = v.norm()
            if n <= 1e-12:
                # Skip degenerate gradients; direction would be meaningless.
                continue
            unit = v / n
            if self.direction_ema[i] is None:
                self.direction_ema[i] = unit.detach().clone()
            else:
                self.direction_ema[i] = (
                    dir_beta * self.direction_ema[i] + (1.0 - dir_beta) * unit
                )
            # Renormalize so the EMA stays a unit direction.
            self.direction_ema[i] = self.direction_ema[i] / (self.direction_ema[i].norm() + 1e-12)

        if store_history:
            self.mass_history.append(masses.detach().clone())
            # Keep a bounded window so similarity estimation stays cheap.
            max_hist = 128
            if len(self.mass_history) > max_hist:
                self.mass_history = self.mass_history[-max_hist:]

        return masses

    def layer_aware_masks(self) -> List[torch.Tensor]:
        """Return one boolean mask per module marking that module's chunks."""
        masks = []
        for m, ids in self.module_to_chunk_ids.items():
            mask = torch.zeros(self.n_chunks, dtype=torch.bool, device=self.device)
            mask[ids] = True
            masks.append(mask)
        return masks
344
+
345
+
346
def dense_cosine_from_vecs(a: List[torch.Tensor], b: List[torch.Tensor]) -> float:
    """Cosine similarity between two chunk-vector lists, flattened end to end."""
    flat_a = torch.cat([chunk.flatten() for chunk in a])
    flat_b = torch.cat([chunk.flatten() for chunk in b])
    cos = F.cosine_similarity(flat_a, flat_b, dim=0)
    return float(cos.item())
350
+
351
+
352
def mse_reduction_vs_zero(true_vecs: List[torch.Tensor], pred_vecs: List[torch.Tensor], mask: torch.Tensor) -> float:
    """Fractional MSE reduction of the prediction vs. the all-zero baseline.

    Considers only chunks selected by *mask*. Returns NaN when nothing is
    selected; 1.0 means a perfect prediction and 0.0 means no better than
    predicting zeros.
    """
    selected = torch.nonzero(mask, as_tuple=False).flatten().tolist()
    if not selected:
        return float("nan")
    truth = torch.cat([true_vecs[i].flatten() for i in selected])
    guess = torch.cat([pred_vecs[i].flatten() for i in selected])
    baseline_mse = torch.mean(truth.square())
    residual_mse = torch.mean((truth - guess).square())
    return float((1.0 - residual_mse / (baseline_mse + 1e-12)).item())
361
+
362
+
363
def active_only_prediction(true_vecs: List[torch.Tensor], active_mask: torch.Tensor) -> List[torch.Tensor]:
    """Keep true gradients for active chunks; predict zeros for inactive ones."""
    return [
        v.clone() if bool(active_mask[i]) else torch.zeros_like(v)
        for i, v in enumerate(true_vecs)
    ]
368
+
369
+
370
def ema_direction_prediction(
    cmap: ChunkMap,
    true_vecs: List[torch.Tensor],
    active_mask: torch.Tensor,
    inactive_magnitudes: torch.Tensor,
) -> List[torch.Tensor]:
    """Compose chunk predictions from observed grads and EMA directions.

    Active chunks pass through their true gradient vector. Inactive chunks
    are reconstructed as (EMA unit direction) x (predicted magnitude), or
    zeros when no direction history exists yet.
    """
    predictions: List[torch.Tensor] = []
    for i, v in enumerate(true_vecs):
        if bool(active_mask[i]):
            predictions.append(v.clone())
            continue
        direction = cmap.direction_ema[i]
        if direction is None:
            # No history for this chunk: fall back to the zero prediction.
            predictions.append(torch.zeros_like(v))
        else:
            predictions.append(direction.to(v.device, v.dtype) * inactive_magnitudes[i])
    return predictions
387
+
388
+
389
def build_mass_similarity(cmap: ChunkMap, min_history: int = 8) -> Optional[torch.Tensor]:
    """Chunk-by-chunk similarity from the correlation of mass histories.

    Returns None until *min_history* observations exist. Output is an
    [n_chunks, n_chunks] matrix of non-negative correlations with zero
    diagonal, restricted to pairs of chunks within the same module.
    """
    if len(cmap.mass_history) < min_history:
        return None

    H = torch.stack(cmap.mass_history, dim=0)  # [history, chunks]
    # Standardize each chunk's history before forming the correlation matrix.
    H = H - H.mean(dim=0, keepdim=True)
    H = H / (H.std(dim=0, keepdim=True) + 1e-6)

    S = (H.T @ H) / max(1, H.shape[0] - 1)
    # Negative correlations are discarded: only positive coupling is used.
    S = torch.clamp(S, min=0.0)

    # Remove self similarity.
    S.fill_diagonal_(0.0)

    # Layer-aware block diagonal: avoid mixing unrelated layers by default.
    layer_masks = cmap.layer_aware_masks()
    layer_allowed = torch.zeros_like(S, dtype=torch.bool)
    for mask in layer_masks:
        layer_allowed |= mask[:, None] & mask[None, :]
    S = torch.where(layer_allowed, S, torch.zeros_like(S))

    return S
411
+
412
+
413
def knn_magnitude_prediction(
    cmap: ChunkMap,
    active_mask: torch.Tensor,
    true_masses: torch.Tensor,
    k_neighbors: int = 3,
) -> torch.Tensor:
    """Predict inactive magnitudes as weighted average of correlated active magnitudes."""
    S = build_mass_similarity(cmap)
    if S is None:
        # Not enough history yet: fall back to the per-chunk EMA, with
        # observed active chunks overridden by their true masses.
        pred = cmap.predicted_mass.clone()
        pred[active_mask] = true_masses[active_mask]
        return pred

    pred = torch.zeros_like(true_masses)
    pred[active_mask] = true_masses[active_mask]

    active_idx = torch.nonzero(active_mask, as_tuple=False).flatten()
    inactive_idx = torch.nonzero(~active_mask, as_tuple=False).flatten()

    if active_idx.numel() == 0:
        return pred

    for i in inactive_idx.tolist():
        weights = S[i, active_idx]
        if weights.sum() <= 1e-12:
            # No correlated active sensor: keep the EMA estimate.
            pred[i] = cmap.predicted_mass[i]
            continue

        # Weighted mean of the k most-correlated active chunks' true masses.
        kk = min(k_neighbors, weights.numel())
        top = torch.topk(weights, k=kk)
        w = top.values
        aidx = active_idx[top.indices]
        pred[i] = (w * true_masses[aidx]).sum() / (w.sum() + 1e-12)

    return pred
448
+
449
+
450
def graph_diffusion_magnitude_prediction(
    cmap: ChunkMap,
    active_mask: torch.Tensor,
    true_masses: torch.Tensor,
    diffusion_steps: int = 8,
    alpha: float = 0.7,
) -> torch.Tensor:
    """Boundary-value style magnitude interpolation over a learned similarity graph.

    Active nodes are clamped to observed true magnitudes. Inactive nodes diffuse
    toward graph-neighbor values. *alpha* mixes the neighbor proposal against
    the current estimate each iteration.
    """
    S = build_mass_similarity(cmap)
    if S is None:
        # No graph yet: EMA estimate with observed chunks overridden.
        pred = cmap.predicted_mass.clone()
        pred[active_mask] = true_masses[active_mask]
        return pred

    # Row-normalize similarities into diffusion weights.
    W = S / (S.sum(dim=1, keepdim=True) + 1e-12)

    pred = cmap.predicted_mass.clone()
    pred[active_mask] = true_masses[active_mask]

    for _ in range(diffusion_steps):
        proposal = W @ pred
        pred = alpha * proposal + (1.0 - alpha) * pred
        # Re-clamp boundary (active) nodes after every diffusion step.
        pred[active_mask] = true_masses[active_mask]

    # Magnitudes are norms, so negative values are not meaningful.
    return torch.clamp(pred, min=0.0)
479
+
480
+
481
+ # -----------------------------
482
+ # Optimizer
483
+ # -----------------------------
484
+
485
class SimpleAdam:
    """Minimal Adam-like optimizer for diagnostics (no bias correction).

    Intentionally simple and identical across runs; it is not trying to be
    production AdamW.
    """

    def __init__(self, model: nn.Module, lr: float = 3e-4):
        self.model = model
        self.lr = lr
        self.state: Dict[torch.nn.Parameter, Dict[str, torch.Tensor]] = {}

    def zero_grad(self):
        """Drop every parameter's gradient."""
        for param in self.model.parameters():
            param.grad = None

    @torch.no_grad()
    def step(self):
        """Apply one update to each parameter that currently has a gradient."""
        for param in self.model.parameters():
            grad = param.grad
            if grad is None:
                continue
            if param not in self.state:
                # Lazily allocate first/second-moment buffers per parameter.
                self.state[param] = {
                    "m": torch.zeros_like(param),
                    "v": torch.zeros_like(param),
                }
            slots = self.state[param]
            slots["m"].mul_(0.9).add_(grad, alpha=0.1)
            slots["v"].mul_(0.999).addcmul_(grad, grad, value=0.001)
            param.sub_(slots["m"] / (torch.sqrt(slots["v"]) + 1e-8), alpha=self.lr)
516
+
517
+
518
+ # -----------------------------
519
+ # Apply chunk-gradient predictions
520
+ # -----------------------------
521
+
522
@torch.no_grad()
def install_chunk_prediction_as_grads(
    cmap: ChunkMap,
    pred_vecs: List[torch.Tensor],
):
    """Overwrite SparseLinear weight/bias grads from predicted chunk vectors.

    Non-SparseLinear parameters keep their dense gradients. Each predicted
    vector is laid out as [chunk_size * fan_in weight values, then chunk_size
    bias values] — the same order produced by chunk_gradient_vectors.
    """
    for m, ids in cmap.module_to_chunk_ids.items():
        if m.weight.grad is None:
            continue
        # Zero existing grads first; chunks are then filled in one by one.
        m.weight.grad.zero_()
        if m.bias is not None and m.bias.grad is not None:
            m.bias.grad.zero_()

        for local_c, global_id in enumerate(ids.tolist()):
            start = local_c * cmap.chunk_size
            end = (local_c + 1) * cmap.chunk_size

            v = pred_vecs[global_id]
            # Leading portion of the flat vector is the weight-grad block.
            w_numel = cmap.chunk_size * m.weight.shape[1]
            w_flat = v[:w_numel]
            m.weight.grad[start:end] = w_flat.view(cmap.chunk_size, m.weight.shape[1])

            if m.bias is not None and m.bias.grad is not None:
                # Remainder (if any) is the bias-grad block.
                b_flat = v[w_numel:]
                if b_flat.numel() > 0:
                    m.bias.grad[start:end] = b_flat.view(cmap.chunk_size)
551
+
552
+
553
+ # -----------------------------
554
+ # Training / diagnostics
555
+ # -----------------------------
556
+
557
def evaluate(model: nn.Module, corpus: CharCorpus, batch_size: int, seed: int) -> float:
    """Compute one deterministic validation-batch loss.

    Switches the model to eval mode for the forward pass and back to train
    mode afterwards; the fixed seed makes the batch reproducible.
    """
    model.eval()
    with torch.no_grad():
        batch_x, batch_y = corpus.get_batch("val", batch_size, generator=make_cpu_generator(seed))
        _, loss = model(batch_x, batch_y)
    model.train()
    return float(loss.item())
564
+
565
+
566
def run_experiment(
    mode: str,
    device: str,
    steps: int,
    batch_size: int,
    block_size: int,
    n_layer: int,
    n_head: int,
    n_embd: int,
    chunk_size: int,
    active_fraction: float,
    warmup_steps: int,
    policy: Policy,
    benchmark_sync: bool,
) -> Dict[str, float]:
    """Train one MiniGPT under *mode* and return diagnostic summary metrics.

    Dense gradients are computed every step (this is a diagnostic, not a
    speed benchmark); per-mode chunk predictions are then installed as the
    gradients actually used by the optimizer. Returns a dict with final
    validation loss ("val"), ms per step ("ms"), average cosine between true
    and installed gradients ("cos"), and average inactive-chunk MSE
    reduction vs. zero ("mse_red").
    """
    set_seed(42)
    corpus = CharCorpus(make_synthetic_corpus(), block_size, device)
    model = MiniGPT(corpus.vocab_size, block_size, n_layer, n_head, n_embd, 0.0).to(device)
    opt = SimpleAdam(model, lr=3e-4)
    cmap = ChunkMap(model, chunk_size=chunk_size, device=device)

    metric_rows = []

    if benchmark_sync:
        sync_device(device)
    t0 = time.perf_counter()

    for step in range(steps):
        x, y = corpus.get_batch("train", batch_size, generator=make_cpu_generator(step))

        opt.zero_grad()
        _, loss = model(x, y)
        loss.backward()

        # Dense ground truth for this step's chunk gradients.
        true_vecs = cmap.chunk_gradient_vectors()
        true_masses = cmap.chunk_masses_from_vecs(true_vecs)

        active_mask = cmap.choose_active(
            step=step,
            warmup_steps=warmup_steps,
            active_fraction=active_fraction,
            policy=policy,
        )

        if step < warmup_steps or mode == "dense":
            # Dense mode / warmup: install the true gradients unchanged.
            pred_vecs = [v.clone() for v in true_vecs]
        else:
            active_only_vecs = active_only_prediction(true_vecs, active_mask)

            if mode == "active_only":
                pred_vecs = active_only_vecs

            elif mode == "knn_magnitude":
                pred_masses = knn_magnitude_prediction(cmap, active_mask, true_masses)
                pred_vecs = ema_direction_prediction(cmap, true_vecs, active_mask, pred_masses)

            elif mode == "graph_diffusion":
                pred_masses = graph_diffusion_magnitude_prediction(cmap, active_mask, true_masses)
                pred_vecs = ema_direction_prediction(cmap, true_vecs, active_mask, pred_masses)

            elif mode == "ema_inactive":
                pred_masses = cmap.predicted_mass.clone()
                pred_masses[active_mask] = true_masses[active_mask]
                pred_vecs = ema_direction_prediction(cmap, true_vecs, active_mask, pred_masses)

            else:
                raise ValueError(f"Unknown mode: {mode}")

        install_chunk_prediction_as_grads(cmap, pred_vecs)

        if step % 25 == 0:
            # Periodic diagnostics: prediction quality and current val loss.
            inactive_mask = ~active_mask
            row = {
                "cosine_full": dense_cosine_from_vecs(true_vecs, pred_vecs),
                "inactive_mse_reduction": mse_reduction_vs_zero(true_vecs, pred_vecs, inactive_mask),
                "active_frac": float(active_mask.float().mean().item()),
                "val": evaluate(model, corpus, batch_size, seed=999 + step),
            }
            metric_rows.append(row)

        # Update predictor after measuring and installing predicted grads.
        # Use true active chunk observations only, mimicking sparse observation.
        cmap.update_predictor(active_mask, true_vecs, store_history=True)

        opt.step()

    if benchmark_sync:
        sync_device(device)
    elapsed = time.perf_counter() - t0

    val_loss = evaluate(model, corpus, batch_size, seed=12345)

    if metric_rows:
        avg_cos = sum(r["cosine_full"] for r in metric_rows) / len(metric_rows)
        avg_mse_red = sum(r["inactive_mse_reduction"] for r in metric_rows) / len(metric_rows)
    else:
        avg_cos = float("nan")
        avg_mse_red = float("nan")

    return {
        "val": val_loss,
        "ms": 1000.0 * elapsed / steps,
        "cos": avg_cos,
        "mse_red": avg_mse_red,
    }
671
+
672
+
673
def main():
    """Run the inactive-update prediction diagnostic across all modes.

    Parses CLI options, runs ``run_experiment`` once per prediction mode
    (dense baseline plus four sparse-prediction variants) and prints a
    comparison table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=300)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--block_size", type=int, default=128)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=8)
    parser.add_argument("--n_embd", type=int, default=512)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.10)
    parser.add_argument("--warmup_steps", type=int, default=25)
    parser.add_argument("--policy", type=str, default="predicted_magnitude", choices=["predicted_magnitude", "random"])
    parser.add_argument("--device", type=str, default="mps")
    parser.add_argument("--benchmark_sync", action="store_true")
    args = parser.parse_args()

    modes = [
        "dense",
        "active_only",
        "ema_inactive",
        "knn_magnitude",
        "graph_diffusion",
    ]

    print(f"\nInactive-update prediction diagnostic")
    print(f"device={args.device} steps={args.steps} d={args.n_embd} chunks={args.chunk_size}")
    print(f"active_fraction={args.active_fraction} warmup={args.warmup_steps} policy={args.policy}\n")
    print(f"{'mode':>18s} | {'val':>8s} | {'ms/step':>8s} | {'grad_cos':>8s} | {'inactive_mse+':>13s}")
    print("-" * 70)

    for mode in modes:
        result = run_experiment(
            mode=mode,
            device=args.device,
            steps=args.steps,
            batch_size=args.batch_size,
            block_size=args.block_size,
            n_layer=args.n_layer,
            n_head=args.n_head,
            n_embd=args.n_embd,
            chunk_size=args.chunk_size,
            active_fraction=args.active_fraction,
            warmup_steps=args.warmup_steps,
            policy=args.policy,
            benchmark_sync=args.benchmark_sync,
        )
        print(
            f"{mode:>18s} | "
            f"{result['val']:8.4f} | "
            f"{result['ms']:8.2f} | "
            f"{result['cos']:8.3f} | "
            f"{result['mse_red']:13.3f}"
        )
726
+
727
+
728
# Script entry point: run the diagnostic when executed directly.
if __name__ == "__main__":
    main()
experiments/sparse_transformer_v16_sensor_scheduler.py ADDED
@@ -0,0 +1,677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v16: Sensor-Based Mask Scheduling.
3
+
4
+ v15 showed that directly hallucinating inactive gradient vectors was harmful.
5
+ v16 tests the safer next idea:
6
+
7
+ Use active chunks as sensors to choose which chunks receive real gradients next.
8
+
9
+ No inactive gradient is invented. In sparse modes, inactive chunks get zero gradient.
10
+ The only question is whether active chunk observations improve future mask selection.
11
+
12
+ Schedulers:
13
+ dense
14
+ Dense baseline.
15
+
16
+ ema_topk
17
+ Select top chunks by each chunk's own EMA gradient mass.
18
+
19
+ knn_scheduler
20
+ Use active chunks as sensors. Predict next-step inactive chunk mass from
21
+ historically correlated active chunks. Select next mask from that score.
22
+
23
+ graph_scheduler
24
+ Boundary-value style magnitude diffusion over a chunk similarity graph.
25
+ Active chunks are clamped to observed magnitudes. Inactive magnitudes are
26
+ interpolated and used to choose the next mask.
27
+
28
+ random
29
+ Random sparse-support control.
30
+
31
+ This is still a diagnostic/simulation script: it computes dense gradients so we can
32
+ measure oracle Jaccard/cosine, then installs only the selected active chunk gradients
33
+ for sparse training.
34
+
35
+ Run:
36
+ python3 sparse_transformer_v16_sensor_scheduler.py --device mps --benchmark_sync
37
+
38
+ Useful:
39
+ python3 sparse_transformer_v16_sensor_scheduler.py --device mps --steps 500 --n_embd 512
40
+ python3 sparse_transformer_v16_sensor_scheduler.py --device mps --steps 500 --n_embd 1024
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import argparse
46
+ import math
47
+ import random
48
+ import time
49
+ from typing import Dict, List, Literal, Optional, Tuple
50
+
51
+ import torch
52
+
53
+ torch.set_num_threads(1)
54
+ import torch.nn as nn
55
+ import torch.nn.functional as F
56
+
57
+ Scheduler = Literal["dense", "ema_topk", "knn_scheduler", "graph_scheduler", "random"]
58
+
59
+
60
def sync_device(device: str) -> None:
    """Block until all queued kernels on `device` finish (no-op for CPU/unknown)."""
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
65
+
66
+
67
def set_seed(seed: int) -> None:
    """Seed both Python's and torch's global RNGs for reproducibility."""
    random.seed(seed)
    torch.manual_seed(seed)
70
+
71
+
72
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a deterministically-seeded CPU torch.Generator."""
    generator = torch.Generator(device="cpu")
    generator.manual_seed(seed)
    return generator
76
+
77
+
78
+ # -----------------------------
79
+ # Data
80
+ # -----------------------------
81
+
82
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Build a deterministic toy corpus: one random word-salad sentence per line."""
    rng = random.Random(seed)
    words = [
        "ada", "turing", "grace", "lovelace", "gradients",
        "tokens", "circuits", "features", "boldly", "strangely",
        "matrix", "attention", "kernel", "entropy", "signal",
    ]
    sentences = []
    for _ in range(n_sentences):
        # randint is evaluated before choices, matching the generator call order.
        sentences.append(" ".join(rng.choices(words, k=rng.randint(4, 10))) + ".")
    return "\n".join(sentences)
93
+
94
+
95
class CharCorpus:
    """Character-level corpus with a 90/10 train/val split and random block sampling."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device
        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        split_at = int(0.9 * len(encoded))
        self.train_data = encoded[:split_at]
        self.val_data = encoded[split_at:]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample `batch_size` random (input, next-char target) blocks from a split."""
        source = self.train_data if split == "train" else self.val_data
        starts = torch.randint(len(source) - self.block_size - 1, (batch_size,), generator=generator)
        inputs = torch.stack([s.item() * 0 + source[s : s + self.block_size] for s in starts]) if False else torch.stack([source[s : s + self.block_size] for s in starts])
        targets = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return inputs.to(self.device), targets.to(self.device)
118
+
119
+
120
+ # -----------------------------
121
+ # Model
122
+ # -----------------------------
123
+
124
class SparseLinear(nn.Linear):
    """Marker subclass of nn.Linear.

    Behaviorally identical to nn.Linear; exists so the scheduler can locate the
    layers that participate in chunked sparse gradient updates via isinstance
    (see get_sparse_linears).
    """
    pass
126
+
127
+
128
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with an explicit lower-triangular mask."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        tril = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", tril.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        # Fused q/k/v projection, then split into heads: [B, n_head, T, head_dim].
        q, k, v = self.c_attn(x).split(C, dim=2)

        def heads(t: torch.Tensor) -> torch.Tensor:
            return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = heads(q), heads(k), heads(v)

        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        # Forbid attention to future positions.
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))

        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
159
+
160
+
161
class FeedForward(nn.Module):
    """Position-wise MLP: expand 4x, GELU, project back, dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
170
+
171
+
172
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual add."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
184
+
185
+
186
class MiniGPT(nn.Module):
    """Minimal GPT: token + position embeddings, N transformer blocks, LM head."""

    def __init__(
        self,
        vocab_size: int,
        block_size: int,
        n_layer: int,
        n_head: int,
        n_embd: int,
        dropout: float,
    ):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        layers = [Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when no targets are given."""
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)[None, :, :]
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)

        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
218
+
219
+
220
def get_sparse_linears(model: nn.Module) -> List[SparseLinear]:
    """Collect every SparseLinear in the model, in module-traversal order."""
    return [module for module in model.modules() if isinstance(module, SparseLinear)]
222
+
223
+
224
+ # -----------------------------
225
+ # Chunk map and scheduler
226
+ # -----------------------------
227
+
228
class ChunkScheduler:
    """Decides which output-row chunks of each SparseLinear get real gradients.

    A "chunk" is a contiguous group of `chunk_size` output rows; chunks are
    numbered globally across every SparseLinear in the model.  The scheduler
    tracks per-chunk gradient mass (L2 norm) and, depending on `scheduler`,
    selects the next active mask from EMA mass, sensor-based KNN prediction,
    graph diffusion, or uniformly at random.  It never invents gradients for
    inactive chunks; it only decides the mask.
    """

    def __init__(
        self,
        model: nn.Module,
        chunk_size: int,
        active_fraction: float,
        device: str,
        scheduler: Scheduler,
        mass_beta: float = 0.95,
    ):
        self.model = model
        self.chunk_size = chunk_size
        self.active_fraction = active_fraction
        self.device = device
        self.scheduler = scheduler
        # EMA decay for the per-chunk gradient-mass estimate.
        self.mass_beta = mass_beta

        self.linears = get_sparse_linears(model)
        self.module_to_chunk_ids: Dict[nn.Module, torch.Tensor] = {}
        self.chunk_to_module_local: List[Tuple[nn.Module, int]] = []

        # Assign a global chunk id to every (module, local chunk index) pair.
        offset = 0
        for m in self.linears:
            assert m.out_features % chunk_size == 0, (
                f"out_features {m.out_features} not divisible by chunk_size {chunk_size}"
            )
            n_chunks = m.out_features // chunk_size
            ids = torch.arange(offset, offset + n_chunks, device=device)
            self.module_to_chunk_ids[m] = ids
            for local_c in range(n_chunks):
                self.chunk_to_module_local.append((m, local_c))
            offset += n_chunks

        self.n_chunks = offset
        # EMA of observed chunk gradient mass; 0 means "never observed yet".
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        # Dense mass snapshots recorded during warmup to fit the similarity graph.
        self.mass_history: List[torch.Tensor] = []

        self.current_mask = torch.ones(self.n_chunks, dtype=torch.bool, device=device)
        # Sensor-derived score used when picking the *next* mask.
        self.next_scores = torch.zeros(self.n_chunks, device=device)

        self.prev_mask: Optional[torch.Tensor] = None
        self.similarity: Optional[torch.Tensor] = None

    def k_active(self) -> int:
        """Number of chunks kept active per step (at least one)."""
        return max(1, int(self.active_fraction * self.n_chunks))

    def choose_mask(self, step: int, warmup_steps: int) -> torch.Tensor:
        """Select and store the active chunk mask for this step.

        Dense mode and warmup steps activate everything; otherwise pick top-k by
        the scheduler-specific score (tiny random jitter breaks ties).
        """
        if self.scheduler == "dense" or step < warmup_steps:
            self.current_mask = torch.ones(self.n_chunks, dtype=torch.bool, device=self.device)
            return self.current_mask

        k = self.k_active()
        mask = torch.zeros(self.n_chunks, dtype=torch.bool, device=self.device)

        if self.scheduler == "random":
            idx = torch.randperm(self.n_chunks, device=self.device)[:k]

        elif self.scheduler == "ema_topk":
            scores = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            idx = torch.topk(scores, k=k).indices

        elif self.scheduler in ("knn_scheduler", "graph_scheduler"):
            # next_scores are computed from the previous step's active sensors.
            # If unavailable (all zero), fall back to the EMA estimate.
            base = self.next_scores
            if torch.count_nonzero(base).item() == 0:
                base = self.predicted_mass
            scores = base + 1e-9 * torch.rand_like(base)
            idx = torch.topk(scores, k=k).indices

        else:
            raise ValueError(f"Unknown scheduler: {self.scheduler}")

        mask[idx] = True
        self.current_mask = mask
        return mask

    @torch.no_grad()
    def chunk_gradient_vectors(self) -> List[torch.Tensor]:
        """Flattened per-chunk gradient (weight rows + matching bias slice).

        Missing gradients are represented by zeros of the same shape so every
        chunk vector has a consistent length.
        """
        vecs: List[torch.Tensor] = []
        for m, local_c in self.chunk_to_module_local:
            start = local_c * self.chunk_size
            end = (local_c + 1) * self.chunk_size

            parts = []
            if m.weight.grad is None:
                parts.append(torch.zeros_like(m.weight[start:end]).flatten())
            else:
                parts.append(m.weight.grad[start:end].detach().flatten())

            if m.bias is not None:
                if m.bias.grad is None:
                    parts.append(torch.zeros_like(m.bias[start:end]).flatten())
                else:
                    parts.append(m.bias.grad[start:end].detach().flatten())

            vecs.append(torch.cat(parts))
        return vecs

    @torch.no_grad()
    def chunk_masses_from_vecs(self, vecs: List[torch.Tensor]) -> torch.Tensor:
        """L2 norm of each chunk gradient vector, on `self.device`."""
        return torch.stack([v.norm() for v in vecs]).to(self.device)

    @torch.no_grad()
    def update_from_observed(
        self,
        active_mask: torch.Tensor,
        true_masses: torch.Tensor,
        step: int,
        warmup_steps: int,
    ) -> None:
        """Fold observed chunk masses into the EMA and refresh sensor scores.

        Only chunks flagged in `active_mask` count as observed.  A chunk's first
        observation is installed directly; later ones blend via `mass_beta`.
        """
        observed = active_mask

        never_seen = observed & (self.predicted_mass == 0)
        already_seen = observed & ~never_seen

        self.predicted_mass[never_seen] = true_masses[never_seen]
        self.predicted_mass[already_seen] = (
            self.mass_beta * self.predicted_mass[already_seen]
            + (1.0 - self.mass_beta) * true_masses[already_seen]
        )

        # During warmup we store dense mass histories to learn the similarity graph.
        if step < warmup_steps:
            self.mass_history.append(true_masses.detach().clone())
            max_hist = 128
            if len(self.mass_history) > max_hist:
                self.mass_history = self.mass_history[-max_hist:]

            if len(self.mass_history) >= 8:
                self.similarity = self.build_similarity()

        # Compute next_scores from the current active observations.
        if self.scheduler == "knn_scheduler":
            self.next_scores = self.knn_scores(active_mask, true_masses)
        elif self.scheduler == "graph_scheduler":
            self.next_scores = self.diffusion_scores(active_mask, true_masses)
        else:
            self.next_scores = self.predicted_mass.clone()

    def layer_allowed_mask(self) -> torch.Tensor:
        """Boolean [n_chunks, n_chunks] mask, True only for same-layer chunk pairs.

        NOTE: the previous version first built a shape-mismatched placeholder via
        `allowed |= ids[:, None].eq(ids[None, :])` (which broadcasts against the
        full matrix and raises whenever a layer owns fewer chunks than the total)
        before zeroing and rebuilding it.  The dead placeholder is removed.
        """
        allowed = torch.zeros((self.n_chunks, self.n_chunks), dtype=torch.bool, device=self.device)
        for ids in self.module_to_chunk_ids.values():
            allowed[ids[:, None], ids[None, :]] = True
        return allowed

    def build_similarity(self) -> torch.Tensor:
        """Nonnegative within-layer correlation of chunk masses over warmup history."""
        H = torch.stack(self.mass_history, dim=0)  # [history, chunks]
        H = H - H.mean(dim=0, keepdim=True)
        H = H / (H.std(dim=0, keepdim=True) + 1e-6)

        S = (H.T @ H) / max(1, H.shape[0] - 1)
        S = torch.clamp(S, min=0.0)
        S.fill_diagonal_(0.0)

        # Keep only within-layer similarities. Cross-layer correlation is too easy
        # to overfit in this tiny diagnostic.
        S = torch.where(self.layer_allowed_mask(), S, torch.zeros_like(S))
        return S

    def knn_scores(self, active_mask: torch.Tensor, true_masses: torch.Tensor, k_neighbors: int = 3) -> torch.Tensor:
        """Predict inactive chunk mass from the k most correlated active sensors."""
        if self.similarity is None:
            return self.predicted_mass.clone()

        S = self.similarity
        scores = self.predicted_mass.clone()
        scores[active_mask] = true_masses[active_mask]

        active_idx = torch.nonzero(active_mask, as_tuple=False).flatten()
        inactive_idx = torch.nonzero(~active_mask, as_tuple=False).flatten()

        if active_idx.numel() == 0:
            return scores

        for i in inactive_idx.tolist():
            weights = S[i, active_idx]
            if weights.sum() <= 1e-12:
                continue
            kk = min(k_neighbors, weights.numel())
            top = torch.topk(weights, k=kk)
            w = top.values
            aidx = active_idx[top.indices]
            # Similarity-weighted mean of the observed sensor masses.
            scores[i] = (w * true_masses[aidx]).sum() / (w.sum() + 1e-12)

        return scores

    def diffusion_scores(
        self,
        active_mask: torch.Tensor,
        true_masses: torch.Tensor,
        diffusion_steps: int = 8,
        alpha: float = 0.7,
    ) -> torch.Tensor:
        """Boundary-value diffusion: clamp active chunks, interpolate the rest."""
        if self.similarity is None:
            return self.predicted_mass.clone()

        S = self.similarity
        # Row-normalized transition matrix over the similarity graph.
        W = S / (S.sum(dim=1, keepdim=True) + 1e-12)

        scores = self.predicted_mass.clone()
        scores[active_mask] = true_masses[active_mask]

        for _ in range(diffusion_steps):
            proposal = W @ scores
            scores = alpha * proposal + (1.0 - alpha) * scores
            # Re-clamp observed values each iteration (Dirichlet-style boundary).
            scores[active_mask] = true_masses[active_mask]

        return torch.clamp(scores, min=0.0)

    def oracle_topk_mask(self, true_masses: torch.Tensor) -> torch.Tensor:
        """Mask of the truly largest-mass chunks (uses dense info; diagnostics only)."""
        k = self.k_active()
        mask = torch.zeros(self.n_chunks, dtype=torch.bool, device=self.device)
        mask[torch.topk(true_masses, k=k).indices] = True
        return mask
449
+
450
+
451
+ # -----------------------------
452
+ # Gradient installation and metrics
453
+ # -----------------------------
454
+
455
@torch.no_grad()
def install_active_only_grads(sched: ChunkScheduler, active_mask: torch.Tensor) -> None:
    """Zero the gradients of every inactive chunk, leaving active chunks intact.

    Dense mode is a no-op.  Instead of a Python loop per chunk (duplicated for
    weight and bias in the original), each chunk-level inactive flag is expanded
    to a per-output-row mask with `repeat_interleave`, so all inactive rows are
    zeroed in one indexed assignment per tensor.  Behavior is unchanged.
    """
    if sched.scheduler == "dense":
        return

    for m, ids in sched.module_to_chunk_ids.items():
        inactive = ~active_mask[ids]
        if not bool(inactive.any()):
            continue
        # One flag per chunk -> one flag per output row of this module.
        row_inactive = torch.repeat_interleave(inactive, sched.chunk_size)
        if m.weight.grad is not None:
            m.weight.grad[row_inactive] = 0.0
        if m.bias is not None and m.bias.grad is not None:
            m.bias.grad[row_inactive] = 0.0
475
+
476
+
477
def dense_cosine_active_only(vecs: List[torch.Tensor], active_mask: torch.Tensor) -> float:
    """Cosine similarity between the full gradient and its active-only projection."""
    full = torch.cat([v.flatten() for v in vecs])
    masked = torch.cat(
        [
            v.flatten() if bool(active_mask[i]) else torch.zeros_like(v).flatten()
            for i, v in enumerate(vecs)
        ]
    )
    return float(F.cosine_similarity(full, masked, dim=0).item())
484
+
485
+
486
def jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
    """Jaccard overlap of two boolean masks (0.0 when both are empty)."""
    intersection = (a & b).sum().float()
    union = torch.clamp((a | b).sum().float(), min=1.0)
    return float((intersection / union).item())
490
+
491
+
492
class SimpleAdam:
    """Minimal Adam (beta1=0.9, beta2=0.999, eps=1e-8, no bias correction)."""

    def __init__(self, model: nn.Module, lr: float = 3e-4):
        self.model = model
        self.lr = lr
        self.state: Dict[torch.nn.Parameter, Dict[str, torch.Tensor]] = {}

    def zero_grad(self):
        # Drop grads entirely instead of zero-filling them.
        for param in self.model.parameters():
            param.grad = None

    @torch.no_grad()
    def step(self):
        for param in self.model.parameters():
            grad = param.grad
            if grad is None:
                continue
            if param not in self.state:
                self.state[param] = {"m": torch.zeros_like(param), "v": torch.zeros_like(param)}
            slot = self.state[param]
            m, v = slot["m"], slot["v"]
            m.mul_(0.9).add_(grad, alpha=0.1)
            v.mul_(0.999).addcmul_(grad, grad, value=0.001)
            param.sub_(m / (v.sqrt() + 1e-8), alpha=self.lr)
514
+
515
+
516
def evaluate(model: nn.Module, corpus: CharCorpus, batch_size: int, seed: int) -> float:
    """One-batch validation loss with a fixed seed; restores train mode."""
    model.eval()
    with torch.no_grad():
        xb, yb = corpus.get_batch("val", batch_size, generator=make_cpu_generator(seed))
        _, loss = model(xb, yb)
    model.train()
    return float(loss.item())
523
+
524
+
525
def run_experiment(
    scheduler_name: Scheduler,
    device: str,
    steps: int,
    batch_size: int,
    block_size: int,
    n_layer: int,
    n_head: int,
    n_embd: int,
    chunk_size: int,
    active_fraction: float,
    warmup_steps: int,
    benchmark_sync: bool,
) -> Dict[str, float]:
    """Train MiniGPT for `steps` under one scheduler and report diagnostics.

    Each step does a DENSE backward pass (so oracle metrics can be measured),
    then zeroes inactive chunk gradients before the optimizer step.

    Returns a dict with:
        val    -- final one-batch validation loss
        ms     -- mean wall-clock milliseconds per step
        cos    -- mean active-only gradient cosine vs. the dense gradient
        jacc   -- mean Jaccard overlap between chosen mask and oracle top-k mask
        stable -- mean Jaccard overlap between consecutive masks
    (cos/jacc/stable are NaN for the dense scheduler, which records no rows.)
    """
    set_seed(42)

    corpus = CharCorpus(make_synthetic_corpus(), block_size, device)
    model = MiniGPT(corpus.vocab_size, block_size, n_layer, n_head, n_embd, 0.0).to(device)
    opt = SimpleAdam(model, lr=3e-4)
    sched = ChunkScheduler(
        model=model,
        chunk_size=chunk_size,
        active_fraction=active_fraction,
        device=device,
        scheduler=scheduler_name,
    )

    metric_rows = []

    # Synchronize before timing so queued kernels don't leak into the window.
    if benchmark_sync:
        sync_device(device)
    t0 = time.perf_counter()

    for step in range(steps):
        # Per-step CPU generator makes every scheduler see identical batches.
        x, y = corpus.get_batch("train", batch_size, generator=make_cpu_generator(step))

        active_mask = sched.choose_mask(step=step, warmup_steps=warmup_steps)

        # Dense backward: full gradients exist so oracle metrics can be computed.
        opt.zero_grad()
        _, loss = model(x, y)
        loss.backward()

        vecs = sched.chunk_gradient_vectors()
        masses = sched.chunk_masses_from_vecs(vecs)

        # Diagnostics only after warmup and only for sparse schedulers.
        if step >= warmup_steps and scheduler_name != "dense":
            oracle = sched.oracle_topk_mask(masses)
            row = {
                "cos": dense_cosine_active_only(vecs, active_mask),
                "jacc": jaccard(active_mask, oracle),
                "stable": jaccard(active_mask, sched.prev_mask) if sched.prev_mask is not None else 0.0,
                "val": evaluate(model, corpus, batch_size, seed=10_000 + step) if step % 50 == 0 else float("nan"),
            }
            metric_rows.append(row)

        # Zero the inactive chunk gradients so the update is truly sparse.
        install_active_only_grads(sched, active_mask)

        # Important: update scheduler from the active observations only.
        # Dense gradients exist for diagnostics, but unselected chunks should not
        # teach the sparse scheduler after warmup.
        observed_for_scheduler = active_mask if step >= warmup_steps else torch.ones_like(active_mask)
        sched.update_from_observed(
            active_mask=observed_for_scheduler,
            true_masses=masses,
            step=step,
            warmup_steps=warmup_steps,
        )

        sched.prev_mask = active_mask.clone()

        opt.step()

    if benchmark_sync:
        sync_device(device)
    elapsed = time.perf_counter() - t0

    val_loss = evaluate(model, corpus, batch_size, seed=12345)

    # Average the per-step diagnostics; dense mode has none.
    if metric_rows:
        avg_cos = sum(r["cos"] for r in metric_rows) / len(metric_rows)
        avg_jacc = sum(r["jacc"] for r in metric_rows) / len(metric_rows)
        avg_stable = sum(r["stable"] for r in metric_rows) / len(metric_rows)
    else:
        avg_cos = float("nan")
        avg_jacc = float("nan")
        avg_stable = float("nan")

    return {
        "val": val_loss,
        "ms": 1000.0 * elapsed / steps,
        "cos": avg_cos,
        "jacc": avg_jacc,
        "stable": avg_stable,
    }
619
+
620
+
621
def main() -> None:
    """CLI entry point: run every scheduler variant and print a comparison table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--block_size", type=int, default=128)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=8)
    parser.add_argument("--n_embd", type=int, default=512)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.10)
    parser.add_argument("--warmup_steps", type=int, default=25)
    parser.add_argument("--device", type=str, default="mps")
    parser.add_argument("--benchmark_sync", action="store_true")
    args = parser.parse_args()

    schedulers: List[Scheduler] = [
        "dense",
        "ema_topk",
        "knn_scheduler",
        "graph_scheduler",
        "random",
    ]

    print("\nSensor-based mask scheduling diagnostic")
    print(f"device={args.device} steps={args.steps} d={args.n_embd} chunks={args.chunk_size}")
    print(f"active_fraction={args.active_fraction} warmup={args.warmup_steps}\n")
    print(f"{'scheduler':>18s} | {'val':>8s} | {'ms/step':>8s} | {'grad_cos':>8s} | {'jacc':>8s} | {'stable':>8s}")
    print("-" * 78)

    # Everything except the scheduler name is shared across runs.
    shared = dict(
        device=args.device,
        steps=args.steps,
        batch_size=args.batch_size,
        block_size=args.block_size,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_embd=args.n_embd,
        chunk_size=args.chunk_size,
        active_fraction=args.active_fraction,
        warmup_steps=args.warmup_steps,
        benchmark_sync=args.benchmark_sync,
    )

    for name in schedulers:
        result = run_experiment(scheduler_name=name, **shared)
        print(
            f"{name:>18s} | "
            f"{result['val']:8.4f} | "
            f"{result['ms']:8.2f} | "
            f"{result['cos']:8.3f} | "
            f"{result['jacc']:8.3f} | "
            f"{result['stable']:8.3f}"
        )
675
+
676
# Standard script entry-point guard: run the scheduler comparison sweep only
# when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
experiments/sparse_transformer_v17_radar_scheduler.py ADDED
@@ -0,0 +1,725 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v17: Radar Scheduler Diagnostic.
3
+
4
+ v16 showed:
5
+ - Directly predicting inactive gradient vectors is harmful.
6
+ - Using active chunks as sensors to schedule the next active mask works.
7
+ - KNN/graph sensors improve oracle overlap and gradient cosine, but churn masks.
8
+ - EMA is stable, but can be blind.
9
+
10
+ v17 tests the fusion:
11
+
12
+ radar_score = alpha * normalized_ema_mass
13
+ + (1 - alpha) * normalized_sensor_score
14
+
15
+ where sensor_score is either:
16
+ - KNN over a learned chunk-mass correlation graph
17
+ - graph diffusion / boundary interpolation over that graph
18
+
19
+ This is still a diagnostic script. It computes dense gradients so we can measure:
20
+ - oracle Jaccard
21
+ - active-only full-gradient cosine
22
+ - mask stability
23
+ - validation loss after sparse active-only updates
24
+
25
+ No inactive gradients are invented. In sparse modes, inactive chunks get zeroed.
26
+
27
+ Run:
28
+ python3 sparse_transformer_v17_radar_scheduler.py --device mps --benchmark_sync
29
+
30
+ Useful:
31
+ python3 sparse_transformer_v17_radar_scheduler.py --device mps --steps 500 --n_embd 512
32
+ python3 sparse_transformer_v17_radar_scheduler.py --device mps --steps 500 --n_embd 1024
33
+ python3 sparse_transformer_v17_radar_scheduler.py --device mps --steps 500 --n_embd 1024 --alphas 0.25 0.5 0.75 0.9
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import argparse
39
+ import math
40
+ import random
41
+ import time
42
+ from typing import Dict, List, Literal, Optional, Tuple
43
+
44
+ import torch
45
+
46
+ torch.set_num_threads(1)
47
+ import torch.nn as nn
48
+ import torch.nn.functional as F
49
+
50
+ Scheduler = Literal[
51
+ "dense",
52
+ "ema_topk",
53
+ "knn_scheduler",
54
+ "graph_scheduler",
55
+ "radar_knn",
56
+ "radar_graph",
57
+ "random",
58
+ ]
59
+
60
+
61
def sync_device(device: str) -> None:
    """Wait for all pending work on `device` (CPU and unknown devices: no-op)."""
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
66
+
67
+
68
def set_seed(seed: int) -> None:
    """Seed Python's and torch's global RNGs."""
    random.seed(seed)
    torch.manual_seed(seed)
71
+
72
+
73
def make_cpu_generator(seed: int) -> torch.Generator:
    """Deterministically seeded CPU torch.Generator."""
    generator = torch.Generator(device="cpu")
    generator.manual_seed(seed)
    return generator
77
+
78
+
79
def normalize_scores(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Robust [0, 1] normalization.

    We avoid z-score because heavy tails are the signal, not necessarily noise.
    NaN and infinities are mapped to 0 first; a constant tensor maps to zeros.
    """
    clean = torch.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
    lo, hi = clean.min(), clean.max()
    if (hi - lo) <= eps:
        return torch.zeros_like(clean)
    return (clean - lo) / (hi - lo + eps)
90
+
91
+
92
+ # -----------------------------
93
+ # Data
94
+ # -----------------------------
95
+
96
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Deterministic toy corpus: one random word-salad sentence per line."""
    rng = random.Random(seed)
    words = [
        "ada", "turing", "grace", "lovelace", "gradients",
        "tokens", "circuits", "features", "boldly", "strangely",
        "matrix", "attention", "kernel", "entropy", "signal",
    ]
    sentences = []
    for _ in range(n_sentences):
        # randint is evaluated before choices, preserving the RNG call order.
        sentences.append(" ".join(rng.choices(words, k=rng.randint(4, 10))) + ".")
    return "\n".join(sentences)
107
+
108
+
109
class CharCorpus:
    """Character corpus: 90/10 train/val split, random block batches."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device
        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        split_at = int(0.9 * len(encoded))
        self.train_data = encoded[:split_at]
        self.val_data = encoded[split_at:]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample `batch_size` random (input, next-char target) blocks."""
        source = self.train_data if split == "train" else self.val_data
        starts = torch.randint(len(source) - self.block_size - 1, (batch_size,), generator=generator)
        inputs = torch.stack([source[s : s + self.block_size] for s in starts])
        targets = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return inputs.to(self.device), targets.to(self.device)
132
+
133
+
134
+ # -----------------------------
135
+ # Model
136
+ # -----------------------------
137
+
138
class SparseLinear(nn.Linear):
    """Marker subclass of nn.Linear.

    Behaviorally identical to nn.Linear; exists so the scheduler can locate the
    layers participating in chunked sparse updates via isinstance
    (see get_sparse_linears).
    """
    pass
140
+
141
+
142
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with an explicit triangular mask buffer."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        tril = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", tril.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        # Fused q/k/v projection, then split into heads: [B, n_head, T, head_dim].
        q, k, v = self.c_attn(x).split(C, dim=2)

        def heads(t: torch.Tensor) -> torch.Tensor:
            return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = heads(q), heads(k), heads(v)

        scores = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)
        # Forbid attention to future positions.
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))

        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
173
+
174
+
175
class FeedForward(nn.Module):
    """Position-wise MLP: expand 4x, GELU, project back, dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
184
+
185
+
186
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual add."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
198
+
199
+
200
class MiniGPT(nn.Module):
    """Minimal GPT: token + position embeddings, N transformer blocks, LM head."""

    def __init__(
        self,
        vocab_size: int,
        block_size: int,
        n_layer: int,
        n_head: int,
        n_embd: int,
        dropout: float,
    ):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        layers = [Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when no targets are given."""
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)[None, :, :]
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)

        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
232
+
233
+
234
def get_sparse_linears(model: nn.Module) -> List[SparseLinear]:
    """Collect every SparseLinear submodule of *model* in traversal order."""
    found: List[SparseLinear] = []
    for module in model.modules():
        if isinstance(module, SparseLinear):
            found.append(module)
    return found
236
+
237
+
238
+ # -----------------------------
239
+ # Radar scheduler
240
+ # -----------------------------
241
+
242
class RadarScheduler:
    """Chunk-level gradient scheduler.

    Splits every SparseLinear's output rows into fixed-size chunks and decides,
    each step, which chunks get to keep their gradients ("active"). Selection
    can be random, EMA-of-mass top-k, similarity-graph based (kNN/diffusion),
    or a blend ("radar_*") controlled by ``alpha``. A dense warmup phase
    observes all chunks to seed the EMA and the similarity graph.
    """

    def __init__(
        self,
        model: nn.Module,
        chunk_size: int,
        active_fraction: float,
        device: str,
        scheduler: Scheduler,
        alpha: float,
        mass_beta: float = 0.95,
        similarity_history: int = 128,
        min_similarity_history: int = 8,
    ):
        self.model = model
        self.chunk_size = chunk_size
        self.active_fraction = active_fraction
        self.device = device
        self.scheduler = scheduler
        self.alpha = float(alpha)
        # EMA coefficient for predicted chunk gradient mass.
        self.mass_beta = mass_beta
        # Max / min number of dense-warmup mass snapshots kept for the graph.
        self.similarity_history = similarity_history
        self.min_similarity_history = min_similarity_history

        self.linears = get_sparse_linears(model)
        # Per-module global chunk ids, and the inverse (chunk id -> (module, local idx)).
        self.module_to_chunk_ids: Dict[nn.Module, torch.Tensor] = {}
        self.chunk_to_module_local: List[Tuple[nn.Module, int]] = []

        offset = 0
        for m in self.linears:
            assert m.out_features % chunk_size == 0, (
                f"out_features {m.out_features} not divisible by chunk_size {chunk_size}"
            )
            n_chunks = m.out_features // chunk_size
            ids = torch.arange(offset, offset + n_chunks, device=device)
            self.module_to_chunk_ids[m] = ids
            for local_c in range(n_chunks):
                self.chunk_to_module_local.append((m, local_c))
            offset += n_chunks

        self.n_chunks = offset
        # EMA estimate of each chunk's gradient norm (0 == never observed).
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        self.mass_history: List[torch.Tensor] = []

        self.current_mask = torch.ones(self.n_chunks, dtype=torch.bool, device=device)
        # Sensor-derived (kNN/diffusion) scores proposed for the next step.
        self.next_sensor_scores = torch.zeros(self.n_chunks, device=device)
        self.next_scores = torch.zeros(self.n_chunks, device=device)
        self.prev_mask: Optional[torch.Tensor] = None
        # Chunk-by-chunk correlation matrix; None until enough history exists.
        self.similarity: Optional[torch.Tensor] = None

    def k_active(self) -> int:
        """Number of chunks kept active (at least 1)."""
        return max(1, int(self.active_fraction * self.n_chunks))

    def choose_mask(self, step: int, warmup_steps: int) -> torch.Tensor:
        """Pick this step's active-chunk boolean mask and cache it."""
        # Dense policy and the warmup phase activate everything.
        if self.scheduler == "dense" or step < warmup_steps:
            self.current_mask = torch.ones(self.n_chunks, dtype=torch.bool, device=self.device)
            return self.current_mask

        k = self.k_active()
        mask = torch.zeros(self.n_chunks, dtype=torch.bool, device=self.device)

        if self.scheduler == "random":
            idx = torch.randperm(self.n_chunks, device=self.device)[:k]
        else:
            scores = self.score_for_selection()
            # Tiny random jitter breaks ties deterministically enough for topk.
            scores = scores + 1e-9 * torch.rand_like(scores)
            idx = torch.topk(scores, k=k).indices

        mask[idx] = True
        self.current_mask = mask
        return mask

    def score_for_selection(self) -> torch.Tensor:
        """Per-chunk selection scores according to the configured policy."""
        if self.scheduler == "random":
            return torch.zeros_like(self.predicted_mass)

        if self.scheduler == "ema_topk":
            return self.predicted_mass

        if self.scheduler in ("knn_scheduler", "graph_scheduler"):
            # Fall back to the EMA until the sensor has produced anything.
            if torch.count_nonzero(self.next_sensor_scores).item() == 0:
                return self.predicted_mass
            return self.next_sensor_scores

        if self.scheduler in ("radar_knn", "radar_graph"):
            sensor = self.next_sensor_scores
            if torch.count_nonzero(sensor).item() == 0:
                sensor = self.predicted_mass

            # Blend normalized EMA and sensor scores: alpha weights the EMA.
            ema_n = normalize_scores(self.predicted_mass)
            sensor_n = normalize_scores(sensor)
            return self.alpha * ema_n + (1.0 - self.alpha) * sensor_n

        if self.scheduler == "dense":
            return torch.ones_like(self.predicted_mass)

        raise ValueError(f"Unknown scheduler: {self.scheduler}")

    @torch.no_grad()
    def chunk_gradient_vectors(self) -> List[torch.Tensor]:
        """Flattened (weight rows + bias slice) gradient vector per chunk.

        Missing grads are replaced by zeros so every chunk yields a vector.
        """
        vecs: List[torch.Tensor] = []
        for m, local_c in self.chunk_to_module_local:
            start = local_c * self.chunk_size
            end = (local_c + 1) * self.chunk_size

            parts = []
            if m.weight.grad is None:
                parts.append(torch.zeros_like(m.weight[start:end]).flatten())
            else:
                parts.append(m.weight.grad[start:end].detach().flatten())

            if m.bias is not None:
                if m.bias.grad is None:
                    parts.append(torch.zeros_like(m.bias[start:end]).flatten())
                else:
                    parts.append(m.bias.grad[start:end].detach().flatten())

            vecs.append(torch.cat(parts))
        return vecs

    @torch.no_grad()
    def chunk_masses_from_vecs(self, vecs: List[torch.Tensor]) -> torch.Tensor:
        """L2 norm of each chunk gradient vector, on the scheduler's device."""
        return torch.stack([v.norm() for v in vecs]).to(self.device)

    @torch.no_grad()
    def update_from_observed(
        self,
        observed_mask: torch.Tensor,
        true_masses: torch.Tensor,
        step: int,
        warmup_steps: int,
    ) -> None:
        """Fold observed chunk masses into the EMA and refresh sensor scores.

        Only chunks flagged in ``observed_mask`` update the EMA; a chunk seen
        for the first time (EMA still zero) adopts its mass directly.
        """
        observed = observed_mask

        never_seen = observed & (self.predicted_mass == 0)
        already_seen = observed & ~never_seen

        self.predicted_mass[never_seen] = true_masses[never_seen]
        self.predicted_mass[already_seen] = (
            self.mass_beta * self.predicted_mass[already_seen]
            + (1.0 - self.mass_beta) * true_masses[already_seen]
        )

        # Dense warmup teaches the similarity graph.
        if step < warmup_steps:
            self.mass_history.append(true_masses.detach().clone())
            if len(self.mass_history) > self.similarity_history:
                self.mass_history = self.mass_history[-self.similarity_history :]

            if len(self.mass_history) >= self.min_similarity_history:
                self.similarity = self.build_similarity()

        if self.scheduler in ("knn_scheduler", "radar_knn"):
            self.next_sensor_scores = self.knn_scores(observed, true_masses)
        elif self.scheduler in ("graph_scheduler", "radar_graph"):
            self.next_sensor_scores = self.diffusion_scores(observed, true_masses)
        else:
            self.next_sensor_scores = self.predicted_mass.clone()

        self.next_scores = self.score_for_selection()

    def build_similarity(self) -> torch.Tensor:
        """Non-negative chunk-chunk correlation matrix from warmup mass history."""
        H = torch.stack(self.mass_history, dim=0)  # [history, chunks]
        # Standardize each chunk's mass series before correlating.
        H = H - H.mean(dim=0, keepdim=True)
        H = H / (H.std(dim=0, keepdim=True) + 1e-6)

        S = (H.T @ H) / max(1, H.shape[0] - 1)
        S = torch.clamp(S, min=0.0)
        S.fill_diagonal_(0.0)

        # Within-layer only. This keeps the graph interpretable and avoids
        # overfitting tiny cross-layer coincidences.
        allowed = torch.zeros_like(S, dtype=torch.bool)
        for _, ids in self.module_to_chunk_ids.items():
            allowed[ids[:, None], ids[None, :]] = True

        S = torch.where(allowed, S, torch.zeros_like(S))
        return S

    def knn_scores(
        self,
        active_mask: torch.Tensor,
        true_masses: torch.Tensor,
        k_neighbors: int = 3,
    ) -> torch.Tensor:
        """Estimate inactive chunk masses from their k most similar active peers."""
        if self.similarity is None:
            return self.predicted_mass.clone()

        S = self.similarity
        scores = self.predicted_mass.clone()
        # Active chunks are known exactly; anchor them to the true masses.
        scores[active_mask] = true_masses[active_mask]

        active_idx = torch.nonzero(active_mask, as_tuple=False).flatten()
        inactive_idx = torch.nonzero(~active_mask, as_tuple=False).flatten()

        if active_idx.numel() == 0:
            return scores

        for i in inactive_idx.tolist():
            weights = S[i, active_idx]
            if weights.sum() <= 1e-12:
                continue  # no similar active chunk; keep the EMA estimate
            kk = min(k_neighbors, weights.numel())
            top = torch.topk(weights, k=kk)
            w = top.values
            aidx = active_idx[top.indices]
            # Similarity-weighted average of the neighbors' true masses.
            scores[i] = (w * true_masses[aidx]).sum() / (w.sum() + 1e-12)

        return scores

    def diffusion_scores(
        self,
        active_mask: torch.Tensor,
        true_masses: torch.Tensor,
        diffusion_steps: int = 8,
        alpha: float = 0.7,
    ) -> torch.Tensor:
        """Propagate observed masses over the similarity graph by iterated
        row-stochastic smoothing, re-clamping active chunks each iteration."""
        if self.similarity is None:
            return self.predicted_mass.clone()

        S = self.similarity
        # Row-normalize so each diffusion step is a weighted neighbor average.
        W = S / (S.sum(dim=1, keepdim=True) + 1e-12)

        scores = self.predicted_mass.clone()
        scores[active_mask] = true_masses[active_mask]

        for _ in range(diffusion_steps):
            proposal = W @ scores
            scores = alpha * proposal + (1.0 - alpha) * scores
            # Observed chunks stay pinned to their true values.
            scores[active_mask] = true_masses[active_mask]

        return torch.clamp(scores, min=0.0)

    def oracle_topk_mask(self, true_masses: torch.Tensor) -> torch.Tensor:
        """Upper-bound mask: top-k chunks by the true current masses."""
        k = self.k_active()
        mask = torch.zeros(self.n_chunks, dtype=torch.bool, device=self.device)
        mask[torch.topk(true_masses, k=k).indices] = True
        return mask
479
+
480
+
481
+ # -----------------------------
482
+ # Gradient installation and metrics
483
+ # -----------------------------
484
+
485
@torch.no_grad()
def install_active_only_grads(sched: RadarScheduler, active_mask: torch.Tensor) -> None:
    """Zero the gradients of every inactive chunk so the optimizer only moves
    active chunks.

    Args:
        sched: scheduler holding the chunk layout (``module_to_chunk_ids`` maps
            each module to its global chunk ids; chunks are contiguous
            ``chunk_size``-row slices along dim 0 of weight/bias).
        active_mask: global boolean mask, True for chunks that keep gradients.
    """
    if sched.scheduler == "dense":
        return  # dense policy trains everything; nothing to mask

    cs = sched.chunk_size
    for m, ids in sched.module_to_chunk_ids.items():
        inactive = ~active_mask[ids]
        if not bool(inactive.any()):
            continue

        # Chunks are contiguous row-slices along dim 0, so a single masked
        # write replaces the per-chunk Python loop (weight and bias alike).
        # view() shares storage with the grad, so the assignment is in place.
        if m.weight.grad is not None:
            m.weight.grad.view(inactive.numel(), cs, -1)[inactive] = 0.0
        if m.bias is not None and m.bias.grad is not None:
            m.bias.grad.view(inactive.numel(), cs)[inactive] = 0.0
506
+
507
+
508
def dense_cosine_active_only(vecs: List[torch.Tensor], active_mask: torch.Tensor) -> float:
    """Cosine similarity between the full gradient and its active-only version.

    Inactive chunk vectors are replaced by zeros in the approximation.
    """
    full = torch.cat([v.flatten() for v in vecs])
    kept = [
        v.flatten() if bool(active_mask[i]) else torch.zeros_like(v).flatten()
        for i, v in enumerate(vecs)
    ]
    approx = torch.cat(kept)
    return float(F.cosine_similarity(full, approx, dim=0).item())
515
+
516
+
517
def jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
    """Jaccard overlap |a&b| / |a|b| of two boolean masks (0.0 if both empty)."""
    intersection = (a & b).sum().float()
    union = torch.clamp((a | b).sum().float(), min=1.0)
    return float((intersection / union).item())
521
+
522
+
523
class SimpleAdam:
    """Bare-bones Adam (fixed betas 0.9/0.999, no bias correction) over all
    parameters of a model. Parameters without a gradient are skipped."""

    def __init__(self, model: nn.Module, lr: float = 3e-4):
        self.model = model
        self.lr = lr
        # Lazily created first/second moment buffers, keyed by parameter.
        self.state: Dict[torch.nn.Parameter, Dict[str, torch.Tensor]] = {}

    def zero_grad(self):
        """Drop all gradients (sets .grad to None)."""
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        for p in self.model.parameters():
            grad = p.grad
            if grad is None:
                continue
            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            moments = self.state[p]
            moments["m"].mul_(0.9).add_(grad, alpha=0.1)
            moments["v"].mul_(0.999).addcmul_(grad, grad, value=0.001)
            p.sub_(moments["m"] / (torch.sqrt(moments["v"]) + 1e-8), alpha=self.lr)
545
+
546
+
547
def evaluate(model: nn.Module, corpus: CharCorpus, batch_size: int, seed: int) -> float:
    """Validation loss on one fixed (seeded) batch.

    Fix: the model is switched back to train mode in a ``finally`` block, so an
    exception during the forward pass no longer leaves it stuck in eval mode.
    """
    model.eval()
    try:
        with torch.no_grad():
            x, y = corpus.get_batch("val", batch_size, generator=make_cpu_generator(seed))
            _, loss = model(x, y)
    finally:
        model.train()
    return float(loss.item())
554
+
555
+
556
def run_experiment(
    scheduler_name: Scheduler,
    alpha: float,
    device: str,
    steps: int,
    batch_size: int,
    block_size: int,
    n_layer: int,
    n_head: int,
    n_embd: int,
    chunk_size: int,
    active_fraction: float,
    warmup_steps: int,
    benchmark_sync: bool,
) -> Dict[str, float]:
    """Train one MiniGPT under the given scheduler and return summary metrics.

    Returns a dict with: "val" (final validation loss), "ms" (mean ms/step),
    and post-warmup averages of "cos" (active-only gradient cosine), "jacc"
    (overlap with the current-step oracle mask), "stable" (mask overlap with
    the previous step). The latter three are NaN for dense/no-metric runs.
    Note: loss.backward() still computes dense gradients; masses/oracle use
    them for audit, then inactive-chunk grads are zeroed before the update.
    """
    set_seed(42)

    corpus = CharCorpus(make_synthetic_corpus(), block_size, device)
    model = MiniGPT(corpus.vocab_size, block_size, n_layer, n_head, n_embd, 0.0).to(device)
    opt = SimpleAdam(model, lr=3e-4)
    sched = RadarScheduler(
        model=model,
        chunk_size=chunk_size,
        active_fraction=active_fraction,
        device=device,
        scheduler=scheduler_name,
        alpha=alpha,
    )

    metric_rows = []

    # sync_device (defined elsewhere) presumably flushes the device queue so
    # the timer measures real compute — TODO confirm for MPS/CUDA.
    if benchmark_sync:
        sync_device(device)
    t0 = time.perf_counter()

    for step in range(steps):
        # Step-seeded CPU generator keeps the batch sequence reproducible.
        x, y = corpus.get_batch("train", batch_size, generator=make_cpu_generator(step))

        active_mask = sched.choose_mask(step=step, warmup_steps=warmup_steps)

        opt.zero_grad()
        _, loss = model(x, y)
        loss.backward()

        # Dense per-chunk gradient vectors and their norms (audit + scheduler).
        vecs = sched.chunk_gradient_vectors()
        masses = sched.chunk_masses_from_vecs(vecs)

        if step >= warmup_steps and scheduler_name != "dense":
            oracle = sched.oracle_topk_mask(masses)
            metric_rows.append(
                {
                    "cos": dense_cosine_active_only(vecs, active_mask),
                    "jacc": jaccard(active_mask, oracle),
                    "stable": jaccard(active_mask, sched.prev_mask) if sched.prev_mask is not None else 0.0,
                }
            )

        # Zero inactive-chunk grads so the optimizer only moves active chunks.
        install_active_only_grads(sched, active_mask)

        # The scheduler only learns from active chunks after warmup.
        # During warmup it observes everything to build the similarity graph.
        observed_for_scheduler = active_mask if step >= warmup_steps else torch.ones_like(active_mask)
        sched.update_from_observed(
            observed_mask=observed_for_scheduler,
            true_masses=masses,
            step=step,
            warmup_steps=warmup_steps,
        )

        sched.prev_mask = active_mask.clone()
        opt.step()

    if benchmark_sync:
        sync_device(device)
    elapsed = time.perf_counter() - t0

    val_loss = evaluate(model, corpus, batch_size, seed=12345)

    if metric_rows:
        avg_cos = sum(r["cos"] for r in metric_rows) / len(metric_rows)
        avg_jacc = sum(r["jacc"] for r in metric_rows) / len(metric_rows)
        avg_stable = sum(r["stable"] for r in metric_rows) / len(metric_rows)
    else:
        avg_cos = float("nan")
        avg_jacc = float("nan")
        avg_stable = float("nan")

    return {
        "val": val_loss,
        "ms": 1000.0 * elapsed / steps,
        "cos": avg_cos,
        "jacc": avg_jacc,
        "stable": avg_stable,
    }
650
+
651
+
652
def build_runs(alphas: List[float]) -> List[Tuple[Scheduler, float, str]]:
    """Enumerate (scheduler, alpha, label) configurations to benchmark.

    Order: fixed baselines, radar_knn for each alpha, radar_graph for each
    alpha, then the random control.
    """
    baselines: List[Tuple[Scheduler, float, str]] = [
        ("dense", 1.0, "dense"),
        ("ema_topk", 1.0, "ema_topk"),
        ("knn_scheduler", 0.0, "knn"),
        ("graph_scheduler", 0.0, "graph"),
    ]
    radar_knn = [("radar_knn", a, f"radar_knn_a{a:g}") for a in alphas]
    radar_graph = [("radar_graph", a, f"radar_graph_a{a:g}") for a in alphas]
    return baselines + radar_knn + radar_graph + [("random", 0.0, "random")]
667
+
668
+
669
def main() -> None:
    """CLI entry point: run every scheduler configuration and print a table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--block_size", type=int, default=128)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=8)
    parser.add_argument("--n_embd", type=int, default=512)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.10)
    parser.add_argument("--warmup_steps", type=int, default=25)
    parser.add_argument("--alphas", type=float, nargs="+", default=[0.25, 0.5, 0.75, 0.9])
    parser.add_argument("--device", type=str, default="mps")
    parser.add_argument("--benchmark_sync", action="store_true")
    args = parser.parse_args()

    # Banner describing the sweep configuration.
    print("\nRadar scheduler diagnostic")
    print(f"device={args.device} steps={args.steps} d={args.n_embd} chunks={args.chunk_size}")
    print(f"active_fraction={args.active_fraction} warmup={args.warmup_steps}")
    print(f"alphas={args.alphas}\n")
    header = (
        f"{'run':>18s} | {'val':>8s} | {'ms/step':>8s} | "
        f"{'grad_cos':>8s} | {'jacc':>8s} | {'stable':>8s}"
    )
    print(header)
    print("-" * 78)

    for scheduler_name, alpha, label in build_runs(args.alphas):
        result = run_experiment(
            scheduler_name=scheduler_name,
            alpha=alpha,
            device=args.device,
            steps=args.steps,
            batch_size=args.batch_size,
            block_size=args.block_size,
            n_layer=args.n_layer,
            n_head=args.n_head,
            n_embd=args.n_embd,
            chunk_size=args.chunk_size,
            active_fraction=args.active_fraction,
            warmup_steps=args.warmup_steps,
            benchmark_sync=args.benchmark_sync,
        )

        # One aligned table row per configuration.
        row = " | ".join(
            [
                f"{label:>18s}",
                f"{result['val']:8.4f}",
                f"{result['ms']:8.2f}",
                f"{result['cos']:8.3f}",
                f"{result['jacc']:8.3f}",
                f"{result['stable']:8.3f}",
            ]
        )
        print(row)
722
+
723
+
724
# Run the diagnostic sweep when executed directly as a script.
if __name__ == "__main__":
    main()
experiments/sparse_transformer_v6.py ADDED
@@ -0,0 +1,596 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v6: stable predicted-magnitude masks, no dense refresh by default.
3
+
4
+ This prototype is designed to test the next hypothesis after the spiral/MLP runs:
5
+
6
+ The important gradient support is heavy-tailed and temporally stable enough
7
+ that we can select active parameter blocks from history, freeze the rest, and
8
+ still train a harder sequence model.
9
+
10
+ Key fixes versus v5
11
+ -------------------
12
+ 1. Harder model: a small causal Transformer language model.
13
+ 2. No periodic dense refresh by default: --warmup_steps 0.
14
+ 3. The selector only learns from blocks it actually observes/updates.
15
+ 4. Inactive Linear rows are truly frozen by MaskedAdam. This matters because
16
+ ordinary Adam can still move parameters with zero gradients through momentum.
17
+ 5. A true current-step oracle is included as an audit upper bound.
18
+ 6. Random masks are included as a control.
19
+
20
+ Important limitation
21
+ --------------------
22
+ This still calls loss.backward(), so PyTorch computes dense gradients. Those full
23
+ current gradients are used for audit metrics and for the oracle run only. The
24
+ practical predicted_magnitude selector is not allowed to update its statistics
25
+ from inactive full gradients.
26
+
27
+ Actual speedup would require structured partial backward/custom kernels.
28
+
29
+ Run
30
+ ---
31
+ python3 sparse_transformer_v6.py --quick
32
+ python3 sparse_transformer_v6.py --steps 1000 --active_fractions 0.10 0.05 0.02
33
+ python3 sparse_transformer_v6.py --text_path input.txt --steps 2000
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import argparse
39
+ import math
40
+ import random
41
+ from typing import Dict, List, Literal, Optional, Tuple
42
+
43
+ import torch
44
+ torch.set_num_threads(1)
45
+ import torch.nn as nn
46
+ import torch.nn.functional as F
47
+
48
+ Policy = Literal["predicted_magnitude", "oracle_current", "random"]
49
+
50
+
51
def set_seed(seed: int) -> None:
    """Seed the torch (CPU and, when present, all CUDA) and Python RNGs."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
56
+
57
+
58
def device() -> str:
    """Preferred compute device: "cuda" when available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
60
+
61
+
62
+ # -----------------------------
63
+ # Data
64
+ # -----------------------------
65
+
66
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Deterministically generate a newline-terminated synthetic corpus.

    Six templated sentence kinds mix fixed vocab lists; the same
    (n_sentences, seed) pair always yields the same text.
    """
    rng = random.Random(seed)
    names = ["ada", "turing", "grace", "lovelace", "noether", "shannon", "hopper", "gauss"]
    verbs = ["builds", "tests", "traces", "compresses", "predicts", "routes", "writes", "measures"]
    objects = ["signals", "gradients", "tokens", "circuits", "features", "masks", "errors", "states"]
    adverbs = ["quietly", "boldly", "slowly", "quickly", "cleanly", "strangely", "carefully"]
    clauses = [
        "when the loss falls",
        "after the mask shifts",
        "before the model answers",
        "while the signal drifts",
        "if the pattern repeats",
        "because the tail is noisy",
    ]
    symbols = ["alpha", "beta", "gamma", "delta", "omega", "sigma"]

    def sentence() -> str:
        # NOTE: the rng call order per branch matches the original generator,
        # so identical seeds reproduce identical corpora.
        kind = rng.randrange(6)
        if kind == 0:
            return f"{rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} {rng.choice(adverbs)}."
        if kind == 1:
            return f"{rng.choice(clauses)}, {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)}."
        if kind == 2:
            a, b = rng.sample(symbols, 2)
            return (
                f"rule {a}: {rng.choice(objects)} -> {rng.choice(objects)}; "
                f"rule {b}: {rng.choice(objects)} -> {rng.choice(objects)}."
            )
        if kind == 3:
            return f"the {rng.choice(objects)} {rng.choice(verbs)} the {rng.choice(objects)} {rng.choice(adverbs)}."
        if kind == 4:
            seq = " ".join(rng.choice(symbols) for _ in range(rng.randint(2, 7)))
            return f"sequence {seq} ends when {rng.choice(names)} {rng.choice(verbs)}."
        return f"if {rng.choice(objects)} rise then {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} else wait."

    return "\n".join(sentence() for _ in range(n_sentences)) + "\n"
101
+
102
+
103
class CharCorpus:
    """Character-level corpus with a 90/10 train/val split and random batching."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device

        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        cut = int(0.9 * len(encoded))
        self.train_data = encoded[:cut]
        self.val_data = encoded[cut:]

    def get_batch(self, split: str, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample (inputs, next-char targets) of shape [batch, block_size]."""
        source = self.train_data if split == "train" else self.val_data
        max_start = len(source) - self.block_size - 1
        if max_start <= 0:
            raise ValueError("Corpus too small for block_size")
        starts = torch.randint(max_start, (batch_size,))
        x = torch.stack([source[s : s + self.block_size] for s in starts])
        y = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return x.to(self.device), y.to(self.device)
126
+
127
+
128
def load_text(args: argparse.Namespace) -> str:
    """Return the training text: file contents when --text_path is set,
    otherwise a freshly generated synthetic corpus."""
    if not args.text_path:
        return make_synthetic_corpus(args.synthetic_sentences, args.seed)
    with open(args.text_path, "r", encoding="utf-8") as f:
        return f.read()
133
+
134
+
135
+ # -----------------------------
136
+ # Mini GPT
137
+ # -----------------------------
138
+
139
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with an explicit triangular mask."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        self.c_proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch, seq, width = x.shape
        # One fused projection yields q, k, v side by side.
        q, k, v = self.c_attn(x).split(width, dim=2)

        head_shape = (batch, seq, self.n_head, self.head_dim)
        q = q.view(*head_shape).transpose(1, 2)
        k = k.view(*head_shape).transpose(1, 2)
        v = v.view(*head_shape).transpose(1, 2)

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :seq, :seq] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))

        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, width)
        return self.c_proj(mixed)
164
+
165
+
166
class FeedForward(nn.Module):
    """Two-layer position-wise MLP (4x expansion, GELU) with output dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = nn.Linear(n_embd, 4 * n_embd)
        self.c_proj = nn.Linear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
175
+
176
+
177
class Block(nn.Module):
    """Pre-norm transformer block: residual attention, then residual MLP."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
189
+
190
+
191
class MiniGPT(nn.Module):
    """Minimal GPT-style LM: embeddings + dropout, block stack, LayerNorm,
    linear head. Returns (logits, loss) where loss is None without targets."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        stack = [Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*stack)
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        _, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.drop(self.tok_emb(idx) + self.pos_emb(positions)[None, :, :])
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)

        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
214
+
215
+
216
def named_linear_modules(model: nn.Module) -> List[Tuple[str, nn.Linear]]:
    """Return (qualified_name, module) pairs for every nn.Linear in *model*."""
    found: List[Tuple[str, nn.Linear]] = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            found.append((name, module))
    return found
218
+
219
+
220
+ # -----------------------------
221
+ # Mask selector
222
+ # -----------------------------
223
+
224
class RowMasker:
    """Selects which Linear output rows ("blocks") stay trainable each step.

    Policies: "predicted_magnitude" (top-k by EMA of observed gradient mass,
    plus an exploration quota), "oracle_current" (top-k by the true current
    gradient mass, audit upper bound), and "random" (control). During warmup
    every row is active. The EMA is updated strictly from rows that were
    active/observed, so the practical selector never peeks at frozen rows.
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy,
        active_fraction: float,
        explore_fraction: float,
        mass_beta: float,
        unobserved_decay: float,
        warmup_steps: int,
        device: str,
    ):
        self.model = model
        self.policy = policy
        self.active_fraction = active_fraction
        self.explore_fraction = explore_fraction
        # EMA coefficient for observed gradient mass.
        self.mass_beta = mass_beta
        # Multiplicative decay applied to ALL predicted masses each step,
        # so stale estimates for unobserved rows fade over time.
        self.unobserved_decay = unobserved_decay
        self.warmup_steps = warmup_steps
        self.device = device

        # One global block id per output row of every nn.Linear in the model.
        self.linear_modules = [m for _, m in named_linear_modules(model)]
        self.module_to_ids: Dict[nn.Linear, torch.Tensor] = {}
        ids = []
        offset = 0
        for m in self.linear_modules:
            n = m.weight.shape[0]
            block_ids = torch.arange(offset, offset + n, device=device)
            self.module_to_ids[m] = block_ids
            ids.append(block_ids)
            offset += n
        self.n_blocks = offset

        # Start optimistic (all ones) so every row can win selection early.
        self.predicted_mass = torch.ones(self.n_blocks, device=device)
        self.prev_active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.row_masks: Dict[nn.Linear, torch.Tensor] = {m: torch.zeros(m.weight.shape[0], dtype=torch.bool, device=device) for m in self.linear_modules}

    def _topk_mask(self, values: torch.Tensor, fraction: float) -> torch.Tensor:
        """Boolean mask of the top `fraction` entries of *values* (at least 1)."""
        k = max(1, int(fraction * values.numel()))
        mask = torch.zeros_like(values, dtype=torch.bool)
        mask[torch.topk(values, k=k).indices] = True
        return mask

    @staticmethod
    def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
        """Jaccard overlap of two boolean masks (0.0 when both are empty)."""
        inter = (a & b).sum().float()
        union = (a | b).sum().float()
        return float((inter / torch.clamp(union, min=1.0)).item())

    def _set_active(self, active: torch.Tensor) -> None:
        """Install a new global active mask and rebuild per-module row masks."""
        self.active = active
        self.row_masks = {}
        for m, ids in self.module_to_ids.items():
            self.row_masks[m] = active[ids]

    def choose_pre_backward(self, step: int) -> None:
        """Pick the active set before backward (policies that don't need
        current gradients); oracle defers selection to audit_and_update."""
        if step < self.warmup_steps:
            self._set_active(torch.ones(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "oracle_current":
            # Cannot select until after current gradients are known.
            self._set_active(torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        k_total = max(1, int(self.active_fraction * self.n_blocks))

        if self.policy == "random":
            active = torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device)
            active[torch.randperm(self.n_blocks, device=self.device)[:k_total]] = True
            self._set_active(active)
            return

        if self.policy != "predicted_magnitude":
            raise ValueError(f"Unknown policy: {self.policy}")

        # Split the budget into exploit (top predicted mass) + explore (random).
        k_explore = min(k_total, max(0, int(self.explore_fraction * k_total)))
        k_exploit = k_total - k_explore
        active = torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device)

        # Tiny jitter breaks ties among equal predicted masses.
        scores = self.predicted_mass + 1e-8 * torch.rand_like(self.predicted_mass)
        if k_exploit > 0:
            active[torch.topk(scores, k=k_exploit).indices] = True
        if k_explore > 0:
            remaining = torch.nonzero(~active, as_tuple=False).flatten()
            active[remaining[torch.randperm(remaining.numel(), device=self.device)[:k_explore]]] = True
        self._set_active(active)

    @torch.no_grad()
    def current_gradient_mass(self) -> torch.Tensor:
        """Per-row gradient L2 norm (weight row + bias entry) for all blocks."""
        mass = torch.zeros(self.n_blocks, device=self.device)
        for m, ids in self.module_to_ids.items():
            if m.weight.grad is None:
                continue
            row_sq = m.weight.grad.square().sum(dim=1)
            if m.bias is not None and m.bias.grad is not None:
                row_sq = row_sq + m.bias.grad.square()
            # 1e-30 keeps sqrt differentiable-safe / avoids exact zeros.
            mass[ids] = torch.sqrt(row_sq + 1e-30)
        return mass

    @torch.no_grad()
    def audit_and_update(self, step: int) -> Dict[str, float]:
        """After backward: finalize the active set (oracle/warmup), compute
        audit metrics against the full gradient, and update the EMA from
        observed rows only. Returns a dict of scalar metrics."""
        mass = self.current_gradient_mass()

        if step < self.warmup_steps:
            active = torch.ones(self.n_blocks, dtype=torch.bool, device=self.device)
            self._set_active(active)
        elif self.policy == "oracle_current":
            active = self._topk_mask(mass, self.active_fraction)
            self._set_active(active)
        else:
            active = self.active

        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())
        # With zero inactive blocks and active blocks using true gradient, cosine == norm ratio.
        norm_ratio = cosine

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)
        stability = self._jaccard(active, self.prev_active)
        self.prev_active = active.clone()

        # Heavy-tail diagnostic: gradient mass share of the top 20% of rows.
        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        # Strict rule: do not update stats from inactive full gradients.
        self.predicted_mass.mul_(self.unobserved_decay)
        observed = active
        self.predicted_mass[observed] = (
            self.mass_beta * self.predicted_mass[observed]
            + (1.0 - self.mass_beta) * mass[observed]
        )

        return {
            "cosine": cosine,
            "norm_ratio": norm_ratio,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
            "stability": stability,
            "active_fraction_real": float(active.float().mean().item()),
        }

    def row_mask_for(self, module: nn.Linear) -> Optional[torch.Tensor]:
        """Current boolean row mask for *module* (None if it is unknown)."""
        return self.row_masks.get(module)
372
+
373
+
374
+ # -----------------------------
375
+ # Masked Adam
376
+ # -----------------------------
377
+
378
class MaskedAdam:
    """Adam variant that leaves masked-out Linear rows — and their moments — untouched.

    When a RowMasker is attached, only active rows of each Linear receive the
    update and advance their Adam state; everything else is frozen in place.
    """

    def __init__(self, model: nn.Module, masker: Optional[RowMasker], lr: float, betas=(0.9, 0.95), eps=1e-8, weight_decay=0.0):
        self.model = model
        self.masker = masker
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        # Lazily created first/second moment buffers, keyed by parameter.
        self.state: Dict[nn.Parameter, Dict[str, torch.Tensor]] = {}
        # Maps each Linear weight/bias parameter back to its module and role.
        self.linear_param: Dict[nn.Parameter, Tuple[nn.Linear, str]] = {}
        for _, lin in named_linear_modules(model):
            self.linear_param[lin.weight] = (lin, "weight")
            if lin.bias is not None:
                self.linear_param[lin.bias] = (lin, "bias")

    def zero_grad(self) -> None:
        """Drop gradients entirely rather than zero-filling them."""
        for param in self.model.parameters():
            param.grad = None

    @torch.no_grad()
    def step(self) -> None:
        """Apply one Adam update, restricted to active rows where a mask exists."""
        for param in self.model.parameters():
            grad = param.grad
            if grad is None:
                continue
            if param not in self.state:
                self.state[param] = {"m": torch.zeros_like(param), "v": torch.zeros_like(param)}
            exp_avg = self.state[param]["m"]
            exp_avg_sq = self.state[param]["v"]
            if self.weight_decay:
                grad = grad.add(param, alpha=self.weight_decay)

            broadcast_mask = None
            if self.masker is not None and param in self.linear_param:
                module, role = self.linear_param[param]
                rows = self.masker.row_mask_for(module)
                if rows is not None:
                    # Weights broadcast the row mask over trailing dims; biases use it directly.
                    broadcast_mask = rows if role == "bias" else rows.view(-1, *([1] * (param.ndim - 1)))

            if broadcast_mask is None:
                # Dense path: ordinary Adam (no bias correction in this minimal variant).
                exp_avg.mul_(self.beta1).add_(grad, alpha=1.0 - self.beta1)
                exp_avg_sq.mul_(self.beta2).addcmul_(grad, grad, value=1.0 - self.beta2)
                param.add_(exp_avg / (torch.sqrt(exp_avg_sq) + self.eps), alpha=-self.lr)
                continue

            elem_mask = broadcast_mask.expand_as(param)
            if not bool(elem_mask.any().item()):
                continue
            # Moments advance only on active elements; inactive rows keep stale state.
            cand_m = self.beta1 * exp_avg + (1.0 - self.beta1) * grad
            cand_v = self.beta2 * exp_avg_sq + (1.0 - self.beta2) * grad * grad
            exp_avg[elem_mask] = cand_m[elem_mask]
            exp_avg_sq[elem_mask] = cand_v[elem_mask]
            delta = exp_avg / (torch.sqrt(exp_avg_sq) + self.eps)
            param[elem_mask] = param[elem_mask] - self.lr * delta[elem_mask]
431
+
432
+
433
+ # -----------------------------
434
+ # Training
435
+ # -----------------------------
436
+
437
@torch.no_grad()
def estimate_loss(model: nn.Module, corpus: CharCorpus, batch_size: int, eval_iters: int) -> Dict[str, float]:
    """Mean train/val loss over eval_iters random batches.

    Temporarily switches the model to eval mode and restores train mode before
    returning.
    """
    model.eval()
    results: Dict[str, float] = {}
    for split in ("train", "val"):
        total = 0.0
        for _ in range(eval_iters):
            xb, yb = corpus.get_batch(split, batch_size)
            _, loss = model(xb, yb)
            total += float(loss.item())
        results[split] = total / eval_iters
    model.train()
    return results
450
+
451
+
452
def train_run(corpus: CharCorpus, args: argparse.Namespace, policy: Optional[Policy], active_fraction: float, seed_offset: int) -> Dict[str, float | str]:
    """Train one model configuration and return its summary metrics row.

    policy=None runs the dense baseline (no masker); otherwise a RowMasker
    drives which Linear rows the MaskedAdam step may touch.
    """
    set_seed(args.seed + seed_offset)
    dev = corpus.device
    model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)

    masker = None
    if policy is not None:
        masker = RowMasker(
            model=model,
            policy=policy,
            active_fraction=active_fraction,
            explore_fraction=args.explore_fraction,
            mass_beta=args.mass_beta,
            unobserved_decay=args.unobserved_decay,
            warmup_steps=args.warmup_steps,
            device=dev,
        )
    opt = MaskedAdam(model, masker, lr=args.lr, weight_decay=args.weight_decay)

    # Running sums of per-step audit metrics, collected only after warmup.
    sums = {"cosine": 0.0, "norm_ratio": 0.0, "top20_mass": 0.0, "jacc_oracle": 0.0, "stability": 0.0, "active_fraction_real": 0.0}
    count = 0

    for step in range(args.steps):
        x, y = corpus.get_batch("train", args.batch_size)
        if masker is not None:
            # Select the active row set before gradients exist (oracle defers).
            masker.choose_pre_backward(step)
        _, loss = model(x, y)
        opt.zero_grad()
        loss.backward()
        if masker is not None:
            # Dense gradients are available here; audit selection quality and
            # update the masker's EMA statistics from active rows.
            metrics = masker.audit_and_update(step)
            if step >= args.warmup_steps:
                for k in sums:
                    sums[k] += metrics[k]
                count += 1
        opt.step()

        if args.verbose and (step % args.eval_interval == 0 or step == args.steps - 1):
            losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters)
            name = "dense" if policy is None else policy
            print(f"{name:20s} step={step:5d} train={losses['train']:.4f} val={losses['val']:.4f}")

    losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters)
    row: Dict[str, float | str] = {
        "run": "dense_baseline" if policy is None else policy,
        "target_active": 1.0 if policy is None else active_fraction,
        "train_loss": losses["train"],
        "val_loss": losses["val"],
    }
    if masker is None or count == 0:
        # Dense baseline (or no audited steps): mask metrics are undefined.
        row.update({"cosine": float("nan"), "norm_ratio": float("nan"), "top20_mass": float("nan"), "jacc_oracle": float("nan"), "stability": float("nan"), "active_fraction_real": 1.0})
    else:
        for k, v in sums.items():
            row[k] = v / count
    return row
507
+
508
+
509
def print_summary(rows: List[Dict[str, float | str]]) -> None:
    """Render the per-run metrics table to stdout."""
    print("\nSummary")
    header = f"{'run':>22s} {'target':>7s} {'actual':>7s} {'val':>8s} {'train':>8s} {'cos':>7s} {'top20':>7s} {'jacc':>7s} {'stable':>7s}"
    print(header)
    print("-" * len(header))
    for entry in rows:
        cells = [
            f"{str(entry['run']):>22s}",
            f"{float(entry['target_active']):7.3f}",
            f"{float(entry['active_fraction_real']):7.3f}",
            f"{float(entry['val_loss']):8.4f}",
            f"{float(entry['train_loss']):8.4f}",
            f"{float(entry['cosine']):7.3f}",
            f"{float(entry['top20_mass']):7.3f}",
            f"{float(entry['jacc_oracle']):7.3f}",
            f"{float(entry['stability']):7.3f}",
        ]
        print(" ".join(cells))
526
+
527
+
528
def parse_args() -> argparse.Namespace:
    """Build and parse the experiment's command-line interface."""
    parser = argparse.ArgumentParser()
    # Data.
    parser.add_argument("--text_path", type=str, default=None)
    parser.add_argument("--synthetic_sentences", type=int, default=12000)
    # Training schedule.
    parser.add_argument("--steps", type=int, default=1000)
    parser.add_argument("--quick", action="store_true")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--block_size", type=int, default=64)
    # Model size.
    parser.add_argument("--n_layer", type=int, default=2)
    parser.add_argument("--n_head", type=int, default=4)
    parser.add_argument("--n_embd", type=int, default=64)
    parser.add_argument("--dropout", type=float, default=0.0)
    # Optimizer.
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--weight_decay", type=float, default=0.0)
    # Sparse-selection policy knobs.
    parser.add_argument("--active_fractions", type=float, nargs="+", default=[0.10, 0.05, 0.02])
    parser.add_argument("--explore_fraction", type=float, default=0.10)
    parser.add_argument("--mass_beta", type=float, default=0.95)
    parser.add_argument("--unobserved_decay", type=float, default=0.999)
    parser.add_argument("--warmup_steps", type=int, default=0)
    # Evaluation and reproducibility.
    parser.add_argument("--eval_interval", type=int, default=200)
    parser.add_argument("--eval_iters", type=int, default=20)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--verbose", action="store_true")
    return parser.parse_args()
552
+
553
+
554
def main() -> None:
    """Entry point: run the dense baseline plus the policy x active-fraction sweep."""
    args = parse_args()
    if args.quick:
        # Shrink every dimension of the experiment for a fast smoke test.
        args.steps = 60
        args.eval_iters = 3
        args.batch_size = 16
        args.block_size = 32
        args.n_layer = 1
        args.n_embd = 32
        args.n_head = 4
        args.synthetic_sentences = 2000
        args.active_fractions = [0.10, 0.02]

    set_seed(args.seed)
    dev = device()
    print(f"device={dev}")
    corpus = CharCorpus(load_text(args), args.block_size, dev)
    print(f"vocab_size={corpus.vocab_size} train_tokens={len(corpus.train_data)} val_tokens={len(corpus.val_data)}")
    print(f"warmup_steps={args.warmup_steps} explore_fraction={args.explore_fraction}")

    rows: List[Dict[str, float | str]] = []
    print("\nRunning dense baseline")
    rows.append(train_run(corpus, args, policy=None, active_fraction=1.0, seed_offset=0))

    # Distinct seed offsets keep each sweep cell independent but reproducible.
    seed_offset = 100
    for af in args.active_fractions:
        for policy in ["oracle_current", "predicted_magnitude", "random"]:
            print(f"\nRunning policy={policy}, active_fraction={af:.3f}")
            rows.append(train_run(corpus, args, policy=policy, active_fraction=af, seed_offset=seed_offset))
            seed_offset += 1

    print_summary(rows)

    print("\nNotes")
    print(" oracle_current uses the current full gradient to choose rows; it is an upper bound, not a practical selector.")
    print(" predicted_magnitude chooses from EMA mass only, plus a small random exploration budget.")
    print(" EMA mass is updated only for active/observed rows, not all rows.")
    print(" inactive Linear rows are frozen by MaskedAdam, including Adam state; zero grad alone is not enough.")
    print(" dense gradients are still computed for audit, so this is not a wall-clock speed benchmark yet.")
587
+ print("\nNotes")
588
+ print(" oracle_current uses the current full gradient to choose rows; it is an upper bound, not a practical selector.")
589
+ print(" predicted_magnitude chooses from EMA mass only, plus a small random exploration budget.")
590
+ print(" EMA mass is updated only for active/observed rows, not all rows.")
591
+ print(" inactive Linear rows are frozen by MaskedAdam, including Adam state; zero grad alone is not enough.")
592
+ print(" dense gradients are still computed for audit, so this is not a wall-clock speed benchmark yet.")
593
+
594
+
595
+ if __name__ == "__main__":
596
+ main()
experiments/sparse_transformer_v7.py ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v7: discovery stress tests for stable gradient-support masks.
3
+
4
+ This version follows the v6 result:
5
+
6
+ oracle_current works far better than random, so useful sparse support exists;
7
+ predicted_magnitude without warmup does not reliably discover that support.
8
+
9
+ v7 focuses on discovery mechanisms:
10
+
11
+ 1. predicted_magnitude
12
+ Exploit rows with the largest EMA-observed gradient mass.
13
+
14
+ 2. ucb_magnitude
15
+ A bandit-style selector: EMA mass + an uncertainty bonus for under-observed rows.
16
+ This is meant to discover useful rows without dense refresh.
17
+
18
+ First observation initializes EMA scale immediately.
19
+
20
+ 3. stale_current
21
+ A renamed diagnostic control: use the previous full-gradient mass. It is not
22
+ practical because it relies on dense audit gradients, but it tells us whether
23
+ one-step lag is too noisy.
24
+
25
+ 4. oracle_current
26
+ True current top-k by dense gradient mass. Upper bound only.
27
+
28
+ 5. random
29
+ Control.
30
+
31
+ Important limitation
32
+ --------------------
33
+ This still calls loss.backward(), so PyTorch computes dense gradients. Dense
34
+ current gradients are used for audit metrics and for oracle/stale controls.
35
+ The practical selectors only update their EMA statistics from active rows.
36
+ Actual speedup would require structured partial backward/custom kernels.
37
+
38
+ Example runs
39
+ ------------
40
+ Smoke test:
41
+ python3 sparse_transformer_v7.py --quick
42
+
43
+ No-warmup discovery test:
44
+ python3 sparse_transformer_v7.py --steps 1000 \
45
+ --active_fractions 0.10 0.05 0.02 \
46
+ --policies predicted_magnitude ucb_magnitude oracle_current random \
47
+ --warmup_steps_list 0 5 50 --explore_fractions 0.10 0.30
48
+
49
+ Warm-start separation test:
50
+ python3 sparse_transformer_v7.py --steps 1000 \
51
+ --active_fractions 0.10 0.05 0.02 \
52
+ --policies predicted_magnitude ucb_magnitude oracle_current random \
53
+ --warmup_steps_list 0 5 50 200 --explore_fractions 0.10
54
+ """
55
+
56
+ from __future__ import annotations
57
+
58
+ import argparse
59
+ import math
60
+ import random
61
+ from typing import Dict, List, Literal, Optional, Tuple
62
+
63
+ import torch
64
+
65
+ torch.set_num_threads(1)
66
+ import torch.nn as nn
67
+ import torch.nn.functional as F
68
+
69
+ Policy = Literal["predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"]
70
+
71
+
72
def set_seed(seed: int) -> None:
    """Make the python and torch RNG streams reproducible for this run."""
    random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(seed)
77
+
78
+
79
def device() -> str:
    """Preferred compute device name: CUDA when available, otherwise CPU."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
81
+
82
+
83
+ # -----------------------------
84
+ # Data
85
+ # -----------------------------
86
+
87
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Generate a deterministic synthetic character corpus of templated sentences.

    Six sentence templates are sampled uniformly; the RNG is local so the
    output depends only on (n_sentences, seed).
    """
    rng = random.Random(seed)
    names = ["ada", "turing", "grace", "lovelace", "noether", "shannon", "hopper", "gauss"]
    verbs = ["builds", "tests", "traces", "compresses", "predicts", "routes", "writes", "measures"]
    objects = ["signals", "gradients", "tokens", "circuits", "features", "masks", "errors", "states"]
    adverbs = ["quietly", "boldly", "slowly", "quickly", "cleanly", "strangely", "carefully"]
    clauses = [
        "when the loss falls",
        "after the mask shifts",
        "before the model answers",
        "while the signal drifts",
        "if the pattern repeats",
        "because the tail is noisy",
    ]
    symbols = ["alpha", "beta", "gamma", "delta", "omega", "sigma"]

    sentences: List[str] = []
    for _ in range(n_sentences):
        template = rng.randrange(6)
        if template == 0:
            sentence = f"{rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} {rng.choice(adverbs)}."
        elif template == 1:
            sentence = f"{rng.choice(clauses)}, {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)}."
        elif template == 2:
            sym_a, sym_b = rng.sample(symbols, 2)
            sentence = f"rule {sym_a}: {rng.choice(objects)} -> {rng.choice(objects)}; rule {sym_b}: {rng.choice(objects)} -> {rng.choice(objects)}."
        elif template == 3:
            sentence = f"the {rng.choice(objects)} {rng.choice(verbs)} the {rng.choice(objects)} {rng.choice(adverbs)}."
        elif template == 4:
            run = " ".join(rng.choice(symbols) for _ in range(rng.randint(2, 7)))
            sentence = f"sequence {run} ends when {rng.choice(names)} {rng.choice(verbs)}."
        else:
            sentence = f"if {rng.choice(objects)} rise then {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} else wait."
        sentences.append(sentence)
    return "\n".join(sentences) + "\n"
122
+
123
+
124
class CharCorpus:
    """Character-level language-modeling dataset with a 90/10 train/val split."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device

        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        cut = int(0.9 * len(encoded))
        self.train_data = encoded[:cut]
        self.val_data = encoded[cut:]

    def get_batch(self, split: str, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample random (input, next-char target) windows from one split."""
        source = self.train_data if split == "train" else self.val_data
        max_start = len(source) - self.block_size - 1
        if max_start <= 0:
            raise ValueError("Corpus too small for block_size")
        starts = torch.randint(max_start, (batch_size,))
        # Targets are the inputs shifted one character to the right.
        x = torch.stack([source[s : s + self.block_size] for s in starts])
        y = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return x.to(self.device), y.to(self.device)
147
+
148
+
149
def load_text(args: argparse.Namespace) -> str:
    """Return the corpus text: a user-supplied file, or the synthetic generator."""
    if not args.text_path:
        return make_synthetic_corpus(args.synthetic_sentences, args.seed)
    with open(args.text_path, "r", encoding="utf-8") as f:
        return f.read()
154
+
155
+
156
+ # -----------------------------
157
+ # Mini GPT
158
+ # -----------------------------
159
+
160
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with an explicit lower-triangular causal mask."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        self.c_proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Buffer so the mask follows the module across devices without being a parameter.
        tril = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", tril.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape

        def split_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        q, k, v = (split_heads(part) for part in self.c_attn(x).split(C, dim=2))
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        # Future positions are masked to -inf before the softmax.
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        ctx = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(ctx)
185
+
186
+
187
class FeedForward(nn.Module):
    """Standard 4x-expansion GELU MLP with dropout on the output projection."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = nn.Linear(n_embd, 4 * n_embd)
        self.c_proj = nn.Linear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
196
+
197
+
198
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each behind a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
210
+
211
+
212
class MiniGPT(nn.Module):
    """Tiny GPT-style decoder-only model for character-level LM experiments."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None unless targets are supplied."""
        B, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        hidden = self.drop(self.tok_emb(idx) + self.pos_emb(positions)[None, :, :])
        hidden = self.ln_f(self.blocks(hidden))
        logits = self.lm_head(hidden)
        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
235
+
236
+
237
def named_linear_modules(model: nn.Module) -> List[Tuple[str, nn.Linear]]:
    """All nn.Linear submodules of *model*, paired with their qualified names."""
    found: List[Tuple[str, nn.Linear]] = []
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            found.append((name, module))
    return found
239
+
240
+
241
def parameter_fractions(model: nn.Module) -> Tuple[int, int, float]:
    """(total params, params inside Linear layers, linear share of total)."""
    total = sum(p.numel() for p in model.parameters())
    linear = sum(
        m.weight.numel() + (m.bias.numel() if m.bias is not None else 0)
        for _, m in named_linear_modules(model)
    )
    # max(1, total) guards against a parameter-free model.
    return total, linear, linear / max(1, total)
249
+
250
+
251
+ # -----------------------------
252
+ # Mask selector
253
+ # -----------------------------
254
+
255
class RowMasker:
    """Selects which output rows of each Linear layer are 'active' per step.

    Rows across all Linear layers are flattened into a single block index
    space; the chosen policy picks a boolean mask over that space before each
    backward pass, and audit_and_update() scores the pick against the dense
    gradient afterwards.

    Fix vs previous revision: the local `ids` accumulator in __init__ was dead
    code (built but never read) and has been removed.
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy,
        active_fraction: float,
        explore_fraction: float,
        mass_beta: float,
        unobserved_decay: float,
        warmup_steps: int,
        ucb_alpha: float,
        mass_init: float,
        device: str,
    ):
        self.model = model
        self.policy = policy
        self.active_fraction = active_fraction
        self.explore_fraction = explore_fraction
        self.mass_beta = mass_beta
        self.unobserved_decay = unobserved_decay
        self.warmup_steps = warmup_steps
        self.ucb_alpha = ucb_alpha
        self.mass_init = mass_init
        self.device = device
        self.step_index = 0

        # Assign each Linear's output rows a contiguous range of global block ids.
        self.linear_modules = [m for _, m in named_linear_modules(model)]
        self.module_to_ids: Dict[nn.Linear, torch.Tensor] = {}
        offset = 0
        for m in self.linear_modules:
            n = m.weight.shape[0]
            self.module_to_ids[m] = torch.arange(offset, offset + n, device=device)
            offset += n
        self.n_blocks = offset

        # Per-row statistics: EMA of observed gradient mass, last dense mass
        # (diagnostic only), observation counts, and a global scale EMA for UCB.
        self.predicted_mass = torch.full((self.n_blocks,), mass_init, device=device)
        self.last_full_mass = torch.full((self.n_blocks,), mass_init, device=device)
        self.observed_count = torch.zeros(self.n_blocks, device=device)
        self.global_mass_ema = torch.tensor(max(mass_init, 1e-6), device=device)

        self.prev_active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.row_masks: Dict[nn.Linear, torch.Tensor] = {
            m: torch.zeros(m.weight.shape[0], dtype=torch.bool, device=device) for m in self.linear_modules
        }

    def _topk_mask(self, values: torch.Tensor, fraction: float) -> torch.Tensor:
        """Boolean mask selecting the top `fraction` of entries by value."""
        k = max(1, int(fraction * values.numel()))
        mask = torch.zeros_like(values, dtype=torch.bool)
        # Tie-breaking noise matters when many rows have identical initial scores.
        noisy = values + 1e-9 * torch.rand_like(values)
        mask[torch.topk(noisy, k=k).indices] = True
        return mask

    @staticmethod
    def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
        """Jaccard similarity of two boolean masks (empty union -> 0)."""
        inter = (a & b).sum().float()
        union = (a | b).sum().float()
        return float((inter / torch.clamp(union, min=1.0)).item())

    def _set_active(self, active: torch.Tensor) -> None:
        """Install a global active mask and slice it into per-module row masks."""
        self.active = active
        self.row_masks = {}
        for m, ids in self.module_to_ids.items():
            self.row_masks[m] = active[ids]

    def _sample_exploit_explore(self, scores: torch.Tensor) -> torch.Tensor:
        """Top-k by score for the exploit budget, uniform random for the rest."""
        n = self.n_blocks
        k_total = max(1, int(self.active_fraction * n))
        k_explore = min(k_total, max(0, int(self.explore_fraction * k_total)))
        k_exploit = k_total - k_explore
        active = torch.zeros(n, dtype=torch.bool, device=self.device)

        if k_exploit > 0:
            active[torch.topk(scores + 1e-9 * torch.rand_like(scores), k=k_exploit).indices] = True
        if k_explore > 0:
            remaining = torch.nonzero(~active, as_tuple=False).flatten()
            pick = remaining[torch.randperm(remaining.numel(), device=self.device)[:k_explore]]
            active[pick] = True
        return active

    def choose_pre_backward(self, step: int) -> None:
        """Pick the active set before gradients exist, per the configured policy."""
        self.step_index = step
        if step < self.warmup_steps:
            # Warmup: train densely so every row accumulates statistics.
            self._set_active(torch.ones(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "oracle_current":
            # Cannot select until after current gradients are known.
            self._set_active(torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "random":
            self._set_active(self._sample_exploit_explore(torch.rand(self.n_blocks, device=self.device)))
            return

        if self.policy == "stale_current":
            # Diagnostic control: last step's dense mass (requires dense audit).
            self._set_active(self._topk_mask(self.last_full_mass, self.active_fraction))
            return

        if self.policy == "predicted_magnitude":
            self._set_active(self._sample_exploit_explore(self.predicted_mass))
            return

        if self.policy == "ucb_magnitude":
            # EMA mass plus an uncertainty bonus that shrinks with observations.
            t = max(1, step - self.warmup_steps + 1)
            log_term = torch.log(torch.tensor(float(t + 2), device=self.device))
            bonus_scale = torch.clamp(self.global_mass_ema, min=1e-8)
            bonus = self.ucb_alpha * bonus_scale * torch.sqrt(log_term / (self.observed_count + 1.0))
            scores = self.predicted_mass + bonus
            self._set_active(self._sample_exploit_explore(scores))
            return

        raise ValueError(f"Unknown policy: {self.policy}")

    @torch.no_grad()
    def current_gradient_mass(self) -> torch.Tensor:
        """Per-row gradient L2 norms across all tracked Linear layers."""
        mass = torch.zeros(self.n_blocks, device=self.device)
        for m, ids in self.module_to_ids.items():
            if m.weight.grad is None:
                continue
            row_sq = m.weight.grad.square().sum(dim=1)
            if m.bias is not None and m.bias.grad is not None:
                row_sq = row_sq + m.bias.grad.square()
            mass[ids] = torch.sqrt(row_sq + 1e-30)
        return mass

    @torch.no_grad()
    def audit_and_update(self, step: int) -> Dict[str, float]:
        """Score the active set against the dense gradient and refresh statistics."""
        mass = self.current_gradient_mass()

        if step < self.warmup_steps:
            active = torch.ones(self.n_blocks, dtype=torch.bool, device=self.device)
            self._set_active(active)
        elif self.policy == "oracle_current":
            active = self._topk_mask(mass, self.active_fraction)
            self._set_active(active)
        else:
            active = self.active

        # Approximation quality of the masked gradient vs the dense one.
        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())
        norm_ratio = cosine

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)
        stability = self._jaccard(active, self.prev_active)
        self.prev_active = active.clone()

        # Concentration: share of mass in the top 20% of rows.
        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        new_active = active & (self.observed_count == 0)

        # Strict rule for practical policies: update stats only from active rows.
        # oracle_current and stale_current also update only active rows for consistency;
        # stale_current separately records last_full_mass as a diagnostic signal.
        self.predicted_mass.mul_(self.unobserved_decay)
        observed = active
        if bool(observed.any().item()):
            obs_mass = mass[observed]
            first_seen = self.observed_count[observed] == 0
            ema_mass = self.mass_beta * self.predicted_mass[observed] + (1.0 - self.mass_beta) * obs_mass
            # First observation should establish the real scale immediately.
            # Otherwise a beta=0.95 EMA needs many observations to climb from zero.
            self.predicted_mass[observed] = torch.where(first_seen, obs_mass, ema_mass)
            self.observed_count[observed] += 1.0
            self.global_mass_ema = self.mass_beta * self.global_mass_ema + (1.0 - self.mass_beta) * obs_mass.mean()

        # Dense audit signal. Only stale_current is allowed to use this for next-step selection.
        self.last_full_mass = mass.detach().clone()

        coverage = float((self.observed_count > 0).float().mean().item())
        avg_obs_count = float(self.observed_count.mean().item())
        new_active_fraction = float((new_active.float().mean()).item())

        return {
            "cosine": cosine,
            "norm_ratio": norm_ratio,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
            "stability": stability,
            "active_fraction_real": float(active.float().mean().item()),
            "coverage": coverage,
            "avg_obs_count": avg_obs_count,
            "new_active_fraction": new_active_fraction,
        }

    def row_mask_for(self, module: nn.Linear) -> Optional[torch.Tensor]:
        """Boolean row mask for *module*, or None when untracked."""
        return self.row_masks.get(module)
+ return self.row_masks.get(module)
450
+
451
+
452
+ # -----------------------------
453
+ # Masked Adam
454
+ # -----------------------------
455
+
456
class MaskedAdam:
    """Adam that freezes weights and optimizer state on masked-out Linear rows.

    With a RowMasker attached, only active rows of each Linear receive an
    update and advance their moment buffers; inactive rows keep stale state.
    Optionally also freezes all non-Linear parameters during sparse runs.
    """

    def __init__(
        self,
        model: nn.Module,
        masker: Optional[RowMasker],
        lr: float,
        betas=(0.9, 0.95),
        eps=1e-8,
        weight_decay=0.0,
        freeze_non_linear_when_sparse: bool = False,
    ):
        self.model = model
        self.masker = masker
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.freeze_non_linear_when_sparse = freeze_non_linear_when_sparse
        # Lazily created per-parameter moment buffers.
        self.state: Dict[nn.Parameter, Dict[str, torch.Tensor]] = {}
        # Maps each Linear weight/bias parameter back to (module, role).
        self.linear_param: Dict[nn.Parameter, Tuple[nn.Linear, str]] = {}
        for _, m in named_linear_modules(model):
            self.linear_param[m.weight] = (m, "weight")
            if m.bias is not None:
                self.linear_param[m.bias] = (m, "bias")

    def zero_grad(self) -> None:
        """Drop gradients entirely rather than zero-filling them."""
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self) -> None:
        """Apply one Adam update, restricted to active rows where a mask exists."""
        for p in self.model.parameters():
            if p.grad is None:
                continue
            if self.masker is not None and self.freeze_non_linear_when_sparse and p not in self.linear_param:
                # Optional stricter mode: freeze embeddings/layernorm/etc. in sparse runs.
                continue

            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            m = self.state[p]["m"]
            v = self.state[p]["v"]
            g = p.grad
            if self.weight_decay:
                g = g.add(p, alpha=self.weight_decay)

            # Resolve a broadcastable row mask for Linear weights/biases, if any.
            row_mask = None
            if self.masker is not None and p in self.linear_param:
                module, kind = self.linear_param[p]
                base = self.masker.row_mask_for(module)
                if base is not None:
                    row_mask = base.view(-1, *([1] * (p.ndim - 1))) if kind == "weight" else base

            if row_mask is None:
                # Dense path: ordinary Adam (no bias correction in this minimal variant).
                m.mul_(self.beta1).add_(g, alpha=1.0 - self.beta1)
                v.mul_(self.beta2).addcmul_(g, g, value=1.0 - self.beta2)
                p.add_(m / (torch.sqrt(v) + self.eps), alpha=-self.lr)
            else:
                mask = row_mask.expand_as(p)
                if not bool(mask.any().item()):
                    continue
                # Moments advance only on masked-in elements; the rest stay frozen.
                new_m = self.beta1 * m + (1.0 - self.beta1) * g
                new_v = self.beta2 * v + (1.0 - self.beta2) * g * g
                m[mask] = new_m[mask]
                v[mask] = new_v[mask]
                update = m / (torch.sqrt(v) + self.eps)
                p[mask] = p[mask] - self.lr * update[mask]
+
524
+
525
+ # -----------------------------
526
+ # Training
527
+ # -----------------------------
528
+
529
@torch.no_grad()
def estimate_loss(model: nn.Module, corpus: CharCorpus, batch_size: int, eval_iters: int) -> Dict[str, float]:
    """Average the model's loss over eval_iters random batches of each split."""
    model.eval()
    result: Dict[str, float] = {}
    for split in ("train", "val"):
        total = 0.0
        for _ in range(eval_iters):
            xb, yb = corpus.get_batch(split, batch_size)
            _, batch_loss = model(xb, yb)
            total += float(batch_loss.item())
        result[split] = total / eval_iters
    model.train()
    return result
542
+
543
+
544
def train_run(
    corpus: CharCorpus,
    args: argparse.Namespace,
    policy: Optional[Policy],
    active_fraction: float,
    warmup_steps: int,
    explore_fraction: float,
    seed_offset: int,
) -> Dict[str, float | str]:
    """Train one configuration and return its metrics row for the summary table.

    policy=None runs the dense baseline; otherwise a RowMasker drives which
    Linear rows MaskedAdam is allowed to update each step.
    """
    set_seed(args.seed + seed_offset)
    dev = corpus.device
    model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)

    masker = None
    if policy is not None:
        masker = RowMasker(
            model=model,
            policy=policy,
            active_fraction=active_fraction,
            explore_fraction=explore_fraction,
            mass_beta=args.mass_beta,
            unobserved_decay=args.unobserved_decay,
            warmup_steps=warmup_steps,
            ucb_alpha=args.ucb_alpha,
            mass_init=args.mass_init,
            device=dev,
        )
    opt = MaskedAdam(
        model,
        masker,
        lr=args.lr,
        weight_decay=args.weight_decay,
        freeze_non_linear_when_sparse=args.freeze_non_linear_when_sparse,
    )

    # Running sums of per-step audit metrics; averaged over non-warmup steps.
    sums = {
        "cosine": 0.0,
        "norm_ratio": 0.0,
        "top20_mass": 0.0,
        "jacc_oracle": 0.0,
        "stability": 0.0,
        "active_fraction_real": 0.0,
        "coverage": 0.0,
        "avg_obs_count": 0.0,
        "new_active_fraction": 0.0,
    }
    count = 0

    for step in range(args.steps):
        x, y = corpus.get_batch("train", args.batch_size)
        if masker is not None:
            # Select the active row set before gradients exist (oracle defers).
            masker.choose_pre_backward(step)
        _, loss = model(x, y)
        opt.zero_grad()
        loss.backward()
        if masker is not None:
            # Audit the mask against the dense gradient and refresh EMA stats.
            metrics = masker.audit_and_update(step)
            if step >= warmup_steps:
                for k in sums:
                    sums[k] += metrics[k]
                count += 1
        opt.step()

        if args.verbose and (step % args.eval_interval == 0 or step == args.steps - 1):
            losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters)
            name = "dense" if policy is None else policy
            print(
                f"{name:20s} step={step:5d} warm={warmup_steps:4d} explore={explore_fraction:.2f} "
                f"train={losses['train']:.4f} val={losses['val']:.4f}"
            )

    losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters)
    row: Dict[str, float | str] = {
        "run": "dense_baseline" if policy is None else policy,
        "target_active": 1.0 if policy is None else active_fraction,
        "warmup": warmup_steps,
        "explore": explore_fraction if policy is not None else 0.0,
        "train_loss": losses["train"],
        "val_loss": losses["val"],
    }
    if masker is None or count == 0:
        # Dense baseline (or an all-warmup run): mask metrics are undefined.
        row.update({
            "cosine": float("nan"),
            "norm_ratio": float("nan"),
            "top20_mass": float("nan"),
            "jacc_oracle": float("nan"),
            "stability": float("nan"),
            "active_fraction_real": 1.0,
            "coverage": float("nan"),
            "avg_obs_count": float("nan"),
            "new_active_fraction": float("nan"),
        })
    else:
        for k, v in sums.items():
            row[k] = v / count
    return row
640
+
641
+
642
def print_summary(rows: List[Dict[str, float | str]]) -> None:
    """Print a fixed-width table summarising every completed run."""
    print("\nSummary")
    header = (
        f"{'run':>22s} {'target':>7s} {'actual':>7s} {'warm':>5s} {'expl':>5s} "
        f"{'val':>8s} {'train':>8s} {'cos':>7s} {'top20':>7s} {'jacc':>7s} "
        f"{'stable':>7s} {'cover':>7s} {'new':>7s}"
    )
    print(header)
    print("-" * len(header))
    for entry in rows:
        cells = [
            f"{str(entry['run']):>22s}",
            f"{float(entry['target_active']):7.3f}",
            f"{float(entry['active_fraction_real']):7.3f}",
            f"{int(float(entry['warmup'])):5d}",
            f"{float(entry['explore']):5.2f}",
            f"{float(entry['val_loss']):8.4f}",
            f"{float(entry['train_loss']):8.4f}",
            f"{float(entry['cosine']):7.3f}",
            f"{float(entry['top20_mass']):7.3f}",
            f"{float(entry['jacc_oracle']):7.3f}",
            f"{float(entry['stability']):7.3f}",
            f"{float(entry['coverage']):7.3f}",
            f"{float(entry['new_active_fraction']):7.3f}",
        ]
        print(" ".join(cells))
667
+
668
+
669
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line interface for this experiment."""
    parser = argparse.ArgumentParser()
    # Data source
    parser.add_argument("--text_path", type=str, default=None)
    parser.add_argument("--synthetic_sentences", type=int, default=12000)
    # Training schedule
    parser.add_argument("--steps", type=int, default=1000)
    parser.add_argument("--quick", action="store_true")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--block_size", type=int, default=64)
    # Model size
    parser.add_argument("--n_layer", type=int, default=2)
    parser.add_argument("--n_head", type=int, default=4)
    parser.add_argument("--n_embd", type=int, default=64)
    parser.add_argument("--dropout", type=float, default=0.0)
    # Optimizer
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--weight_decay", type=float, default=0.0)
    # Sparsity sweep
    parser.add_argument("--active_fractions", type=float, nargs="+", default=[0.10, 0.05, 0.02])
    parser.add_argument("--policies", type=str, nargs="+", default=["oracle_current", "predicted_magnitude", "ucb_magnitude", "random"])
    parser.add_argument("--explore_fractions", type=float, nargs="+", default=[0.10])
    parser.add_argument("--warmup_steps_list", type=int, nargs="+", default=[5])
    parser.add_argument("--mass_beta", type=float, default=0.95)
    parser.add_argument("--unobserved_decay", type=float, default=1.0)
    parser.add_argument("--mass_init", type=float, default=0.0)
    parser.add_argument("--ucb_alpha", type=float, default=1.0)
    parser.add_argument("--freeze_non_linear_when_sparse", action="store_true")
    # Evaluation / reproducibility
    parser.add_argument("--eval_interval", type=int, default=200)
    parser.add_argument("--eval_iters", type=int, default=20)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--verbose", action="store_true")
    return parser.parse_args()
697
+
698
+
699
def main() -> None:
    """Run the dense baseline plus the full sparse-policy sweep and print a summary."""
    args = parse_args()
    if args.quick:
        # Shrink every dimension of the experiment for a fast smoke test.
        args.steps = 60
        args.eval_iters = 3
        args.batch_size = 16
        args.block_size = 32
        args.n_layer = 1
        args.n_embd = 32
        args.n_head = 4
        args.synthetic_sentences = 2000
        args.active_fractions = [0.10, 0.02]
        args.policies = ["oracle_current", "predicted_magnitude", "ucb_magnitude", "random"]
        args.explore_fractions = [0.10]
        args.warmup_steps_list = [0]

    # Validate policy strings early.
    valid = {"predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"}
    for pol in args.policies:
        if pol not in valid:
            raise ValueError(f"Unknown policy {pol!r}. Valid policies: {sorted(valid)}")

    set_seed(args.seed)
    dev = device()
    print(f"device={dev}")
    corpus = CharCorpus(load_text(args), args.block_size, dev)
    print(f"vocab_size={corpus.vocab_size} train_tokens={len(corpus.train_data)} val_tokens={len(corpus.val_data)}")
    print(f"policies={args.policies}")
    print(f"active_fractions={args.active_fractions}")
    print(f"warmup_steps_list={args.warmup_steps_list} explore_fractions={args.explore_fractions}")
    print(f"mass_init={args.mass_init} mass_beta={args.mass_beta} ucb_alpha={args.ucb_alpha}")

    # Report how much of the model is governed by row masks.
    tmp_model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)
    total_params, linear_params, linear_frac = parameter_fractions(tmp_model)
    del tmp_model
    print(f"params total={total_params} linear={linear_params} linear_fraction={linear_frac:.3f}")
    if args.freeze_non_linear_when_sparse:
        print("freeze_non_linear_when_sparse=True: embeddings/layernorm/etc. are frozen in sparse runs")
    else:
        print("freeze_non_linear_when_sparse=False: non-Linear params are still updated densely")

    rows: List[Dict[str, float | str]] = []
    print("\nRunning dense baseline")
    rows.append(train_run(corpus, args, policy=None, active_fraction=1.0, warmup_steps=0, explore_fraction=0.0, seed_offset=0))

    # Distinct seed offsets keep each sparse run independent of the baseline.
    seed_offset = 100
    for af in args.active_fractions:
        for pol in args.policies:
            # oracle_current and stale_current do not use explore_fraction; random does not either.
            explore_values = args.explore_fractions if pol in {"predicted_magnitude", "ucb_magnitude"} else [0.0]
            # Warmup matters for every sparse policy, so keep it in the loop.
            for warmup in args.warmup_steps_list:
                for explore in explore_values:
                    print(f"\nRunning policy={pol}, active_fraction={af:.3f}, warmup={warmup}, explore={explore:.2f}")
                    rows.append(
                        train_run(
                            corpus,
                            args,
                            policy=pol,  # type: ignore[arg-type]
                            active_fraction=af,
                            warmup_steps=warmup,
                            explore_fraction=explore,
                            seed_offset=seed_offset,
                        )
                    )
                    seed_offset += 1

    print_summary(rows)

    print("\nNotes")
    print(" oracle_current uses current dense gradients to choose rows; it is the true upper bound.")
    print(" stale_current uses previous-step dense gradient mass; it is a renamed stale/noisy control.")
    print(" predicted_magnitude uses only EMA mass from active/observed rows.")
    print(" ucb_magnitude adds an uncertainty bonus for under-observed rows to improve discovery.")
    print(" coverage is the fraction of Linear rows that have ever been observed/active.")
    print(" new is the average fraction of rows newly observed per non-warmup step.")
    print(" dense gradients are still computed for audit; this is not a wall-clock benchmark yet.")
778
+
779
# Script entry point.
if __name__ == "__main__":
    main()
experiments/sparse_transformer_v8.py ADDED
@@ -0,0 +1,943 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v8: from masked-optimizer simulation to real sparse Linear backward.
3
+
4
+ v7 showed that Transformer Linear-row gradient support is heavy-tailed and stable,
5
+ and that a practical EMA selector can nearly match an oracle selector after a tiny
6
+ warmup. But v7 still computed dense gradients and only masked the optimizer step.
7
+
8
+ v8 tests the next question:
9
+
10
+ Can the sparse row mask be moved into the Linear backward pass itself?
11
+
12
+ Backward modes
13
+ --------------
14
+ 1. masked_optimizer
15
+ v7-style control. Compute dense backward, but MaskedAdam only updates active
16
+ Linear rows. This should match the previous simulation behavior.
17
+
18
+ 2. sparse_dW_full_dX
19
+ Custom autograd Linear computes grad_weight / grad_bias only for active output
20
+ rows, while still propagating full grad_input backward. This is the conservative
21
+ real-backward mode. It targets the dW part of Linear backward only.
22
+
23
+ 3. sparse_dW_sparse_dX
24
+ Custom autograd Linear computes grad_weight only for active rows and also
25
+ propagates grad_input only through active output rows. This is the aggressive
26
+ mode. It may save more backward compute in a real kernel, but it can damage
27
+ upstream learning.
28
+
29
+ Important caveat
30
+ ----------------
31
+ This script still performs a dense audit backward pass each training step to:
32
+ - compute oracle metrics,
33
+ - support oracle_current and stale_current controls,
34
+ - update practical EMA statistics only for active/observed rows.
35
+
36
+ The actual training update in sparse_dW_* modes comes from the custom sparse
37
+ backward pass, not from the dense audit gradients. This is a correctness and
38
+ semantics experiment, not a wall-clock benchmark.
39
+
40
+ Example
41
+ -------
42
+ Smoke test:
43
+ python3 sparse_transformer_v8.py --quick
44
+
45
+ Main comparison:
46
+ python3 sparse_transformer_v8.py \
47
+ --steps 2000 \
48
+ --active_fractions 0.05 0.02 \
49
+ --warmup_steps_list 5 \
50
+ --explore_fractions 0.00 \
51
+ --policies oracle_current predicted_magnitude random \
52
+ --backward_modes masked_optimizer sparse_dW_full_dX sparse_dW_sparse_dX
53
+ """
54
+
55
+ from __future__ import annotations
56
+
57
+ import argparse
58
+ import math
59
+ import random
60
+ from typing import Dict, List, Literal, Optional, Tuple
61
+
62
+ import torch
63
+
64
+ torch.set_num_threads(1)
65
+ import torch.nn as nn
66
+ import torch.nn.functional as F
67
+
68
+ Policy = Literal["predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"]
69
+ BackwardMode = Literal["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
70
+
71
+
72
+ # -----------------------------
73
+ # Reproducibility and device
74
+ # -----------------------------
75
+
76
def set_seed(seed: int) -> None:
    """Make runs reproducible by seeding every RNG the script relies on."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
81
+
82
+
83
def device() -> str:
    """Pick the compute device: prefer CUDA when available, else CPU."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
85
+
86
+
87
def make_cpu_generator(seed: int) -> torch.Generator:
    """Create a CPU-resident torch.Generator seeded with *seed*."""
    generator = torch.Generator(device="cpu")
    generator.manual_seed(seed)
    return generator
91
+
92
+
93
+ # -----------------------------
94
+ # Data
95
+ # -----------------------------
96
+
97
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Generate a deterministic toy corpus of templated sentences.

    The same (n_sentences, seed) pair always yields the same text, keeping the
    character vocabulary and the train/val split reproducible across runs.
    """
    rand = random.Random(seed)
    names = ["ada", "turing", "grace", "lovelace", "noether", "shannon", "hopper", "gauss"]
    verbs = ["builds", "tests", "traces", "compresses", "predicts", "routes", "writes", "measures"]
    objects = ["signals", "gradients", "tokens", "circuits", "features", "masks", "errors", "states"]
    adverbs = ["quietly", "boldly", "slowly", "quickly", "cleanly", "strangely", "carefully"]
    clauses = [
        "when the loss falls",
        "after the mask shifts",
        "before the model answers",
        "while the signal drifts",
        "if the pattern repeats",
        "because the tail is noisy",
    ]
    symbols = ["alpha", "beta", "gamma", "delta", "omega", "sigma"]

    lines: List[str] = []
    for _ in range(n_sentences):
        # Six sentence templates; the draw order below must stay fixed so the
        # RNG stream (and hence the corpus) is stable for a given seed.
        template = rand.randrange(6)
        if template == 0:
            sentence = f"{rand.choice(names)} {rand.choice(verbs)} {rand.choice(objects)} {rand.choice(adverbs)}."
        elif template == 1:
            sentence = f"{rand.choice(clauses)}, {rand.choice(names)} {rand.choice(verbs)} {rand.choice(objects)}."
        elif template == 2:
            a, b = rand.sample(symbols, 2)
            sentence = f"rule {a}: {rand.choice(objects)} -> {rand.choice(objects)}; rule {b}: {rand.choice(objects)} -> {rand.choice(objects)}."
        elif template == 3:
            sentence = f"the {rand.choice(objects)} {rand.choice(verbs)} the {rand.choice(objects)} {rand.choice(adverbs)}."
        elif template == 4:
            seq = " ".join(rand.choice(symbols) for _ in range(rand.randint(2, 7)))
            sentence = f"sequence {seq} ends when {rand.choice(names)} {rand.choice(verbs)}."
        else:
            sentence = f"if {rand.choice(objects)} rise then {rand.choice(names)} {rand.choice(verbs)} {rand.choice(objects)} else wait."
        lines.append(sentence)
    return "\n".join(lines) + "\n"
132
+
133
+
134
class CharCorpus:
    """Character-level corpus with a 90/10 train/val split and random-window batching."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device

        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        boundary = int(0.9 * len(encoded))
        self.train_data = encoded[:boundary]
        self.val_data = encoded[boundary:]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample batch_size random (input, next-char target) windows from *split*.

        Raises ValueError when the split is too short for block_size.
        """
        source = self.train_data if split == "train" else self.val_data
        max_start = len(source) - self.block_size - 1
        if max_start <= 0:
            raise ValueError("Corpus too small for block_size")
        starts = torch.randint(max_start, (batch_size,), generator=generator)
        inputs = torch.stack([source[s : s + self.block_size] for s in starts])
        targets = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return inputs.to(self.device), targets.to(self.device)
162
+
163
+
164
def load_text(args: argparse.Namespace) -> str:
    """Read training text from --text_path, or fall back to the synthetic corpus."""
    if not args.text_path:
        return make_synthetic_corpus(args.synthetic_sentences, args.seed)
    with open(args.text_path, "r", encoding="utf-8") as f:
        return f.read()
169
+
170
+
171
+ # -----------------------------
172
+ # Sparse Linear autograd
173
+ # -----------------------------
174
+
175
class MaskedLinearFunction(torch.autograd.Function):
    """Linear forward with a row-masked backward.

    grad_weight / grad_bias are computed only for output rows flagged in
    active_rows; grad_input is either full (sparse_dx=False) or restricted to
    the active output rows (sparse_dx=True).
    """

    @staticmethod
    def forward(  # type: ignore[override]
        ctx,
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor],
        active_rows: torch.Tensor,
        sparse_dx: bool,
    ) -> torch.Tensor:
        ctx.save_for_backward(x, weight, active_rows)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = bool(sparse_dx)
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):  # type: ignore[override]
        x, weight, active_rows = ctx.saved_tensors
        use_sparse_dx = bool(ctx.sparse_dx)
        with_bias = bool(ctx.has_bias)

        original_shape = x.shape
        flat_x = x.reshape(-1, x.shape[-1])
        flat_gy = grad_y.reshape(-1, grad_y.shape[-1])
        rows = torch.nonzero(active_rows, as_tuple=False).flatten()

        grad_weight = torch.zeros_like(weight)
        grad_bias = None
        if with_bias:
            grad_bias = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype)

        if rows.numel() == 0:
            # This can happen when a global top-k mask selects no rows from a
            # particular layer. Conservative full_dX still propagates through
            # that layer; aggressive sparse_dX cuts it off for that layer.
            flat_gx = torch.zeros_like(flat_x) if use_sparse_dx else flat_gy @ weight
        else:
            selected_gy = flat_gy[:, rows]
            grad_weight[rows] = selected_gy.transpose(0, 1) @ flat_x
            if grad_bias is not None:
                grad_bias[rows] = selected_gy.sum(dim=0)
            flat_gx = selected_gy @ weight[rows] if use_sparse_dx else flat_gy @ weight

        return flat_gx.reshape(original_shape), grad_weight, grad_bias, None, None
226
+
227
+
228
class SparseLinear(nn.Linear):
    """nn.Linear that can route its backward through MaskedLinearFunction.

    With sparsity disabled (the default) it behaves exactly like nn.Linear.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        self.sparse_enabled = False
        self.sparse_dx = False
        self.active_rows: Optional[torch.Tensor] = None

    def set_sparse_backward(self, enabled: bool, active_rows: Optional[torch.Tensor], sparse_dx: bool) -> None:
        """Install (or clear) the row mask used by the sparse backward pass."""
        self.sparse_enabled = bool(enabled)
        self.sparse_dx = bool(sparse_dx)
        self.active_rows = active_rows

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.sparse_enabled and self.active_rows is not None:
            return MaskedLinearFunction.apply(x, self.weight, self.bias, self.active_rows, self.sparse_dx)
        return F.linear(x, self.weight, self.bias)
246
+
247
+
248
+ # -----------------------------
249
+ # Mini GPT
250
+ # -----------------------------
251
+
252
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention whose projections are SparseLinear layers."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        # Fused QKV projection; its output rows are row-mask targets too.
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular causal mask, shaped for broadcast over (batch, head).
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        qkv = self.c_attn(x)
        q, k, v = qkv.split(C, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        # Scaled dot-product attention with the causal mask applied pre-softmax.
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        y = att @ v
        # Merge heads back into the embedding dimension.
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)
277
+
278
+
279
class FeedForward(nn.Module):
    """Position-wise MLP: expand 4x, GELU, project back, then dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
288
+
289
+
290
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
302
+
303
+
304
class MiniGPT(nn.Module):
    """Small GPT-style character LM whose Linear layers are all SparseLinear."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = SparseLinear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when no targets are given."""
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        # Token + learned positional embeddings.
        x = self.tok_emb(idx) + self.pos_emb(pos)[None, :, :]
        x = self.drop(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
327
+
328
+
329
def named_sparse_linear_modules(model: nn.Module) -> List[Tuple[str, SparseLinear]]:
    """Collect every SparseLinear submodule of *model* with its qualified name."""
    found: List[Tuple[str, SparseLinear]] = []
    for name, module in model.named_modules():
        if isinstance(module, SparseLinear):
            found.append((name, module))
    return found
331
+
332
+
333
def parameter_fractions(model: nn.Module) -> Tuple[int, int, float]:
    """Return (total params, params inside SparseLinear layers, their ratio)."""
    total = sum(p.numel() for p in model.parameters())
    linear = sum(
        m.weight.numel() + (m.bias.numel() if m.bias is not None else 0)
        for _, m in named_sparse_linear_modules(model)
    )
    return total, linear, linear / max(1, total)
341
+
342
+
343
def configure_sparse_linears(
    model: nn.Module,
    masker: Optional["RowMasker"],
    enabled: bool,
    backward_mode: Optional[str],
) -> None:
    """Push the current row masks and backward mode into every SparseLinear."""
    use_sparse_dx = backward_mode == "sparse_dW_sparse_dX"
    for _, layer in named_sparse_linear_modules(model):
        rows = None if masker is None else masker.row_mask_for(layer)
        layer.set_sparse_backward(enabled=enabled, active_rows=rows, sparse_dx=use_sparse_dx)
353
+
354
+
355
+ # -----------------------------
356
+ # Mask selector
357
+ # -----------------------------
358
+
359
+ class RowMasker:
360
+ def __init__(
361
+ self,
362
+ model: nn.Module,
363
+ policy: Policy,
364
+ active_fraction: float,
365
+ explore_fraction: float,
366
+ mass_beta: float,
367
+ unobserved_decay: float,
368
+ warmup_steps: int,
369
+ ucb_alpha: float,
370
+ mass_init: float,
371
+ device: str,
372
+ ):
373
+ self.model = model
374
+ self.policy = policy
375
+ self.active_fraction = active_fraction
376
+ self.explore_fraction = explore_fraction
377
+ self.mass_beta = mass_beta
378
+ self.unobserved_decay = unobserved_decay
379
+ self.warmup_steps = warmup_steps
380
+ self.ucb_alpha = ucb_alpha
381
+ self.mass_init = mass_init
382
+ self.device = device
383
+ self.step_index = 0
384
+
385
+ self.linear_modules = [m for _, m in named_sparse_linear_modules(model)]
386
+ self.module_to_ids: Dict[SparseLinear, torch.Tensor] = {}
387
+ ids = []
388
+ offset = 0
389
+ for m in self.linear_modules:
390
+ n = m.weight.shape[0]
391
+ block_ids = torch.arange(offset, offset + n, device=device)
392
+ self.module_to_ids[m] = block_ids
393
+ ids.append(block_ids)
394
+ offset += n
395
+ self.n_blocks = offset
396
+
397
+ self.predicted_mass = torch.full((self.n_blocks,), mass_init, device=device)
398
+ self.last_full_mass = torch.full((self.n_blocks,), mass_init, device=device)
399
+ self.observed_count = torch.zeros(self.n_blocks, device=device)
400
+ self.global_mass_ema = torch.tensor(max(mass_init, 1e-6), device=device)
401
+
402
+ self.prev_active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
403
+ self.active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
404
+ self.row_masks: Dict[SparseLinear, torch.Tensor] = {
405
+ m: torch.zeros(m.weight.shape[0], dtype=torch.bool, device=device) for m in self.linear_modules
406
+ }
407
+
408
+ def _topk_mask(self, values: torch.Tensor, fraction: float) -> torch.Tensor:
409
+ k = max(1, int(fraction * values.numel()))
410
+ mask = torch.zeros_like(values, dtype=torch.bool)
411
+ noisy = values + 1e-9 * torch.rand_like(values)
412
+ mask[torch.topk(noisy, k=k).indices] = True
413
+ return mask
414
+
415
+ @staticmethod
416
+ def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
417
+ inter = (a & b).sum().float()
418
+ union = (a | b).sum().float()
419
+ return float((inter / torch.clamp(union, min=1.0)).item())
420
+
421
+ def _set_active(self, active: torch.Tensor) -> None:
422
+ self.active = active
423
+ self.row_masks = {}
424
+ for m, ids in self.module_to_ids.items():
425
+ self.row_masks[m] = active[ids]
426
+
427
+ def _sample_exploit_explore(self, scores: torch.Tensor) -> torch.Tensor:
428
+ n = self.n_blocks
429
+ k_total = max(1, int(self.active_fraction * n))
430
+ k_explore = min(k_total, max(0, int(self.explore_fraction * k_total)))
431
+ k_exploit = k_total - k_explore
432
+ active = torch.zeros(n, dtype=torch.bool, device=self.device)
433
+
434
+ if k_exploit > 0:
435
+ active[torch.topk(scores + 1e-9 * torch.rand_like(scores), k=k_exploit).indices] = True
436
+ if k_explore > 0:
437
+ remaining = torch.nonzero(~active, as_tuple=False).flatten()
438
+ pick = remaining[torch.randperm(remaining.numel(), device=self.device)[:k_explore]]
439
+ active[pick] = True
440
+ return active
441
+
442
+ def choose_pre_backward(self, step: int) -> None:
443
+ self.step_index = step
444
+ if step < self.warmup_steps:
445
+ self._set_active(torch.ones(self.n_blocks, dtype=torch.bool, device=self.device))
446
+ return
447
+
448
+ if self.policy == "oracle_current":
449
+ # Oracle cannot choose until the dense audit gradient is known.
450
+ self._set_active(torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device))
451
+ return
452
+
453
+ if self.policy == "random":
454
+ self._set_active(self._sample_exploit_explore(torch.rand(self.n_blocks, device=self.device)))
455
+ return
456
+
457
+ if self.policy == "stale_current":
458
+ self._set_active(self._topk_mask(self.last_full_mass, self.active_fraction))
459
+ return
460
+
461
+ if self.policy == "predicted_magnitude":
462
+ self._set_active(self._sample_exploit_explore(self.predicted_mass))
463
+ return
464
+
465
+ if self.policy == "ucb_magnitude":
466
+ t = max(1, step - self.warmup_steps + 1)
467
+ log_term = torch.log(torch.tensor(float(t + 2), device=self.device))
468
+ bonus_scale = torch.clamp(self.global_mass_ema, min=1e-8)
469
+ bonus = self.ucb_alpha * bonus_scale * torch.sqrt(log_term / (self.observed_count + 1.0))
470
+ self._set_active(self._sample_exploit_explore(self.predicted_mass + bonus))
471
+ return
472
+
473
+ raise ValueError(f"Unknown policy: {self.policy}")
474
+
475
+ @torch.no_grad()
476
+ def current_gradient_mass_from_grads(self) -> torch.Tensor:
477
+ mass = torch.zeros(self.n_blocks, device=self.device)
478
+ for m, ids in self.module_to_ids.items():
479
+ if m.weight.grad is None:
480
+ continue
481
+ row_sq = m.weight.grad.square().sum(dim=1)
482
+ if m.bias is not None and m.bias.grad is not None:
483
+ row_sq = row_sq + m.bias.grad.square()
484
+ mass[ids] = torch.sqrt(row_sq + 1e-30)
485
+ return mass
486
+
487
    @torch.no_grad()
    def audit_and_update_from_mass(self, step: int, mass: torch.Tensor) -> Dict[str, float]:
        """Consume a dense-audit gradient mass: refresh the active set (warmup /
        oracle only), compute selection-quality metrics, and update the EMA
        predictor from the rows that were actually observed.

        Returns a dict of scalar metrics (cosine proxy, top-20% mass share,
        Jaccard vs. oracle, mask stability, coverage, ...).
        """
        if step < self.warmup_steps:
            # During warmup every row is active (dense training).
            active = torch.ones(self.n_blocks, dtype=torch.bool, device=self.device)
            self._set_active(active)
        elif self.policy == "oracle_current":
            # Upper-bound control: select directly from the true current mass.
            active = self._topk_mask(mass, self.active_fraction)
            self._set_active(active)
        else:
            active = self.active

        # Norm-ratio proxy for cosine between the masked and full gradients.
        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)
        stability = self._jaccard(active, self.prev_active)
        self.prev_active = active.clone()

        # Fraction of total mass concentrated in the top 20% of rows.
        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        # Rows that are active now but have never been observed before.
        new_active = active & (self.observed_count == 0)

        # Practical rule: update predicted statistics only for active/observed rows.
        self.predicted_mass.mul_(self.unobserved_decay)
        observed = active
        if bool(observed.any().item()):
            obs_mass = mass[observed]
            first_seen = self.observed_count[observed] == 0
            # First observation seeds the EMA directly; later ones blend in.
            ema_mass = self.mass_beta * self.predicted_mass[observed] + (1.0 - self.mass_beta) * obs_mass
            self.predicted_mass[observed] = torch.where(first_seen, obs_mass, ema_mass)
            self.observed_count[observed] += 1.0
            self.global_mass_ema = self.mass_beta * self.global_mass_ema + (1.0 - self.mass_beta) * obs_mass.mean()

        # Dense audit signal; only stale_current is allowed to use this for selection.
        self.last_full_mass = mass.detach().clone()

        return {
            "cosine": cosine,
            "norm_ratio": cosine,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
            "stability": stability,
            "active_fraction_real": float(active.float().mean().item()),
            "coverage": float((self.observed_count > 0).float().mean().item()),
            "avg_obs_count": float(self.observed_count.mean().item()),
            "new_active_fraction": float(new_active.float().mean().item()),
        }
538
+
539
    def row_mask_for(self, module: SparseLinear) -> Optional[torch.Tensor]:
        """Return the boolean per-row active mask for *module*, or None if untracked."""
        return self.row_masks.get(module)
541
+
542
+
543
+ # -----------------------------
544
+ # Masked Adam
545
+ # -----------------------------
546
+
547
class MaskedAdam:
    """Adam-style optimizer whose updates to SparseLinear parameters can be
    restricted to the masker's active rows.

    NOTE(review): unlike standard Adam, no bias correction is applied to the
    moment estimates; this looks deliberate for the experiment, but confirm
    before reusing this optimizer elsewhere.
    """

    def __init__(
        self,
        model: nn.Module,
        masker: Optional[RowMasker],
        lr: float,
        betas=(0.9, 0.95),
        eps=1e-8,
        weight_decay=0.0,
        freeze_non_linear_when_sparse: bool = False,
    ):
        self.model = model
        self.masker = masker
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.freeze_non_linear_when_sparse = freeze_non_linear_when_sparse
        # Lazily created per-parameter Adam moments ("m" and "v").
        self.state: Dict[nn.Parameter, Dict[str, torch.Tensor]] = {}
        # Maps each SparseLinear parameter to (owning module, "weight"/"bias")
        # so step() can fetch the right row mask for it.
        self.linear_param: Dict[nn.Parameter, Tuple[SparseLinear, str]] = {}
        for _, m in named_sparse_linear_modules(model):
            self.linear_param[m.weight] = (m, "weight")
            if m.bias is not None:
                self.linear_param[m.bias] = (m, "bias")

    def zero_grad(self) -> None:
        # Set .grad to None (rather than zeroing) so gradient tensors are freed.
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self) -> None:
        """Apply one (optionally row-masked) Adam update to all parameters with grads."""
        for p in self.model.parameters():
            if p.grad is None:
                continue
            # Optionally freeze every non-SparseLinear parameter (embeddings,
            # layer norms, ...) while training sparsely.
            if self.masker is not None and self.freeze_non_linear_when_sparse and p not in self.linear_param:
                continue

            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            m = self.state[p]["m"]
            v = self.state[p]["v"]
            g = p.grad
            if self.weight_decay:
                # Classic (non-decoupled) L2 regularization folded into the gradient.
                g = g.add(p, alpha=self.weight_decay)

            row_mask = None
            if self.masker is not None and p in self.linear_param:
                module, kind = self.linear_param[p]
                base = self.masker.row_mask_for(module)
                if base is not None:
                    # Broadcast the per-row mask across weight columns; the
                    # bias uses the row mask directly.
                    row_mask = base.view(-1, *([1] * (p.ndim - 1))) if kind == "weight" else base

            if row_mask is None:
                # Dense in-place Adam update (no bias correction).
                m.mul_(self.beta1).add_(g, alpha=1.0 - self.beta1)
                v.mul_(self.beta2).addcmul_(g, g, value=1.0 - self.beta2)
                p.add_(m / (torch.sqrt(v) + self.eps), alpha=-self.lr)
            else:
                mask = row_mask.expand_as(p)
                if not bool(mask.any().item()):
                    continue
                # Moments and parameter change only where the mask is True;
                # inactive rows keep their previous optimizer state untouched.
                new_m = self.beta1 * m + (1.0 - self.beta1) * g
                new_v = self.beta2 * v + (1.0 - self.beta2) * g * g
                m[mask] = new_m[mask]
                v[mask] = new_v[mask]
                update = m / (torch.sqrt(v) + self.eps)
                p[mask] = p[mask] - self.lr * update[mask]
613
+
614
+
615
+ # -----------------------------
616
+ # Training utilities
617
+ # -----------------------------
618
+
619
@torch.no_grad()
def estimate_loss(model: nn.Module, corpus: CharCorpus, batch_size: int, eval_iters: int, seed: int) -> Dict[str, float]:
    """Average train/val loss over eval_iters fixed-seed batches, in dense mode.

    Sparse backward is disabled and the model is restored to train() mode on exit.
    """
    model.eval()
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    results: Dict[str, float] = {}
    for split_index, split in enumerate(["train", "val"]):
        # Distinct but deterministic seed per split (train: +0, val: +100000).
        gen = make_cpu_generator(seed + 100000 * split_index)
        total = 0.0
        for _ in range(eval_iters):
            xb, yb = corpus.get_batch(split, batch_size, generator=gen)
            _, batch_loss = model(xb, yb)
            total += float(batch_loss.item())
        results[split] = total / eval_iters
    model.train()
    return results
634
+
635
+
636
def dense_audit_pass(model: nn.Module, corpus_batch: Tuple[torch.Tensor, torch.Tensor], opt: MaskedAdam, masker: RowMasker) -> torch.Tensor:
    """Run a fully dense backward on *corpus_batch* and return the per-row
    gradient mass. Gradients are cleared before returning, so the pass leaves
    no optimizer-visible state behind."""
    inputs, targets = corpus_batch
    # Make sure the custom sparse backward is off for the audit.
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    opt.zero_grad()
    _, audit_loss = model(inputs, targets)
    audit_loss.backward()
    mass = masker.current_gradient_mass_from_grads()
    opt.zero_grad()
    return mass
645
+
646
+
647
def sparse_training_backward(
    model: nn.Module,
    corpus_batch: Tuple[torch.Tensor, torch.Tensor],
    opt: MaskedAdam,
    masker: Optional[RowMasker],
    backward_mode: Optional[BackwardMode],
) -> float:
    """Run one forward/backward pass and return the loss value.

    The custom row-sparse Linear backward is enabled only for the real sparse
    modes; masked_optimizer (and the dense baseline) use the ordinary dense
    backward. Sparse mode is always switched off again before returning.
    """
    x, y = corpus_batch
    opt.zero_grad()

    use_custom_backward = (
        masker is not None
        and backward_mode is not None
        and backward_mode != "masked_optimizer"
    )
    if use_custom_backward:
        configure_sparse_linears(model, masker=masker, enabled=True, backward_mode=backward_mode)
    else:
        configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)

    _, loss = model(x, y)
    loss.backward()
    # Restore plain dense behavior so later passes start from a clean state.
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    return float(loss.item())
666
+
667
+
668
def train_run(
    corpus: CharCorpus,
    args: argparse.Namespace,
    policy: Optional[Policy],
    backward_mode: Optional[BackwardMode],
    active_fraction: float,
    warmup_steps: int,
    explore_fraction: float,
    seed_offset: int,
) -> Dict[str, float | str]:
    """Train one configuration end-to-end and return a summary-row dict.

    policy=None runs the dense baseline. Otherwise a RowMasker selects active
    rows, a dense audit pass is run every step (for metrics and, for oracle/
    stale policies, for selection), and training uses the requested
    backward_mode. Selector metrics are averaged over post-warmup steps only.
    """
    # Same model initialization and same minibatch sequence for every run by default.
    set_seed(args.seed + (seed_offset if args.unpaired_seeds else 0))
    data_gen = make_cpu_generator(args.seed + 12345)

    dev = corpus.device
    model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)

    masker = None
    if policy is not None:
        masker = RowMasker(
            model=model,
            policy=policy,
            active_fraction=active_fraction,
            explore_fraction=explore_fraction,
            mass_beta=args.mass_beta,
            unobserved_decay=args.unobserved_decay,
            warmup_steps=warmup_steps,
            ucb_alpha=args.ucb_alpha,
            mass_init=args.mass_init,
            device=dev,
        )
    opt = MaskedAdam(
        model,
        masker,
        lr=args.lr,
        weight_decay=args.weight_decay,
        freeze_non_linear_when_sparse=args.freeze_non_linear_when_sparse,
    )

    # Running sums of per-step selector metrics; averaged after the loop.
    sums = {
        "cosine": 0.0,
        "norm_ratio": 0.0,
        "top20_mass": 0.0,
        "jacc_oracle": 0.0,
        "stability": 0.0,
        "active_fraction_real": 0.0,
        "coverage": 0.0,
        "avg_obs_count": 0.0,
        "new_active_fraction": 0.0,
    }
    count = 0

    for step in range(args.steps):
        batch = corpus.get_batch("train", args.batch_size, generator=data_gen)

        if masker is None:
            # Dense baseline: ordinary backward + Adam step.
            loss_value = sparse_training_backward(model, batch, opt, masker=None, backward_mode=None)
            opt.step()
        else:
            # 1) pick the active set, 2) dense audit for metrics/selection,
            # 3) sparse training backward on the same batch, 4) update.
            masker.choose_pre_backward(step)
            full_mass = dense_audit_pass(model, batch, opt, masker)
            metrics = masker.audit_and_update_from_mass(step, full_mass)
            if step >= warmup_steps:
                for k in sums:
                    sums[k] += metrics[k]
                count += 1

            loss_value = sparse_training_backward(model, batch, opt, masker=masker, backward_mode=backward_mode)
            opt.step()

        if args.verbose and (step % args.eval_interval == 0 or step == args.steps - 1):
            losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters, seed=args.seed + 555)
            name = "dense" if policy is None else f"{policy}/{backward_mode}"
            print(
                f"{name:38s} step={step:5d} warm={warmup_steps:4d} explore={explore_fraction:.2f} "
                f"loss={loss_value:.4f} train={losses['train']:.4f} val={losses['val']:.4f}"
            )

    # Final evaluation with a separate seed from the verbose progress evals.
    losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters, seed=args.seed + 999)
    row: Dict[str, float | str] = {
        "run": "dense_baseline" if policy is None else policy,
        "mode": "dense" if backward_mode is None else backward_mode,
        "target_active": 1.0 if policy is None else active_fraction,
        "warmup": warmup_steps,
        "explore": explore_fraction if policy is not None else 0.0,
        "train_loss": losses["train"],
        "val_loss": losses["val"],
    }
    if masker is None or count == 0:
        # No selector metrics available (dense run, or no post-warmup steps).
        row.update({
            "cosine": float("nan"),
            "norm_ratio": float("nan"),
            "top20_mass": float("nan"),
            "jacc_oracle": float("nan"),
            "stability": float("nan"),
            "active_fraction_real": 1.0,
            "coverage": float("nan"),
            "avg_obs_count": float("nan"),
            "new_active_fraction": float("nan"),
        })
    else:
        for k, v in sums.items():
            row[k] = v / count
    return row
772
+
773
+
774
def print_summary(rows: List[Dict[str, float | str]]) -> None:
    """Print an aligned summary table, one line per result row."""
    print("\nSummary")
    header = (
        f"{'run':>22s} {'mode':>19s} {'target':>7s} {'actual':>7s} {'warm':>5s} {'expl':>5s} "
        f"{'val':>8s} {'train':>8s} {'cos':>7s} {'top20':>7s} {'jacc':>7s} "
        f"{'stable':>7s} {'cover':>7s} {'new':>7s}"
    )
    print(header)
    print("-" * len(header))
    for entry in rows:
        # Format every column separately, then join with single spaces so the
        # output matches the header layout exactly.
        cells = [
            f"{str(entry['run']):>22s}",
            f"{str(entry['mode']):>19s}",
            f"{float(entry['target_active']):7.3f}",
            f"{float(entry['active_fraction_real']):7.3f}",
            f"{int(float(entry['warmup'])):5d}",
            f"{float(entry['explore']):5.2f}",
            f"{float(entry['val_loss']):8.4f}",
            f"{float(entry['train_loss']):8.4f}",
            f"{float(entry['cosine']):7.3f}",
            f"{float(entry['top20_mass']):7.3f}",
            f"{float(entry['jacc_oracle']):7.3f}",
            f"{float(entry['stability']):7.3f}",
            f"{float(entry['coverage']):7.3f}",
            f"{float(entry['new_active_fraction']):7.3f}",
        ]
        print(" ".join(cells))
800
+
801
+
802
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line options for the experiment sweep.

    Args:
        argv: Optional explicit argument list. Defaults to None, in which case
            argparse reads sys.argv[1:] — identical to the previous behavior.
            Passing a list makes the parser testable and embeddable without
            touching process-global state.

    Returns:
        The populated argparse.Namespace.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--text_path", type=str, default=None)
    p.add_argument("--synthetic_sentences", type=int, default=12000)
    p.add_argument("--steps", type=int, default=1000)
    p.add_argument("--quick", action="store_true")
    p.add_argument("--batch_size", type=int, default=32)
    p.add_argument("--block_size", type=int, default=64)
    p.add_argument("--n_layer", type=int, default=2)
    p.add_argument("--n_head", type=int, default=4)
    p.add_argument("--n_embd", type=int, default=64)
    p.add_argument("--dropout", type=float, default=0.0)
    p.add_argument("--lr", type=float, default=3e-4)
    p.add_argument("--weight_decay", type=float, default=0.0)
    p.add_argument("--active_fractions", type=float, nargs="+", default=[0.05, 0.02])
    p.add_argument("--policies", type=str, nargs="+", default=["oracle_current", "predicted_magnitude", "random"])
    p.add_argument(
        "--backward_modes",
        type=str,
        nargs="+",
        default=["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"],
    )
    p.add_argument("--explore_fractions", type=float, nargs="+", default=[0.0])
    p.add_argument("--warmup_steps_list", type=int, nargs="+", default=[5])
    p.add_argument("--mass_beta", type=float, default=0.95)
    p.add_argument("--unobserved_decay", type=float, default=1.0)
    p.add_argument("--mass_init", type=float, default=0.0)
    p.add_argument("--ucb_alpha", type=float, default=1.0)
    p.add_argument("--freeze_non_linear_when_sparse", action="store_true")
    p.add_argument("--eval_interval", type=int, default=200)
    p.add_argument("--eval_iters", type=int, default=20)
    p.add_argument("--seed", type=int, default=7)
    p.add_argument("--unpaired_seeds", action="store_true", help="Use different init seeds per run instead of paired seeds.")
    p.add_argument("--verbose", action="store_true")
    return p.parse_args(argv)
837
+
838
+
839
def main() -> None:
    """Parse args, run the dense baseline plus every sparse configuration,
    then print the summary table and interpretation notes."""
    args = parse_args()
    if args.quick:
        # Tiny smoke-test configuration overriding most knobs.
        args.steps = 40
        args.eval_iters = 2
        args.batch_size = 8
        args.block_size = 32
        args.n_layer = 1
        args.n_embd = 32
        args.n_head = 4
        args.synthetic_sentences = 1200
        args.active_fractions = [0.05]
        args.policies = ["predicted_magnitude", "random"]
        args.backward_modes = ["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
        args.explore_fractions = [0.0]
        args.warmup_steps_list = [5]

    # Validate user-supplied names up front, before any training starts.
    valid_policies = {"predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"}
    valid_modes = {"masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"}
    for pol in args.policies:
        if pol not in valid_policies:
            raise ValueError(f"Unknown policy {pol!r}. Valid policies: {sorted(valid_policies)}")
    for mode in args.backward_modes:
        if mode not in valid_modes:
            raise ValueError(f"Unknown backward mode {mode!r}. Valid modes: {sorted(valid_modes)}")

    set_seed(args.seed)
    dev = device()
    print(f"device={dev}")
    corpus = CharCorpus(load_text(args), args.block_size, dev)
    print(f"vocab_size={corpus.vocab_size} train_tokens={len(corpus.train_data)} val_tokens={len(corpus.val_data)}")
    print(f"policies={args.policies}")
    print(f"backward_modes={args.backward_modes}")
    print(f"active_fractions={args.active_fractions}")
    print(f"warmup_steps_list={args.warmup_steps_list} explore_fractions={args.explore_fractions}")
    print(f"mass_init={args.mass_init} mass_beta={args.mass_beta} ucb_alpha={args.ucb_alpha}")
    print(f"paired_seeds={not args.unpaired_seeds}")

    # Throwaway model used only to report parameter counts.
    tmp_model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)
    total_params, linear_params, linear_frac = parameter_fractions(tmp_model)
    del tmp_model
    print(f"params total={total_params} linear={linear_params} linear_fraction={linear_frac:.3f}")
    if args.freeze_non_linear_when_sparse:
        print("freeze_non_linear_when_sparse=True: embeddings/layernorm/etc. are frozen in sparse runs")
    else:
        print("freeze_non_linear_when_sparse=False: non-Linear params are still updated densely")

    if args.dropout != 0.0:
        print("warning: dropout is nonzero; dense audit and sparse training passes may see different dropout masks")

    rows: List[Dict[str, float | str]] = []
    print("\nRunning dense baseline")
    rows.append(
        train_run(
            corpus,
            args,
            policy=None,
            backward_mode=None,
            active_fraction=1.0,
            warmup_steps=0,
            explore_fraction=0.0,
            seed_offset=0,
        )
    )

    # Sweep every (mode, active fraction, policy, warmup, explore) combination.
    seed_offset = 100
    for mode in args.backward_modes:
        for af in args.active_fractions:
            for pol in args.policies:
                # Exploration only applies to the predictive policies.
                explore_values = args.explore_fractions if pol in {"predicted_magnitude", "ucb_magnitude"} else [0.0]
                for warmup in args.warmup_steps_list:
                    for explore in explore_values:
                        print(
                            f"\nRunning mode={mode}, policy={pol}, "
                            f"active_fraction={af:.3f}, warmup={warmup}, explore={explore:.2f}"
                        )
                        rows.append(
                            train_run(
                                corpus,
                                args,
                                policy=pol,  # type: ignore[arg-type]
                                backward_mode=mode,  # type: ignore[arg-type]
                                active_fraction=af,
                                warmup_steps=warmup,
                                explore_fraction=explore,
                                seed_offset=seed_offset,
                            )
                        )
                        seed_offset += 1

    print_summary(rows)

    print("\nNotes")
    print(" masked_optimizer is the v7-style dense-backward simulation control.")
    print(" sparse_dW_full_dX uses custom Linear backward: sparse weight/bias grads, full input gradient.")
    print(" sparse_dW_sparse_dX uses custom Linear backward: sparse weight/bias grads and sparse input gradient.")
    print(" oracle_current uses dense audit gradients to choose rows; it is an upper bound.")
    print(" predicted_magnitude uses EMA mass from active/observed rows only.")
    print(" random is the sparse-support control.")
    print(" dense audit gradients are still computed every step for metrics/control; this is not a speed benchmark.")
    print(" The key comparison is masked_optimizer vs sparse_dW_full_dX. If they match, the v7 effect survives real dW sparsification.")
940
+
941
+
942
if __name__ == "__main__":
    # Script entry point.
    main()
experiments/sparse_transformer_v9.py ADDED
@@ -0,0 +1,1042 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v9: no-audit sparse training after dense warmup.
3
+
4
+ v8 proved that the row-sparse mask can be moved into a custom Linear backward.
5
+ v9 removes the remaining dense-audit crutch.
6
+
7
+ Default behavior
8
+ ----------------
9
+ 1. Run a short dense warmup, usually 5 steps.
10
+ 2. Initialize the EMA row-importance predictor from those dense warmup gradients.
11
+ 3. After warmup, choose active rows from the predictor.
12
+ 4. Train using sparse backward.
13
+ 5. Update EMA statistics only from rows that were actually active/observed.
14
+ 6. Do not compute dense gradients unless --audit_every > 0.
15
+
16
+ Audit behavior
17
+ --------------
18
+ --audit_every 0
19
+ No dense audit after warmup. Cosine/Jaccard/top20 are unavailable and show as nan.
20
+
21
+ --audit_every N
22
+ Every N steps, run an extra dense backward pass on the same batch only to
23
+ measure cosine/top20/Jaccard. The audit is NOT used to update the selector,
24
+ except for oracle_current, which is explicitly an upper-bound control.
25
+
26
+ This is still not a wall-clock benchmark on vanilla PyTorch/MPS/CPU. The custom
27
+ backward uses indexing and ordinary PyTorch matmuls. The goal is to verify that
28
+ the method survives without dense information after warmup.
29
+
30
+ Examples
31
+ --------
32
+ No-audit practical run:
33
+ python3 sparse_transformer_v9.py \
34
+ --device mps \
35
+ --steps 2000 \
36
+ --active_fractions 0.05 0.02 \
37
+ --warmup_steps_list 5 \
38
+ --policies predicted_magnitude random \
39
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
40
+ --audit_every 0
41
+
42
+ Occasional audit for measurement only:
43
+ python3 sparse_transformer_v9.py \
44
+ --steps 2000 \
45
+ --active_fractions 0.05 0.02 \
46
+ --warmup_steps_list 5 \
47
+ --policies predicted_magnitude random \
48
+ --backward_modes sparse_dW_full_dX sparse_dW_sparse_dX \
49
+ --audit_every 100
50
+ """
51
+
52
+
53
+ from __future__ import annotations
54
+
55
+ import argparse
56
+ import math
57
+ import random
58
+ from typing import Dict, List, Literal, Optional, Tuple
59
+
60
+ import torch
61
+
62
+ torch.set_num_threads(1)
63
+ import torch.nn as nn
64
+ import torch.nn.functional as F
65
+
66
+ Policy = Literal["predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"]
67
+ BackwardMode = Literal["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
68
+
69
+
70
+ # -----------------------------
71
+ # Reproducibility and device
72
+ # -----------------------------
73
+
74
def set_seed(seed: int) -> None:
    """Seed the Python, Torch CPU and (when available) all CUDA RNGs."""
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
79
+
80
+
81
def default_device() -> str:
    """Pick the best available backend: cuda, then mps, then cpu."""
    if torch.cuda.is_available():
        return "cuda"
    mps_ok = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    return "mps" if mps_ok else "cpu"
87
+
88
+
89
def make_cpu_generator(seed: int) -> torch.Generator:
    """Create a deterministically seeded CPU-side random generator."""
    # Generator.manual_seed returns the generator itself, so this chains.
    return torch.Generator(device="cpu").manual_seed(seed)
93
+
94
+
95
+ # -----------------------------
96
+ # Data
97
+ # -----------------------------
98
+
99
+ def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
100
+ rng = random.Random(seed)
101
+ names = ["ada", "turing", "grace", "lovelace", "noether", "shannon", "hopper", "gauss"]
102
+ verbs = ["builds", "tests", "traces", "compresses", "predicts", "routes", "writes", "measures"]
103
+ objects = ["signals", "gradients", "tokens", "circuits", "features", "masks", "errors", "states"]
104
+ adverbs = ["quietly", "boldly", "slowly", "quickly", "cleanly", "strangely", "carefully"]
105
+ clauses = [
106
+ "when the loss falls",
107
+ "after the mask shifts",
108
+ "before the model answers",
109
+ "while the signal drifts",
110
+ "if the pattern repeats",
111
+ "because the tail is noisy",
112
+ ]
113
+ symbols = ["alpha", "beta", "gamma", "delta", "omega", "sigma"]
114
+
115
+ lines: List[str] = []
116
+ for _ in range(n_sentences):
117
+ t = rng.randrange(6)
118
+ if t == 0:
119
+ line = f"{rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} {rng.choice(adverbs)}."
120
+ elif t == 1:
121
+ line = f"{rng.choice(clauses)}, {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)}."
122
+ elif t == 2:
123
+ a, b = rng.sample(symbols, 2)
124
+ line = f"rule {a}: {rng.choice(objects)} -> {rng.choice(objects)}; rule {b}: {rng.choice(objects)} -> {rng.choice(objects)}."
125
+ elif t == 3:
126
+ line = f"the {rng.choice(objects)} {rng.choice(verbs)} the {rng.choice(objects)} {rng.choice(adverbs)}."
127
+ elif t == 4:
128
+ seq = " ".join(rng.choice(symbols) for _ in range(rng.randint(2, 7)))
129
+ line = f"sequence {seq} ends when {rng.choice(names)} {rng.choice(verbs)}."
130
+ else:
131
+ line = f"if {rng.choice(objects)} rise then {rng.choice(names)} {rng.choice(verbs)} {rng.choice(objects)} else wait."
132
+ lines.append(line)
133
+ return "\n".join(lines) + "\n"
134
+
135
+
136
class CharCorpus:
    """Character-level corpus with a 90/10 train/val split and random windows."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for i, ch in enumerate(vocab)}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device

        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        cut = int(0.9 * len(encoded))
        self.train_data = encoded[:cut]
        self.val_data = encoded[cut:]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample a (inputs, shifted-targets) batch of random windows from *split*."""
        source = self.train_data if split == "train" else self.val_data
        max_start = len(source) - self.block_size - 1
        if max_start <= 0:
            raise ValueError("Corpus too small for block_size")
        starts = torch.randint(max_start, (batch_size,), generator=generator)
        inputs = torch.stack([source[s : s + self.block_size] for s in starts])
        # Targets are the same windows shifted one character ahead.
        targets = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return inputs.to(self.device), targets.to(self.device)
164
+
165
+
166
def load_text(args: argparse.Namespace) -> str:
    """Return the training text: --text_path contents if given, else a synthetic corpus."""
    if not args.text_path:
        return make_synthetic_corpus(args.synthetic_sentences, args.seed)
    with open(args.text_path, "r", encoding="utf-8") as handle:
        return handle.read()
171
+
172
+
173
+ # -----------------------------
174
+ # Sparse Linear autograd
175
+ # -----------------------------
176
+
177
class MaskedLinearFunction(torch.autograd.Function):
    """Dense-forward linear whose backward only produces weight/bias gradients
    for the output rows flagged in ``active_rows``; the input gradient can
    optionally be restricted to those rows too (``sparse_dx``)."""

    @staticmethod
    def forward(  # type: ignore[override]
        ctx,
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor],
        active_rows: torch.Tensor,
        sparse_dx: bool,
    ) -> torch.Tensor:
        # Forward is an ordinary dense linear; sparsity applies only in backward.
        ctx.save_for_backward(x, weight, active_rows)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = bool(sparse_dx)
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):  # type: ignore[override]
        x, weight, active_rows = ctx.saved_tensors
        sparse_dx = bool(ctx.sparse_dx)
        has_bias = bool(ctx.has_bias)

        # Flatten leading (batch/time) dims so the matmuls below are 2-D.
        x_shape = x.shape
        x_flat = x.reshape(-1, x.shape[-1])
        gy_flat = grad_y.reshape(-1, grad_y.shape[-1])

        active_idx = torch.nonzero(active_rows, as_tuple=False).flatten()

        # Inactive rows keep zero gradients.
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype) if has_bias else None

        if active_idx.numel() > 0:
            # Only the active output rows receive weight/bias gradients.
            gy_active = gy_flat[:, active_idx]
            grad_weight[active_idx] = gy_active.transpose(0, 1) @ x_flat
            if grad_bias is not None:
                grad_bias[active_idx] = gy_active.sum(dim=0)

            if sparse_dx:
                # Input gradient from the active rows' contribution only.
                grad_x_flat = gy_active @ weight[active_idx]
            else:
                grad_x_flat = gy_flat @ weight
        else:
            # This can happen when a global top-k mask selects no rows from a
            # particular layer. Conservative full_dX still propagates through that
            # layer; aggressive sparse_dX cuts it off for that layer.
            if sparse_dx:
                grad_x_flat = torch.zeros_like(x_flat)
            else:
                grad_x_flat = gy_flat @ weight

        grad_x = grad_x_flat.reshape(x_shape)
        # Non-tensor forward inputs (active_rows, sparse_dx) get None gradients.
        return grad_x, grad_weight, grad_bias, None, None
228
+
229
+
230
class SparseLinear(nn.Linear):
    """nn.Linear with an optional row-sparse backward pass.

    With sparsity disabled (the default) it behaves exactly like nn.Linear;
    when enabled, the custom MaskedLinearFunction restricts gradients to the
    installed active-row mask.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        # Sparse backward is opt-in and off until configured.
        self.sparse_enabled = False
        self.sparse_dx = False
        self.active_rows: Optional[torch.Tensor] = None

    def set_sparse_backward(self, enabled: bool, active_rows: Optional[torch.Tensor], sparse_dx: bool) -> None:
        """Toggle the row-sparse backward and install the active-row mask."""
        self.sparse_enabled = bool(enabled)
        self.sparse_dx = bool(sparse_dx)
        self.active_rows = active_rows

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        use_sparse = self.sparse_enabled and self.active_rows is not None
        if use_sparse:
            return MaskedLinearFunction.apply(x, self.weight, self.bias, self.active_rows, self.sparse_dx)
        return F.linear(x, self.weight, self.bias)
248
+
249
+
250
+ # -----------------------------
251
+ # Mini GPT
252
+ # -----------------------------
253
+
254
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built from SparseLinear projections."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        # Fused Q/K/V projection plus the output projection.
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular causal mask, broadcastable over (batch, head).
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        qkv = self.c_attn(x)
        q, k, v = qkv.split(C, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        # Scaled dot-product attention with the causal mask applied pre-softmax.
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        y = att @ v
        # Merge heads back into the embedding dimension.
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)
279
+
280
+
281
class FeedForward(nn.Module):
    """Two-layer GELU MLP with the usual 4x hidden expansion."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
290
+
291
+
292
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attended = x + self.attn(self.ln1(x))
        return attended + self.mlp(self.ln2(attended))
304
+
305
+
306
class MiniGPT(nn.Module):
    """Small GPT-style character language model whose Linear layers are SparseLinear."""

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = SparseLinear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        """Return (logits, loss); loss is None when no targets are given."""
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        # Token embedding plus learned positional embedding.
        x = self.tok_emb(idx) + self.pos_emb(pos)[None, :, :]
        x = self.drop(x)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            # Flatten (B, T) for token-level cross entropy.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
329
+
330
+
331
def named_sparse_linear_modules(model: nn.Module) -> List[Tuple[str, SparseLinear]]:
    """Return (qualified_name, module) for every SparseLinear inside *model*."""
    found: List[Tuple[str, SparseLinear]] = []
    for name, module in model.named_modules():
        if isinstance(module, SparseLinear):
            found.append((name, module))
    return found
333
+
334
+
335
def parameter_fractions(model: nn.Module) -> Tuple[int, int, float]:
    """Return (total params, params inside SparseLinear layers, their fraction)."""
    total = sum(p.numel() for p in model.parameters())
    linear = 0
    for _, module in named_sparse_linear_modules(model):
        linear += module.weight.numel()
        if module.bias is not None:
            linear += module.bias.numel()
    # max(1, total) guards against division by zero for a parameterless model.
    return total, linear, linear / max(1, total)
343
+
344
+
345
def configure_sparse_linears(
    model: nn.Module,
    masker: Optional["RowMasker"],
    enabled: bool,
    backward_mode: Optional[str],
) -> None:
    """Push the current row masks and backward configuration into every SparseLinear."""
    wants_sparse_dx = backward_mode == "sparse_dW_sparse_dX"
    for _, module in named_sparse_linear_modules(model):
        if masker is None:
            rows = None
        else:
            rows = masker.row_mask_for(module)
        module.set_sparse_backward(enabled=enabled, active_rows=rows, sparse_dx=wants_sparse_dx)
355
+
356
+
357
+ # -----------------------------
358
+ # Mask selector
359
+ # -----------------------------
360
+
361
class RowMasker:
    """Selects which output rows ("blocks") of the SparseLinear layers are active
    and maintains per-block gradient-mass statistics that drive the policies.

    Fixes vs. the previous revision:
      * removed the duplicated ``@torch.no_grad()`` decorator on
        ``update_predictor_from_observed_mass`` (it was stacked twice);
      * removed the dead ``ids`` accumulator list in ``__init__``.
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy,
        active_fraction: float,
        explore_fraction: float,
        mass_beta: float,
        unobserved_decay: float,
        warmup_steps: int,
        ucb_alpha: float,
        mass_init: float,
        device: str,
    ):
        self.model = model
        self.policy = policy
        self.active_fraction = active_fraction
        self.explore_fraction = explore_fraction
        self.mass_beta = mass_beta
        self.unobserved_decay = unobserved_decay
        self.warmup_steps = warmup_steps
        self.ucb_alpha = ucb_alpha
        self.mass_init = mass_init
        self.device = device
        self.step_index = 0

        # Assign every SparseLinear output row a unique global block id.
        self.linear_modules = [m for _, m in named_sparse_linear_modules(model)]
        self.module_to_ids: Dict[SparseLinear, torch.Tensor] = {}
        offset = 0
        for m in self.linear_modules:
            n = m.weight.shape[0]
            self.module_to_ids[m] = torch.arange(offset, offset + n, device=device)
            offset += n
        self.n_blocks = offset

        # Per-block statistics consumed by the selection policies.
        self.predicted_mass = torch.full((self.n_blocks,), mass_init, device=device)
        self.last_full_mass = torch.full((self.n_blocks,), mass_init, device=device)
        self.observed_count = torch.zeros(self.n_blocks, device=device)
        self.global_mass_ema = torch.tensor(max(mass_init, 1e-6), device=device)

        self.prev_active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.active = torch.zeros(self.n_blocks, dtype=torch.bool, device=device)
        self.row_masks: Dict[SparseLinear, torch.Tensor] = {
            m: torch.zeros(m.weight.shape[0], dtype=torch.bool, device=device) for m in self.linear_modules
        }

    def _topk_mask(self, values: torch.Tensor, fraction: float) -> torch.Tensor:
        """Boolean mask over the top `fraction` of `values` (at least one entry).

        A tiny random jitter breaks ties so identical scores do not always
        resolve to the same indices.
        """
        k = max(1, int(fraction * values.numel()))
        mask = torch.zeros_like(values, dtype=torch.bool)
        noisy = values + 1e-9 * torch.rand_like(values)
        mask[torch.topk(noisy, k=k).indices] = True
        return mask

    @staticmethod
    def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
        """Jaccard similarity of two boolean masks; 0.0 when both are empty."""
        inter = (a & b).sum().float()
        union = (a | b).sum().float()
        return float((inter / torch.clamp(union, min=1.0)).item())

    def _set_active(self, active: torch.Tensor) -> None:
        """Install `active` as the current global mask and refresh per-module views."""
        self.active = active
        self.row_masks = {}
        for m, ids in self.module_to_ids.items():
            self.row_masks[m] = active[ids]

    def _sample_exploit_explore(self, scores: torch.Tensor) -> torch.Tensor:
        """Pick top-scoring blocks, then fill an explore quota with random others."""
        n = self.n_blocks
        k_total = max(1, int(self.active_fraction * n))
        k_explore = min(k_total, max(0, int(self.explore_fraction * k_total)))
        k_exploit = k_total - k_explore
        active = torch.zeros(n, dtype=torch.bool, device=self.device)

        if k_exploit > 0:
            active[torch.topk(scores + 1e-9 * torch.rand_like(scores), k=k_exploit).indices] = True
        if k_explore > 0:
            remaining = torch.nonzero(~active, as_tuple=False).flatten()
            pick = remaining[torch.randperm(remaining.numel(), device=self.device)[:k_explore]]
            active[pick] = True
        return active

    def choose_pre_backward(self, step: int) -> None:
        """Select the active set for `step` before the training backward pass."""
        self.step_index = step
        if step < self.warmup_steps:
            # Dense warmup: everything is active.
            self._set_active(torch.ones(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "oracle_current":
            # Oracle cannot choose until the dense audit gradient is known.
            self._set_active(torch.zeros(self.n_blocks, dtype=torch.bool, device=self.device))
            return

        if self.policy == "random":
            self._set_active(self._sample_exploit_explore(torch.rand(self.n_blocks, device=self.device)))
            return

        if self.policy == "stale_current":
            self._set_active(self._topk_mask(self.last_full_mass, self.active_fraction))
            return

        if self.policy == "predicted_magnitude":
            self._set_active(self._sample_exploit_explore(self.predicted_mass))
            return

        if self.policy == "ucb_magnitude":
            # UCB bonus shrinks with observation count; scaled by the global EMA.
            t = max(1, step - self.warmup_steps + 1)
            log_term = torch.log(torch.tensor(float(t + 2), device=self.device))
            bonus_scale = torch.clamp(self.global_mass_ema, min=1e-8)
            bonus = self.ucb_alpha * bonus_scale * torch.sqrt(log_term / (self.observed_count + 1.0))
            self._set_active(self._sample_exploit_explore(self.predicted_mass + bonus))
            return

        raise ValueError(f"Unknown policy: {self.policy}")

    @torch.no_grad()
    def current_gradient_mass_from_grads(self) -> torch.Tensor:
        """Per-block L2 gradient mass (weight row + bias element) from current .grad."""
        mass = torch.zeros(self.n_blocks, device=self.device)
        for m, ids in self.module_to_ids.items():
            if m.weight.grad is None:
                continue
            row_sq = m.weight.grad.square().sum(dim=1)
            if m.bias is not None and m.bias.grad is not None:
                row_sq = row_sq + m.bias.grad.square()
            mass[ids] = torch.sqrt(row_sq + 1e-30)
        return mass

    @torch.no_grad()
    def update_predictor_from_observed_mass(self, mass: torch.Tensor, observed: Optional[torch.Tensor] = None) -> Dict[str, float]:
        """Update EMA statistics only for observed rows.

        After warmup, sparse backward only gives trustworthy gradients for active
        rows, so only those rows are allowed to update predicted_mass.
        """
        if observed is None:
            observed = self.active

        new_active = observed & (self.observed_count == 0)
        self.predicted_mass.mul_(self.unobserved_decay)

        if bool(observed.any().item()):
            obs_mass = mass[observed]
            first_seen = self.observed_count[observed] == 0
            ema_mass = self.mass_beta * self.predicted_mass[observed] + (1.0 - self.mass_beta) * obs_mass
            # First observation seeds the EMA directly instead of blending with mass_init.
            self.predicted_mass[observed] = torch.where(first_seen, obs_mass, ema_mass)
            self.observed_count[observed] += 1.0
            self.global_mass_ema = self.mass_beta * self.global_mass_ema + (1.0 - self.mass_beta) * obs_mass.mean()

        stability = self._jaccard(self.active, self.prev_active)
        self.prev_active = self.active.clone()

        return {
            "stability": stability,
            "active_fraction_real": float(self.active.float().mean().item()),
            "coverage": float((self.observed_count > 0).float().mean().item()),
            "avg_obs_count": float(self.observed_count.mean().item()),
            "new_active_fraction": float(new_active.float().mean().item()),
        }

    @torch.no_grad()
    def audit_metrics_from_mass(self, mass: torch.Tensor) -> Dict[str, float]:
        """Compute dense-audit metrics without updating the practical selector."""
        active = self.active
        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)

        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        return {
            "cosine": cosine,
            "norm_ratio": cosine,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
        }

    def audit_and_update_from_mass(self, step: int, mass: torch.Tensor) -> Dict[str, float]:
        """Dense-audit path: compute metrics AND update the selector statistics."""
        if step < self.warmup_steps:
            active = torch.ones(self.n_blocks, dtype=torch.bool, device=self.device)
            self._set_active(active)
        elif self.policy == "oracle_current":
            active = self._topk_mask(mass, self.active_fraction)
            self._set_active(active)
        else:
            active = self.active

        true_sq = mass.square().sum()
        approx_sq = mass[active].square().sum()
        cosine = float((torch.sqrt(approx_sq + 1e-30) / torch.sqrt(true_sq + 1e-30)).item())

        oracle_mask = self._topk_mask(mass, self.active_fraction)
        jacc = self._jaccard(active, oracle_mask)
        stability = self._jaccard(active, self.prev_active)
        self.prev_active = active.clone()

        k20 = max(1, int(0.2 * self.n_blocks))
        sorted_mass = torch.sort(mass, descending=True).values
        top20_mass = float((sorted_mass[:k20].sum() / (sorted_mass.sum() + 1e-12)).item())

        new_active = active & (self.observed_count == 0)

        # Practical rule: update predicted statistics only for active/observed rows.
        self.predicted_mass.mul_(self.unobserved_decay)
        observed = active
        if bool(observed.any().item()):
            obs_mass = mass[observed]
            first_seen = self.observed_count[observed] == 0
            ema_mass = self.mass_beta * self.predicted_mass[observed] + (1.0 - self.mass_beta) * obs_mass
            self.predicted_mass[observed] = torch.where(first_seen, obs_mass, ema_mass)
            self.observed_count[observed] += 1.0
            self.global_mass_ema = self.mass_beta * self.global_mass_ema + (1.0 - self.mass_beta) * obs_mass.mean()

        # Dense audit signal; only stale_current is allowed to use this for selection.
        self.last_full_mass = mass.detach().clone()

        return {
            "cosine": cosine,
            "norm_ratio": cosine,
            "top20_mass": top20_mass,
            "jacc_oracle": jacc,
            "stability": stability,
            "active_fraction_real": float(active.float().mean().item()),
            "coverage": float((self.observed_count > 0).float().mean().item()),
            "avg_obs_count": float(self.observed_count.mean().item()),
            "new_active_fraction": float(new_active.float().mean().item()),
        }

    def row_mask_for(self, module: SparseLinear) -> Optional[torch.Tensor]:
        """Boolean active-row mask for `module`, or None if the module is unknown."""
        return self.row_masks.get(module)
597
+
598
+
599
+ # -----------------------------
600
+ # Masked Adam
601
+ # -----------------------------
602
+
603
class MaskedAdam:
    """Adam-style optimizer that can restrict SparseLinear weight/bias updates
    to the rows currently marked active by a RowMasker.

    NOTE(review): no bias-correction terms are applied (no global step counter
    is kept); the dense and the masked update paths omit them consistently,
    which appears deliberate for this experiment.
    """

    def __init__(
        self,
        model: nn.Module,
        masker: Optional[RowMasker],
        lr: float,
        betas=(0.9, 0.95),
        eps=1e-8,
        weight_decay=0.0,
        freeze_non_linear_when_sparse: bool = False,
    ):
        self.model = model
        self.masker = masker
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.freeze_non_linear_when_sparse = freeze_non_linear_when_sparse
        # Lazily-populated per-parameter Adam state: first/second moment tensors.
        self.state: Dict[nn.Parameter, Dict[str, torch.Tensor]] = {}
        # Maps each SparseLinear weight/bias parameter back to (module, kind)
        # so step() can look up the module's row mask.
        self.linear_param: Dict[nn.Parameter, Tuple[SparseLinear, str]] = {}
        for _, m in named_sparse_linear_modules(model):
            self.linear_param[m.weight] = (m, "weight")
            if m.bias is not None:
                self.linear_param[m.bias] = (m, "bias")

    def zero_grad(self) -> None:
        """Drop gradients by setting .grad to None on every model parameter."""
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self) -> None:
        """Apply one (optionally row-masked) Adam update to all parameters with grads."""
        for p in self.model.parameters():
            if p.grad is None:
                continue
            # Optionally skip non-SparseLinear params (embeddings, layernorms)
            # entirely while a sparse run is active.
            if self.masker is not None and self.freeze_non_linear_when_sparse and p not in self.linear_param:
                continue

            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            m = self.state[p]["m"]
            v = self.state[p]["v"]
            g = p.grad
            if self.weight_decay:
                # L2-style decay folded into the gradient (classic Adam, not AdamW).
                g = g.add(p, alpha=self.weight_decay)

            row_mask = None
            if self.masker is not None and p in self.linear_param:
                module, kind = self.linear_param[p]
                base = self.masker.row_mask_for(module)
                if base is not None:
                    # Weight: broadcast the row mask over trailing dims; bias: use as-is.
                    row_mask = base.view(-1, *([1] * (p.ndim - 1))) if kind == "weight" else base

            if row_mask is None:
                # Dense path: standard (uncorrected) Adam moment updates in place.
                m.mul_(self.beta1).add_(g, alpha=1.0 - self.beta1)
                v.mul_(self.beta2).addcmul_(g, g, value=1.0 - self.beta2)
                p.add_(m / (torch.sqrt(v) + self.eps), alpha=-self.lr)
            else:
                # MPS can mis-handle expanded boolean masks for row-wise assignment
                # (e.g. reporting nonsense out-of-bounds indices). Use explicit
                # row indices and index_copy_ instead. This also avoids materializing
                # a full expanded mask for weight matrices.
                active_rows = row_mask.reshape(-1).nonzero(as_tuple=False).flatten()
                if active_rows.numel() == 0:
                    continue

                m_rows = m.index_select(0, active_rows)
                v_rows = v.index_select(0, active_rows)
                g_rows = g.index_select(0, active_rows)

                # Same Adam arithmetic as the dense path, restricted to active rows.
                new_m_rows = self.beta1 * m_rows + (1.0 - self.beta1) * g_rows
                new_v_rows = self.beta2 * v_rows + (1.0 - self.beta2) * g_rows * g_rows
                update_rows = new_m_rows / (torch.sqrt(new_v_rows) + self.eps)
                p_rows = p.index_select(0, active_rows) - self.lr * update_rows

                m.index_copy_(0, active_rows, new_m_rows)
                v.index_copy_(0, active_rows, new_v_rows)
                p.index_copy_(0, active_rows, p_rows)
680
+
681
+
682
+ # -----------------------------
683
+ # Training utilities
684
+ # -----------------------------
685
+
686
@torch.no_grad()
def estimate_loss(model: nn.Module, corpus: CharCorpus, batch_size: int, eval_iters: int, seed: int) -> Dict[str, float]:
    """Average cross-entropy loss over `eval_iters` batches of each split.

    Sparse backward hooks are disabled and the model is returned to train mode
    before this function exits.
    """
    model.eval()
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    results: Dict[str, float] = {}
    # Each split gets its own deterministic generator (val offset by 100000).
    for seed_offset, split in ((0, "train"), (100000, "val")):
        gen = make_cpu_generator(seed + seed_offset)
        total = 0.0
        for _ in range(eval_iters):
            x, y = corpus.get_batch(split, batch_size, generator=gen)
            _, loss = model(x, y)
            total += float(loss.item())
        results[split] = total / eval_iters
    model.train()
    return results
701
+
702
+
703
def dense_audit_pass(model: nn.Module, corpus_batch: Tuple[torch.Tensor, torch.Tensor], opt: MaskedAdam, masker: RowMasker) -> torch.Tensor:
    """Run one dense backward pass purely to measure per-block gradient mass.

    All gradients are zeroed before returning, so this pass never affects the
    optimizer state or parameters.
    """
    inputs, targets = corpus_batch
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    opt.zero_grad()
    _, audit_loss = model(inputs, targets)
    audit_loss.backward()
    mass = masker.current_gradient_mass_from_grads()
    opt.zero_grad()
    return mass
712
+
713
+
714
def sparse_training_backward(
    model: nn.Module,
    corpus_batch: Tuple[torch.Tensor, torch.Tensor],
    opt: MaskedAdam,
    masker: Optional[RowMasker],
    backward_mode: Optional[BackwardMode],
) -> float:
    """Forward + backward for one batch, optionally through the custom sparse
    Linear backward. Returns the scalar training loss.

    Sparse hooks are always disabled again before returning.
    """
    inputs, targets = corpus_batch
    opt.zero_grad()

    # "masked_optimizer" simulates sparsity in the optimizer, so the backward
    # pass itself stays dense in that mode too.
    use_custom_backward = (
        masker is not None and backward_mode is not None and backward_mode != "masked_optimizer"
    )
    if use_custom_backward:
        configure_sparse_linears(model, masker=masker, enabled=True, backward_mode=backward_mode)
    else:
        configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)

    _, loss = model(inputs, targets)
    loss.backward()
    configure_sparse_linears(model, masker=None, enabled=False, backward_mode=None)
    return float(loss.item())
733
+
734
+
735
def train_run(
    corpus: CharCorpus,
    args: argparse.Namespace,
    policy: Optional[Policy],
    backward_mode: Optional[BackwardMode],
    active_fraction: float,
    warmup_steps: int,
    explore_fraction: float,
    seed_offset: int,
) -> Dict[str, float | str]:
    """Train one MiniGPT configuration and return a summary-row dict.

    policy=None runs the dense baseline (no masker). Otherwise a RowMasker
    drives row selection: a dense bootstrap for `warmup_steps`, then the chosen
    policy with the given backward mode. Selector/audit metrics are averaged
    over the steps at which they were produced; final train/val losses come
    from a fresh deterministic evaluation.
    """
    # Same model initialization and same minibatch sequence for every run by default.
    set_seed(args.seed + (seed_offset if args.unpaired_seeds else 0))
    data_gen = make_cpu_generator(args.seed + 12345)

    dev = corpus.device
    model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)

    masker = None
    if policy is not None:
        masker = RowMasker(
            model=model,
            policy=policy,
            active_fraction=active_fraction,
            explore_fraction=explore_fraction,
            mass_beta=args.mass_beta,
            unobserved_decay=args.unobserved_decay,
            warmup_steps=warmup_steps,
            ucb_alpha=args.ucb_alpha,
            mass_init=args.mass_init,
            device=dev,
        )
    opt = MaskedAdam(
        model,
        masker,
        lr=args.lr,
        weight_decay=args.weight_decay,
        freeze_non_linear_when_sparse=args.freeze_non_linear_when_sparse,
    )

    # Running sums/counts used to average metrics that are only produced on
    # some steps (e.g. audit metrics only on audit steps).
    sums = {
        "cosine": 0.0,
        "norm_ratio": 0.0,
        "top20_mass": 0.0,
        "jacc_oracle": 0.0,
        "stability": 0.0,
        "active_fraction_real": 0.0,
        "coverage": 0.0,
        "avg_obs_count": 0.0,
        "new_active_fraction": 0.0,
    }
    counts = {k: 0 for k in sums}

    def add_metrics(metrics: Dict[str, float]) -> None:
        # Accumulate only the keys we track; unknown keys are ignored.
        for k, v in metrics.items():
            if k in sums:
                sums[k] += float(v)
                counts[k] += 1

    for step in range(args.steps):
        batch = corpus.get_batch("train", args.batch_size, generator=data_gen)

        if masker is None:
            # Dense baseline: plain forward/backward + full Adam step.
            loss_value = sparse_training_backward(model, batch, opt, masker=None, backward_mode=None)
            opt.step()
        else:
            if step < warmup_steps:
                # Dense bootstrap. Every row is active and every row updates the predictor.
                masker._set_active(torch.ones(masker.n_blocks, dtype=torch.bool, device=dev))
                loss_value = sparse_training_backward(model, batch, opt, masker=masker, backward_mode="masked_optimizer")
                full_mass = masker.current_gradient_mass_from_grads()
                masker.last_full_mass = full_mass.detach().clone()
                add_metrics(masker.audit_metrics_from_mass(full_mass))
                add_metrics(masker.update_predictor_from_observed_mass(full_mass, observed=masker.active))
                opt.step()
            else:
                masker.choose_pre_backward(step)

                if policy == "oracle_current":
                    # Explicit upper bound. Oracle necessarily computes dense gradients to choose rows.
                    full_mass = dense_audit_pass(model, batch, opt, masker)
                    masker._set_active(masker._topk_mask(full_mass, active_fraction))
                    masker.last_full_mass = full_mass.detach().clone()
                    add_metrics(masker.audit_metrics_from_mass(full_mass))
                elif args.audit_every > 0 and ((step - warmup_steps) % args.audit_every == 0):
                    # Measurement only. Do not update predicted_magnitude/ucb/random with this dense mass.
                    full_mass = dense_audit_pass(model, batch, opt, masker)
                    add_metrics(masker.audit_metrics_from_mass(full_mass))
                    if policy == "stale_current":
                        masker.last_full_mass = full_mass.detach().clone()

                loss_value = sparse_training_backward(model, batch, opt, masker=masker, backward_mode=backward_mode)

                # Practical selector update: only active rows were observed by the training backward pass.
                observed_mass = masker.current_gradient_mass_from_grads()
                add_metrics(masker.update_predictor_from_observed_mass(observed_mass, observed=masker.active))
                opt.step()

        if args.verbose and (step % args.eval_interval == 0 or step == args.steps - 1):
            losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters, seed=args.seed + 555)
            name = "dense" if policy is None else f"{policy}/{backward_mode}"
            print(
                f"{name:38s} step={step:5d} warm={warmup_steps:4d} explore={explore_fraction:.2f} "
                f"loss={loss_value:.4f} train={losses['train']:.4f} val={losses['val']:.4f}"
            )

    # Final evaluation uses a fixed seed so all runs are compared identically.
    losses = estimate_loss(model, corpus, args.batch_size, args.eval_iters, seed=args.seed + 999)
    row: Dict[str, float | str] = {
        "run": "dense_baseline" if policy is None else policy,
        "mode": "dense" if backward_mode is None else backward_mode,
        "target_active": 1.0 if policy is None else active_fraction,
        "warmup": warmup_steps,
        "explore": explore_fraction if policy is not None else 0.0,
        "train_loss": losses["train"],
        "val_loss": losses["val"],
    }
    if masker is None:
        # Dense baseline has no selector, so sparsity metrics are undefined.
        row.update({
            "cosine": float("nan"),
            "norm_ratio": float("nan"),
            "top20_mass": float("nan"),
            "jacc_oracle": float("nan"),
            "stability": float("nan"),
            "active_fraction_real": 1.0,
            "coverage": float("nan"),
            "avg_obs_count": float("nan"),
            "new_active_fraction": float("nan"),
        })
    else:
        for k in sums:
            row[k] = (sums[k] / counts[k]) if counts[k] > 0 else float("nan")
    return row
866
+
867
+ def print_summary(rows: List[Dict[str, float | str]]) -> None:
868
+ print("\nSummary")
869
+ header = (
870
+ f"{'run':>22s} {'mode':>19s} {'target':>7s} {'actual':>7s} {'warm':>5s} {'expl':>5s} "
871
+ f"{'val':>8s} {'train':>8s} {'cos':>7s} {'top20':>7s} {'jacc':>7s} "
872
+ f"{'stable':>7s} {'cover':>7s} {'new':>7s}"
873
+ )
874
+ print(header)
875
+ print("-" * len(header))
876
+ for r in rows:
877
+ print(
878
+ f"{str(r['run']):>22s} "
879
+ f"{str(r['mode']):>19s} "
880
+ f"{float(r['target_active']):7.3f} "
881
+ f"{float(r['active_fraction_real']):7.3f} "
882
+ f"{int(float(r['warmup'])):5d} "
883
+ f"{float(r['explore']):5.2f} "
884
+ f"{float(r['val_loss']):8.4f} "
885
+ f"{float(r['train_loss']):8.4f} "
886
+ f"{float(r['cosine']):7.3f} "
887
+ f"{float(r['top20_mass']):7.3f} "
888
+ f"{float(r['jacc_oracle']):7.3f} "
889
+ f"{float(r['stability']):7.3f} "
890
+ f"{float(r['coverage']):7.3f} "
891
+ f"{float(r['new_active_fraction']):7.3f}"
892
+ )
893
+
894
+
895
def parse_args() -> argparse.Namespace:
    """Build and parse the CLI for the sparse-vs-dense training sweep.

    Flags cover data/model size, optimizer settings, the sweep axes
    (active_fractions x policies x backward_modes x warmup x explore),
    selector hyperparameters, and auditing/seeding behavior.
    """
    p = argparse.ArgumentParser()
    # Data / model size.
    p.add_argument("--text_path", type=str, default=None)
    p.add_argument("--synthetic_sentences", type=int, default=12000)
    p.add_argument("--steps", type=int, default=1000)
    p.add_argument("--quick", action="store_true")
    p.add_argument("--batch_size", type=int, default=32)
    p.add_argument("--block_size", type=int, default=64)
    p.add_argument("--n_layer", type=int, default=2)
    p.add_argument("--n_head", type=int, default=4)
    p.add_argument("--n_embd", type=int, default=64)
    p.add_argument("--dropout", type=float, default=0.0)
    # Optimizer.
    p.add_argument("--lr", type=float, default=3e-4)
    p.add_argument("--weight_decay", type=float, default=0.0)
    # Sweep axes.
    p.add_argument("--active_fractions", type=float, nargs="+", default=[0.05, 0.02])
    p.add_argument("--policies", type=str, nargs="+", default=["oracle_current", "predicted_magnitude", "random"])
    p.add_argument(
        "--backward_modes",
        type=str,
        nargs="+",
        default=["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"],
    )
    p.add_argument("--explore_fractions", type=float, nargs="+", default=[0.0])
    p.add_argument("--warmup_steps_list", type=int, nargs="+", default=[5])
    # Selector hyperparameters.
    p.add_argument("--mass_beta", type=float, default=0.95)
    p.add_argument("--unobserved_decay", type=float, default=1.0)
    p.add_argument("--mass_init", type=float, default=0.0)
    p.add_argument("--ucb_alpha", type=float, default=1.0)
    p.add_argument("--freeze_non_linear_when_sparse", action="store_true")
    # Evaluation / reproducibility.
    p.add_argument("--eval_interval", type=int, default=200)
    p.add_argument("--eval_iters", type=int, default=20)
    p.add_argument("--seed", type=int, default=7)
    p.add_argument("--device", type=str, default="auto", choices=["auto", "cpu", "cuda", "mps"])
    p.add_argument("--audit_every", type=int, default=0, help="Dense audit interval after warmup. 0 disables audits except oracle_current.")
    p.add_argument("--unpaired_seeds", action="store_true", help="Use different init seeds per run instead of paired seeds.")
    p.add_argument("--verbose", action="store_true")
    return p.parse_args()
932
+
933
+
934
def main() -> None:
    """Run the dense baseline plus the full sparse-policy sweep and print a summary."""
    args = parse_args()
    if args.quick:
        # Tiny smoke-test configuration; overrides most CLI settings.
        args.steps = 40
        args.eval_iters = 2
        args.batch_size = 8
        args.block_size = 32
        args.n_layer = 1
        args.n_embd = 32
        args.n_head = 4
        args.synthetic_sentences = 1200
        args.active_fractions = [0.05]
        args.policies = ["predicted_magnitude", "random"]
        args.backward_modes = ["masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
        args.explore_fractions = [0.0]
        args.warmup_steps_list = [5]
        args.audit_every = 10

    # Validate sweep axes up-front so a typo fails fast, not mid-sweep.
    valid_policies = {"predicted_magnitude", "ucb_magnitude", "oracle_current", "stale_current", "random"}
    valid_modes = {"masked_optimizer", "sparse_dW_full_dX", "sparse_dW_sparse_dX"}
    for pol in args.policies:
        if pol not in valid_policies:
            raise ValueError(f"Unknown policy {pol!r}. Valid policies: {sorted(valid_policies)}")
    for mode in args.backward_modes:
        if mode not in valid_modes:
            raise ValueError(f"Unknown backward mode {mode!r}. Valid modes: {sorted(valid_modes)}")

    set_seed(args.seed)
    dev = args.device if args.device != "auto" else default_device()
    print(f"device={dev}")
    corpus = CharCorpus(load_text(args), args.block_size, dev)
    print(f"vocab_size={corpus.vocab_size} train_tokens={len(corpus.train_data)} val_tokens={len(corpus.val_data)}")
    print(f"policies={args.policies}")
    print(f"backward_modes={args.backward_modes}")
    print(f"active_fractions={args.active_fractions}")
    print(f"warmup_steps_list={args.warmup_steps_list} explore_fractions={args.explore_fractions}")
    print(f"mass_init={args.mass_init} mass_beta={args.mass_beta} ucb_alpha={args.ucb_alpha}")
    print(f"paired_seeds={not args.unpaired_seeds}")
    print(f"audit_every={args.audit_every} (0 means no dense audit after warmup, except oracle_current)")

    # Throwaway model only used to report parameter counts.
    tmp_model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, args.dropout).to(dev)
    total_params, linear_params, linear_frac = parameter_fractions(tmp_model)
    del tmp_model
    print(f"params total={total_params} linear={linear_params} linear_fraction={linear_frac:.3f}")
    if args.freeze_non_linear_when_sparse:
        print("freeze_non_linear_when_sparse=True: embeddings/layernorm/etc. are frozen in sparse runs")
    else:
        print("freeze_non_linear_when_sparse=False: non-Linear params are still updated densely")

    if args.dropout != 0.0:
        print("warning: dropout is nonzero; dense audit and sparse training passes may see different dropout masks")

    rows: List[Dict[str, float | str]] = []
    print("\nRunning dense baseline")
    rows.append(
        train_run(
            corpus,
            args,
            policy=None,
            backward_mode=None,
            active_fraction=1.0,
            warmup_steps=0,
            explore_fraction=0.0,
            seed_offset=0,
        )
    )

    # Full sweep: each combination gets its own (optional) seed offset.
    seed_offset = 100
    for mode in args.backward_modes:
        for af in args.active_fractions:
            for pol in args.policies:
                # Exploration only matters for the EMA/UCB policies.
                explore_values = args.explore_fractions if pol in {"predicted_magnitude", "ucb_magnitude"} else [0.0]
                for warmup in args.warmup_steps_list:
                    for explore in explore_values:
                        print(
                            f"\nRunning mode={mode}, policy={pol}, "
                            f"active_fraction={af:.3f}, warmup={warmup}, explore={explore:.2f}"
                        )
                        rows.append(
                            train_run(
                                corpus,
                                args,
                                policy=pol,  # type: ignore[arg-type]
                                backward_mode=mode,  # type: ignore[arg-type]
                                active_fraction=af,
                                warmup_steps=warmup,
                                explore_fraction=explore,
                                seed_offset=seed_offset,
                            )
                        )
                        seed_offset += 1

    print_summary(rows)

    print("\nNotes")
    print(" masked_optimizer is the v7-style dense-backward simulation control.")
    print(" sparse_dW_full_dX uses custom Linear backward: sparse weight/bias grads, full input gradient.")
    print(" sparse_dW_sparse_dX uses custom Linear backward: sparse weight/bias grads and sparse input gradient.")
    print(" oracle_current uses dense audit gradients to choose rows; it is an upper bound.")
    print(" predicted_magnitude uses EMA mass from active/observed rows only.")
    print(" random is the sparse-support control.")
    print(" v9 does not compute dense audit gradients after warmup unless --audit_every > 0, except oracle_current.")
    print(" predicted_magnitude updates EMA statistics only from active rows observed by the training backward pass.")
    print(" cosine/top20/jacc are nan when --audit_every 0 because no dense reference gradient is computed.")
    print(" This is still not a wall-clock benchmark: PyTorch indexing may not accelerate on CPU/MPS without a custom Metal kernel.")
1039
+
1040
+
1041
# Script entry point: run the full benchmark sweep when executed directly.
if __name__ == "__main__":
    main()
experiments/surprise_topk_gradient_prototype-v2.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Surprise Top-K Gradient Prototype, v2
3
+
4
+ Goal
5
+ ----
6
+ Test the hypothesis:
7
+
8
+ gradient_t ≈ predicted_gradient_t + sparse_surprising_residual_t
9
+
10
+ This version fixes two problems from v1:
11
+ 1. The baseline now actually learns the toy task, using Adam instead of raw SGD.
12
+ 2. The surprise method is tested as an approximate gradient passed into Adam,
13
+ rather than as a hand-written SGD update with unstable error-feedback buffers.
14
+
15
+ Important caveat
16
+ ----------------
17
+ This still computes the full gradient every step. That is intentional. This is a
18
+ hypothesis test: does a sparse "surprising residual" preserve the useful update
19
+ signal? If yes, a later version can try to skip real backward computation.
20
+
21
+ Run
22
+ ---
23
+ python3 surprise_topk_gradient_prototype.py
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import math
29
+ import random
30
+ from dataclasses import dataclass
31
+ from typing import Dict, List, Tuple
32
+
33
+ import torch
34
+ import torch.nn as nn
35
+ import torch.nn.functional as F
36
+
37
+
38
# Global determinism: seed Python's and torch's RNGs once at import time.
SEED = 7
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer CUDA when available; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
43
+
44
+
45
+ # -----------------------------
46
+ # Toy data: 2-class spiral
47
+ # -----------------------------
48
+
49
def make_spiral(n_per_class: int = 1024, noise: float = 0.12) -> Tuple[torch.Tensor, torch.Tensor]:
    """Generate a shuffled two-arm spiral classification dataset.

    Returns (features, labels): features are (2 * n_per_class, 2) points,
    labels are the matching class ids (0 or 1); both are moved to DEVICE.
    """
    arms = []
    for class_id in range(2):
        radius = torch.linspace(0.0, 1.0, n_per_class)
        # Each arm winds two full turns; the second arm starts rotated by pi.
        angle = class_id * math.pi + radius * 4.0 * math.pi
        angle = angle + torch.randn(n_per_class) * noise
        points = torch.stack([radius * torch.sin(angle), radius * torch.cos(angle)], dim=1)
        labels = torch.full((n_per_class,), class_id, dtype=torch.long)
        arms.append((points, labels))

    X = torch.cat([points for points, _ in arms], dim=0)
    Y = torch.cat([labels for _, labels in arms], dim=0)

    # Mild scale expansion helps the MLP separate the spiral.
    X = 3.0 * X

    order = torch.randperm(X.shape[0])
    return X[order].to(DEVICE), Y[order].to(DEVICE)
72
+
73
+
74
+ # -----------------------------
75
+ # Model
76
+ # -----------------------------
77
+
78
class TinyMLP(nn.Module):
    """Four-layer ReLU MLP mapping 2-d points to 2 class logits."""

    def __init__(self, width: int = 128):
        super().__init__()
        # 2 -> width -> width -> width -> 2, ReLU between linear layers.
        layers = [nn.Linear(2, width), nn.ReLU()]
        for _ in range(2):
            layers.append(nn.Linear(width, width))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(width, 2))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of 2-d inputs."""
        return self.net(x)
93
+
94
+
95
def linear_layers(model: nn.Module) -> List[nn.Linear]:
    """Collect every nn.Linear submodule of *model*, in traversal order."""
    found: List[nn.Linear] = []
    for module in model.modules():
        if isinstance(module, nn.Linear):
            found.append(module)
    return found
97
+
98
+
99
+ # -----------------------------
100
+ # Surprise Top-K machinery
101
+ # -----------------------------
102
+
103
@dataclass(frozen=True)
class BlockRef:
    # Identifies one "block": output row `row_index` of the weight matrix of
    # the `layer_index`-th nn.Linear (plus that row's bias element).
    layer_index: int
    row_index: int
107
+
108
+
109
class SurpriseTopKGradientBuilder:
    """
    Builds approximate gradients after a full backward pass.

    Block = one output row of a Linear weight matrix, plus its bias element.

    On active blocks:
        use true gradient.

    On inactive blocks:
        use predicted gradient from an exponential moving average.

    Score:
        surprise = ||true_gradient - predicted_gradient|| / (||true_gradient|| + eps)

    The highest-surprise blocks become active on non-refresh steps.
    """

    def __init__(
        self,
        model: nn.Module,
        beta: float = 0.95,
        active_fraction: float = 0.2,
        refresh_interval: int = 10,
        warmup_steps: int = 100,
        eps: float = 1e-12,
    ):
        # beta: EMA decay for the per-block gradient predictor.
        # active_fraction: fraction of blocks that get the true gradient on sparse steps.
        # refresh_interval: every Nth step all blocks use the true gradient.
        # warmup_steps: initial steps always use the full (true) gradient.
        self.model = model
        self.layers = linear_layers(model)
        self.beta = beta
        self.active_fraction = active_fraction
        self.refresh_interval = refresh_interval
        self.warmup_steps = warmup_steps
        self.eps = eps

        # One BlockRef per (layer, weight-row) pair.
        self.blocks: List[BlockRef] = []
        for li, layer in enumerate(self.layers):
            for row in range(layer.weight.shape[0]):
                self.blocks.append(BlockRef(li, row))

        # EMA gradient predictors, one tensor per layer (weight and bias),
        # initialized to zero (i.e. "predict no gradient").
        self.pred_w: Dict[int, torch.Tensor] = {}
        self.pred_b: Dict[int, torch.Tensor] = {}

        for li, layer in enumerate(self.layers):
            self.pred_w[li] = torch.zeros_like(layer.weight.data)
            if layer.bias is not None:
                self.pred_b[li] = torch.zeros_like(layer.bias.data)

        # Per-block surprise scores; start at 1 so every block is initially eligible.
        self.scores = torch.ones(len(self.blocks), device=DEVICE)

    def _choose_active_blocks(self, step: int) -> torch.Tensor:
        # Returns a boolean mask over blocks: True = use the true gradient.
        n = len(self.blocks)

        # Warmup and periodic refresh steps activate every block.
        if step < self.warmup_steps:
            return torch.ones(n, dtype=torch.bool, device=DEVICE)

        if step % self.refresh_interval == 0:
            return torch.ones(n, dtype=torch.bool, device=DEVICE)

        # Otherwise keep only the top-k most surprising blocks.
        k = max(1, int(self.active_fraction * n))
        active = torch.zeros(n, dtype=torch.bool, device=DEVICE)
        idx = torch.topk(self.scores, k=k).indices
        active[idx] = True
        return active

    @torch.no_grad()
    def build_and_install_approx_grads(self, step: int) -> Dict[str, float]:
        """Overwrite every layer's .grad with the block-sparse approximation.

        Must be called after loss.backward() and before optimizer.step().
        Returns diagnostic scalars (cosine to true grad, predictor quality,
        surprise-mass concentration, active fraction).
        """
        active = self._choose_active_blocks(step)

        true_parts = []
        approx_parts = []
        pred_parts = []

        # We build full approximate grad tensors, then overwrite .grad so Adam sees
        # the approximate gradient.
        approx_w: Dict[int, torch.Tensor] = {}
        approx_b: Dict[int, torch.Tensor] = {}
        for li, layer in enumerate(self.layers):
            approx_w[li] = torch.zeros_like(layer.weight.grad)
            if layer.bias is not None:
                approx_b[li] = torch.zeros_like(layer.bias.grad)

        # NOTE(review): this per-block Python loop calls .item() per block and
        # will force a host sync each iteration on GPU — acceptable for a
        # prototype that measures, not for a speedup version.
        for block_id, block in enumerate(self.blocks):
            li = block.layer_index
            row = block.row_index
            layer = self.layers[li]
            is_active = bool(active[block_id].item())

            # g_* = true gradient for this block, p_* = EMA prediction.
            g_w = layer.weight.grad[row].detach().clone()
            p_w = self.pred_w[li][row].detach().clone()

            if layer.bias is not None:
                g_b = layer.bias.grad[row].detach().clone()
                p_b = self.pred_b[li][row].detach().clone()
            else:
                g_b = None
                p_b = None

            # Score is computed against the predictor BEFORE updating predictor.
            true_vec_items = [g_w.flatten()]
            pred_vec_items = [p_w.flatten()]
            if g_b is not None:
                true_vec_items.append(g_b.view(1))
                pred_vec_items.append(p_b.view(1))

            true_vec_block = torch.cat(true_vec_items)
            pred_vec_block = torch.cat(pred_vec_items)
            residual = true_vec_block - pred_vec_block
            # Normalized surprise: relative prediction error for this block.
            self.scores[block_id] = torch.norm(residual) / (torch.norm(true_vec_block) + self.eps)

            # Active block -> true gradient; inactive -> predicted gradient.
            if is_active:
                a_w = g_w
                a_b = g_b
            else:
                a_w = p_w
                a_b = p_b

            approx_w[li][row] = a_w
            if layer.bias is not None:
                approx_b[li][row] = a_b

            # Update EMA predictor from the true gradient. We allow this in the
            # prototype because full gradients are being computed for measurement.
            # A speedup version would only update this from refresh/active blocks.
            self.pred_w[li][row].mul_(self.beta).add_(g_w, alpha=1.0 - self.beta)
            if layer.bias is not None:
                self.pred_b[li][row].mul_(self.beta).add_(g_b, alpha=1.0 - self.beta)

            approx_vec_items = [a_w.flatten()]
            if a_b is not None:
                approx_vec_items.append(a_b.view(1))

            true_parts.append(true_vec_block)
            pred_parts.append(pred_vec_block)
            approx_parts.append(torch.cat(approx_vec_items))

        # Install approximate gradients for the optimizer.
        for li, layer in enumerate(self.layers):
            layer.weight.grad.copy_(approx_w[li])
            if layer.bias is not None:
                layer.bias.grad.copy_(approx_b[li])

        # Flatten everything for global diagnostics.
        true_vec = torch.cat(true_parts)
        pred_vec = torch.cat(pred_parts)
        approx_vec = torch.cat(approx_parts)

        cosine = F.cosine_similarity(true_vec, approx_vec, dim=0).item()
        # Fraction of true-gradient energy explained by the EMA predictor alone.
        pred_explained = 1.0 - (
            torch.norm(true_vec - pred_vec).pow(2) / (torch.norm(true_vec).pow(2) + self.eps)
        ).item()

        # Share of total surprise carried by the top 20% of blocks
        # (heavy-tailedness proxy).
        k20 = max(1, int(0.2 * len(self.blocks)))
        sorted_scores = torch.sort(self.scores.detach(), descending=True).values
        top20_mass = (sorted_scores[:k20].sum() / (sorted_scores.sum() + self.eps)).item()

        return {
            "active_fraction": float(active.float().mean().item()),
            "cosine_true_vs_approx": cosine,
            "pred_explained_fraction": pred_explained,
            "top20_surprise_mass": top20_mass,
        }
270
+
271
+
272
+ # -----------------------------
273
+ # Metrics and training
274
+ # -----------------------------
275
+
276
def accuracy(model: nn.Module, X: torch.Tensor, y: torch.Tensor) -> float:
    """Fraction of samples in (X, y) that *model* classifies correctly."""
    model.eval()
    with torch.no_grad():
        predictions = model(X).argmax(dim=1)
        correct = (predictions == y).float()
        return correct.mean().item()
281
+
282
+
283
def train_baseline(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a TinyMLP with plain Adam on random minibatches.

    Returns the trained model plus a history of periodic checkpoints
    (step, loss, full-dataset accuracy).
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        batch = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[batch], y[batch]

        loss = F.cross_entropy(model(xb), yb)
        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        # Log every 100 steps and always include the final step.
        if step % 100 == 0 or step == steps - 1:
            history.append(
                {
                    "step": step,
                    "loss": float(loss.item()),
                    "accuracy": accuracy(model, X, y),
                }
            )

    return model, history
313
+
314
+
315
def train_surprise_topk(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
    active_fraction: float = 0.2,
    refresh_interval: int = 10,
    warmup_steps: int = 100,
    beta: float = 0.95,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a TinyMLP with Adam fed surprise-top-k approximate gradients.

    Identical loop to train_baseline, except that after each backward pass the
    SurpriseTopKGradientBuilder replaces .grad with its block-sparse
    approximation before the optimizer step.
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    builder = SurpriseTopKGradientBuilder(
        model,
        beta=beta,
        active_fraction=active_fraction,
        refresh_interval=refresh_interval,
        warmup_steps=warmup_steps,
    )
    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        batch = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[batch], y[batch]

        loss = F.cross_entropy(model(xb), yb)
        opt.zero_grad(set_to_none=True)
        loss.backward()

        # Swap true gradients for the approximation before Adam consumes them.
        diagnostics = builder.build_and_install_approx_grads(step)
        opt.step()

        if step % 100 == 0 or step == steps - 1:
            record = {
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
            }
            record.update(diagnostics)
            history.append(record)

    return model, history
360
+
361
+
362
def print_last(label: str, history: List[Dict[str, float]]) -> None:
    """Pretty-print every field of the final logged history row."""
    print(f"\n{label}")
    final_row = history[-1]
    for key, value in final_row.items():
        if isinstance(value, float):
            print(f" {key:28s}: {value:.4f}")
        else:
            print(f" {key:28s}: {value}")
369
+
370
+
371
def print_checkpoints(label: str, history: List[Dict[str, float]]) -> None:
    """Print up to roughly 8 evenly spaced checkpoint rows from *history*."""
    print(f"\n{label} checkpoints:")
    stride = max(1, len(history) // 8)
    for row in history[::stride]:
        # Sparse-policy rows carry extra diagnostics; baseline rows do not.
        if "cosine_true_vs_approx" in row:
            extra = (
                f" active={row['active_fraction']:.2f}"
                f" cos={row['cosine_true_vs_approx']:.3f}"
                f" pred_expl={row['pred_explained_fraction']:.3f}"
                f" top20_mass={row['top20_surprise_mass']:.3f}"
            )
        else:
            extra = ""
        print(
            f"step={row['step']:4d} "
            f"loss={row['loss']:.4f} "
            f"acc={row['accuracy']:.3f}"
            f"{extra}"
        )
389
+
390
+
391
def main() -> None:
    """Run the full-Adam baseline and the surprise-top-k run, then report."""
    X, y = make_spiral()

    baseline_model, baseline_hist = train_baseline(X, y)
    surprise_model, surprise_hist = train_surprise_topk(
        X,
        y,
        active_fraction=0.2,
        refresh_interval=10,
        warmup_steps=100,
        beta=0.95,
    )

    print_last("Baseline full Adam", baseline_hist)
    print_last("Surprise Top-K simulated Adam", surprise_hist)
    print_checkpoints("Baseline", baseline_hist)
    print_checkpoints("Surprise Top-K", surprise_hist)

    print("\nHow to read this:")
    for hint in (
        " cos near 1.0 => approximate update points like the true gradient",
        " pred_expl > 0 => predictor beats zero as a gradient guess",
        " top20_mass high => surprise is heavy-tailed / concentrated",
        " accuracy close => approximation did not wreck training",
    ):
        print(hint)
415
+
416
+
417
# Script entry point: run the experiment when executed directly.
if __name__ == "__main__":
    main()
experiments/surprise_topk_gradient_prototype-v3.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Surprise Top-K Gradient Prototype, v3
3
+
4
+ Goal
5
+ ----
6
+ Test the hypothesis:
7
+
8
+ gradient_t ≈ predicted_gradient_t + sparse_surprising_residual_t
9
+
10
+ What changed from v2
11
+ --------------------
12
+ 1. Checkpoints no longer accidentally land mostly on refresh steps.
13
+ 2. Surprise mass is measured using absolute residual norm, not a normalized ratio
14
+ that explodes when true gradients are tiny.
15
+ 3. We track both:
16
+ - score_for_selection: normalized surprise, used to decide active blocks
17
+ - residual_mass: absolute residual norm, used to test heavy-tailed structure
18
+ 4. We print separate summaries for refresh steps and sparse steps.
19
+ 5. We compare surprise-top-k against magnitude-top-k and random-top-k policies.
20
+
21
+ Important caveat
22
+ ----------------
23
+ This still computes the full gradient every step. That is intentional. This is a
24
+ hypothesis test: does a sparse "surprising residual" preserve the useful update
25
+ signal? If yes, a later version can try to skip real backward computation.
26
+
27
+ Run
28
+ ---
29
+ python3 surprise_topk_gradient_prototype.py
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import math
35
+ import random
36
+ from dataclasses import dataclass
37
+ from typing import Dict, List, Literal, Tuple
38
+
39
+ import torch
40
+ import torch.nn as nn
41
+ import torch.nn.functional as F
42
+
43
+
44
+ SEED = 7
45
+ random.seed(SEED)
46
+ torch.manual_seed(SEED)
47
+
48
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
49
+ Policy = Literal["surprise", "magnitude", "random"]
50
+
51
+
52
+ # -----------------------------
53
+ # Toy data: 2-class spiral
54
+ # -----------------------------
55
+
56
def make_spiral(n_per_class: int = 1024, noise: float = 0.12) -> Tuple[torch.Tensor, torch.Tensor]:
    """Generate a shuffled two-arm spiral classification dataset.

    Returns (features, labels): features are (2 * n_per_class, 2) points,
    labels are the matching class ids (0 or 1); both are moved to DEVICE.
    """
    arms = []
    for class_id in range(2):
        radius = torch.linspace(0.0, 1.0, n_per_class)
        # Each arm winds two full turns; the second arm starts rotated by pi.
        angle = class_id * math.pi + radius * 4.0 * math.pi
        angle = angle + torch.randn(n_per_class) * noise
        points = torch.stack([radius * torch.sin(angle), radius * torch.cos(angle)], dim=1)
        labels = torch.full((n_per_class,), class_id, dtype=torch.long)
        arms.append((points, labels))

    X = torch.cat([points for points, _ in arms], dim=0)
    Y = torch.cat([labels for _, labels in arms], dim=0)
    X = 3.0 * X  # mild scale expansion helps the MLP

    order = torch.randperm(X.shape[0])
    return X[order].to(DEVICE), Y[order].to(DEVICE)
77
+
78
+
79
+ # -----------------------------
80
+ # Model
81
+ # -----------------------------
82
+
83
class TinyMLP(nn.Module):
    """Four-layer ReLU MLP mapping 2-d points to 2 class logits."""

    def __init__(self, width: int = 128):
        super().__init__()
        # 2 -> width -> width -> width -> 2, ReLU between linear layers.
        layers = [nn.Linear(2, width), nn.ReLU()]
        for _ in range(2):
            layers.append(nn.Linear(width, width))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(width, 2))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of 2-d inputs."""
        return self.net(x)
98
+
99
+
100
def linear_layers(model: nn.Module) -> List[nn.Linear]:
    """Collect every nn.Linear submodule of *model*, in traversal order."""
    found: List[nn.Linear] = []
    for module in model.modules():
        if isinstance(module, nn.Linear):
            found.append(module)
    return found
102
+
103
+
104
+ # -----------------------------
105
+ # Surprise Top-K machinery
106
+ # -----------------------------
107
+
108
@dataclass(frozen=True)
class BlockRef:
    # Identifies one "block": output row `row_index` of the weight matrix of
    # the `layer_index`-th nn.Linear (plus that row's bias element).
    layer_index: int
    row_index: int
112
+
113
+
114
class SparseGradientBuilder:
    """
    Builds approximate gradients after a full backward pass.

    Block = one output row of a Linear weight matrix, plus its bias element.

    Approx gradient:
        active blocks -> true gradient
        inactive blocks -> EMA-predicted gradient

    Selection policies:
        surprise -> choose blocks where true gradient differs most from prediction
        magnitude -> choose blocks with largest true gradient norm
        random -> choose random blocks
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy = "surprise",
        beta: float = 0.95,
        active_fraction: float = 0.2,
        refresh_interval: int = 10,
        warmup_steps: int = 100,
        eps: float = 1e-12,
    ):
        # policy: which block-selection rule to use on sparse steps.
        # beta: EMA decay for the per-block gradient predictor.
        # active_fraction: fraction of blocks given the true gradient per sparse step.
        # refresh_interval / warmup_steps: steps that use the full gradient everywhere.
        self.model = model
        self.layers = linear_layers(model)
        self.policy = policy
        self.beta = beta
        self.active_fraction = active_fraction
        self.refresh_interval = refresh_interval
        self.warmup_steps = warmup_steps
        self.eps = eps

        # One BlockRef per (layer, weight-row) pair.
        self.blocks: List[BlockRef] = []
        for li, layer in enumerate(self.layers):
            for row in range(layer.weight.shape[0]):
                self.blocks.append(BlockRef(li, row))

        # EMA gradient predictors, one tensor per layer, initialized to zero.
        self.pred_w: Dict[int, torch.Tensor] = {}
        self.pred_b: Dict[int, torch.Tensor] = {}

        for li, layer in enumerate(self.layers):
            self.pred_w[li] = torch.zeros_like(layer.weight.data)
            if layer.bias is not None:
                self.pred_b[li] = torch.zeros_like(layer.bias.data)

        # Per-block statistics (all start at 1 so every block is eligible):
        # selection_score -> normalized surprise, drives the "surprise" policy
        # residual_mass   -> absolute ||g - pred||, heavy-tail diagnostic
        # gradient_mass   -> absolute ||g||, drives the "magnitude" policy
        self.selection_score = torch.ones(len(self.blocks), device=DEVICE)
        self.residual_mass = torch.ones(len(self.blocks), device=DEVICE)
        self.gradient_mass = torch.ones(len(self.blocks), device=DEVICE)

    def _is_refresh_step(self, step: int) -> bool:
        # Refresh steps (warmup or every Nth step) use the true gradient everywhere.
        return step < self.warmup_steps or step % self.refresh_interval == 0

    def _choose_active_blocks(self, step: int) -> torch.Tensor:
        # Returns a boolean mask over blocks: True = use the true gradient.
        n = len(self.blocks)

        if self._is_refresh_step(step):
            return torch.ones(n, dtype=torch.bool, device=DEVICE)

        k = max(1, int(self.active_fraction * n))
        active = torch.zeros(n, dtype=torch.bool, device=DEVICE)

        # Dispatch on the configured selection policy.
        if self.policy == "surprise":
            idx = torch.topk(self.selection_score, k=k).indices
        elif self.policy == "magnitude":
            idx = torch.topk(self.gradient_mass, k=k).indices
        elif self.policy == "random":
            idx = torch.randperm(n, device=DEVICE)[:k]
        else:
            raise ValueError(f"Unknown policy: {self.policy}")

        active[idx] = True
        return active

    @torch.no_grad()
    def build_and_install_approx_grads(self, step: int) -> Dict[str, float]:
        """Overwrite every layer's .grad with the block-sparse approximation.

        Must be called after loss.backward() and before optimizer.step().
        Returns diagnostic scalars; "is_refresh" flags full-gradient steps.
        """
        active = self._choose_active_blocks(step)
        is_refresh = self._is_refresh_step(step)

        true_parts = []
        approx_parts = []
        pred_parts = []

        # Full-size buffers that will replace .grad at the end.
        approx_w: Dict[int, torch.Tensor] = {}
        approx_b: Dict[int, torch.Tensor] = {}
        for li, layer in enumerate(self.layers):
            approx_w[li] = torch.zeros_like(layer.weight.grad)
            if layer.bias is not None:
                approx_b[li] = torch.zeros_like(layer.bias.grad)

        # NOTE(review): per-block Python loop with .item() syncs; prototype-only.
        for block_id, block in enumerate(self.blocks):
            li = block.layer_index
            row = block.row_index
            layer = self.layers[li]
            is_active = bool(active[block_id].item())

            # g_* = true gradient for this block, p_* = EMA prediction.
            g_w = layer.weight.grad[row].detach().clone()
            p_w = self.pred_w[li][row].detach().clone()

            if layer.bias is not None:
                g_b = layer.bias.grad[row].detach().clone()
                p_b = self.pred_b[li][row].detach().clone()
            else:
                g_b = None
                p_b = None

            # Scores are computed against the predictor BEFORE it is updated.
            true_vec_items = [g_w.flatten()]
            pred_vec_items = [p_w.flatten()]
            if g_b is not None:
                true_vec_items.append(g_b.view(1))
                pred_vec_items.append(p_b.view(1))

            true_vec_block = torch.cat(true_vec_items)
            pred_vec_block = torch.cat(pred_vec_items)
            residual = true_vec_block - pred_vec_block

            grad_norm = torch.norm(true_vec_block)
            residual_norm = torch.norm(residual)

            self.gradient_mass[block_id] = grad_norm
            self.residual_mass[block_id] = residual_norm

            # Selection score deliberately normalized, but with a floor so tiny
            # gradients do not create absurd ratios.
            denom = torch.maximum(grad_norm, torch.tensor(1e-6, device=DEVICE))
            self.selection_score[block_id] = residual_norm / denom

            # Active block -> true gradient; inactive -> predicted gradient.
            if is_active:
                a_w = g_w
                a_b = g_b
            else:
                a_w = p_w
                a_b = p_b

            approx_w[li][row] = a_w
            if layer.bias is not None:
                approx_b[li][row] = a_b

            # Prototype choice: update EMA from true gradient on every step,
            # because we computed full gradients for measurement. A speedup version
            # would only update this on refresh/active blocks.
            self.pred_w[li][row].mul_(self.beta).add_(g_w, alpha=1.0 - self.beta)
            if layer.bias is not None:
                self.pred_b[li][row].mul_(self.beta).add_(g_b, alpha=1.0 - self.beta)

            approx_vec_items = [a_w.flatten()]
            if a_b is not None:
                approx_vec_items.append(a_b.view(1))

            true_parts.append(true_vec_block)
            pred_parts.append(pred_vec_block)
            approx_parts.append(torch.cat(approx_vec_items))

        # Install approximate gradients for the optimizer.
        for li, layer in enumerate(self.layers):
            layer.weight.grad.copy_(approx_w[li])
            if layer.bias is not None:
                layer.bias.grad.copy_(approx_b[li])

        # Flatten everything for global diagnostics.
        true_vec = torch.cat(true_parts)
        pred_vec = torch.cat(pred_parts)
        approx_vec = torch.cat(approx_parts)

        true_norm = torch.norm(true_vec)
        pred_error_norm = torch.norm(true_vec - pred_vec)

        cosine = F.cosine_similarity(true_vec, approx_vec, dim=0).item()
        # Fraction of true-gradient energy explained by the EMA predictor alone.
        pred_explained = 1.0 - (
            pred_error_norm.pow(2) / (true_norm.pow(2) + self.eps)
        ).item()

        # Share of total mass carried by the top 20% of blocks (heavy-tail proxy),
        # measured separately for residual mass and raw gradient mass.
        k20 = max(1, int(0.2 * len(self.blocks)))

        sorted_residual = torch.sort(self.residual_mass.detach(), descending=True).values
        top20_residual_mass = (sorted_residual[:k20].sum() / (sorted_residual.sum() + self.eps)).item()

        sorted_gradient = torch.sort(self.gradient_mass.detach(), descending=True).values
        top20_gradient_mass = (sorted_gradient[:k20].sum() / (sorted_gradient.sum() + self.eps)).item()

        return {
            "is_refresh": float(is_refresh),
            "active_fraction": float(active.float().mean().item()),
            "cosine_true_vs_approx": cosine,
            "pred_explained_fraction": pred_explained,
            "top20_residual_mass": top20_residual_mass,
            "top20_gradient_mass": top20_gradient_mass,
            "true_grad_norm": float(true_norm.item()),
            "pred_error_norm": float(pred_error_norm.item()),
        }
304
+
305
+
306
+ # -----------------------------
307
+ # Metrics and training
308
+ # -----------------------------
309
+
310
def accuracy(model: nn.Module, X: torch.Tensor, y: torch.Tensor) -> float:
    """Fraction of samples in (X, y) that *model* classifies correctly."""
    model.eval()
    with torch.no_grad():
        predictions = model(X).argmax(dim=1)
        correct = (predictions == y).float()
        return correct.mean().item()
315
+
316
+
317
def train_baseline(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a TinyMLP with plain Adam on random minibatches.

    Returns the trained model plus a history of periodic checkpoints.
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        batch = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[batch], y[batch]

        loss = F.cross_entropy(model(xb), yb)
        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        # Log every 97 steps (coprime with refresh_interval=10 so checkpoints
        # do not all land on refresh steps) plus the final step.
        if step % 97 == 0 or step == steps - 1:
            history.append(
                {
                    "step": step,
                    "loss": float(loss.item()),
                    "accuracy": accuracy(model, X, y),
                }
            )

    return model, history
347
+
348
+
349
def train_sparse_policy(
    X: torch.Tensor,
    y: torch.Tensor,
    policy: Policy,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
    active_fraction: float = 0.2,
    refresh_interval: int = 10,
    warmup_steps: int = 100,
    beta: float = 0.95,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a TinyMLP with Adam fed block-sparse approximate gradients.

    Same loop as train_baseline, except a SparseGradientBuilder with the given
    *policy* replaces .grad after each backward pass.
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    builder = SparseGradientBuilder(
        model,
        policy=policy,
        beta=beta,
        active_fraction=active_fraction,
        refresh_interval=refresh_interval,
        warmup_steps=warmup_steps,
    )
    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        batch = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[batch], y[batch]

        loss = F.cross_entropy(model(xb), yb)
        opt.zero_grad(set_to_none=True)
        loss.backward()

        # Swap true gradients for the approximation before Adam consumes them.
        diagnostics = builder.build_and_install_approx_grads(step)
        opt.step()

        # 97 is coprime with refresh_interval so logs hit sparse steps too.
        if step % 97 == 0 or step == steps - 1:
            record = {
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
            }
            record.update(diagnostics)
            history.append(record)

    return model, history
396
+
397
+
398
def avg_sparse_metric(history: List[Dict[str, float]], key: str) -> float:
    """Average *key* over logged rows that were sparse (non-refresh) steps.

    Rows with is_refresh != 0 are refresh steps and excluded. Rows that lack
    *key* entirely (e.g. baseline-style rows with no diagnostics) are also
    skipped instead of raising KeyError. Returns NaN when no row qualifies.
    """
    vals = [
        row[key]
        for row in history
        if row.get("is_refresh", 0.0) == 0.0 and key in row
    ]
    if not vals:
        return float("nan")
    return sum(vals) / len(vals)
403
+
404
+
405
def print_last(label: str, history: List[Dict[str, float]]) -> None:
    """Pretty-print every field of the final logged history row."""
    print(f"\n{label}")
    final_row = history[-1]
    for key, value in final_row.items():
        if isinstance(value, float):
            print(f" {key:28s}: {value:.4f}")
        else:
            print(f" {key:28s}: {value}")
412
+
413
+
414
def print_sparse_summary(label: str, history: List[Dict[str, float]]) -> None:
    """Print sparse-step metric averages plus the final accuracy and loss."""
    last = history[-1]
    print(f"\n{label} sparse-step averages from logged checkpoints")
    metric_keys = (
        "cosine_true_vs_approx",
        "pred_explained_fraction",
        "top20_residual_mass",
        "top20_gradient_mass",
    )
    for key in metric_keys:
        print(f" avg {key:24s}: {avg_sparse_metric(history, key):.4f}")
    print(f" final accuracy : {last['accuracy']:.4f}")
    print(f" final loss : {last['loss']:.4f}")
426
+
427
+
428
def print_checkpoints(label: str, history: List[Dict[str, float]]) -> None:
    """Print up to roughly 8 evenly spaced checkpoint rows from *history*."""
    print(f"\n{label} checkpoints:")
    stride = max(1, len(history) // 8)
    for row in history[::stride]:
        # Sparse-policy rows carry extra diagnostics; baseline rows do not.
        if "cosine_true_vs_approx" in row:
            extra = (
                f" refresh={int(row['is_refresh'])}"
                f" active={row['active_fraction']:.2f}"
                f" cos={row['cosine_true_vs_approx']:.3f}"
                f" pred_expl={row['pred_explained_fraction']:.3f}"
                f" top20_resid={row['top20_residual_mass']:.3f}"
                f" top20_grad={row['top20_gradient_mass']:.3f}"
            )
        else:
            extra = ""
        print(
            f"step={row['step']:4d} "
            f"loss={row['loss']:.4f} "
            f"acc={row['accuracy']:.3f}"
            f"{extra}"
        )
448
+
449
+
450
def main() -> None:
    """Compare full Adam against the three sparse block-selection policies."""
    X, y = make_spiral()

    baseline_model, baseline_hist = train_baseline(X, y)

    results = {}
    for policy in ["surprise", "magnitude", "random"]:
        _, hist = train_sparse_policy(
            X,
            y,
            policy=policy,
            active_fraction=0.2,
            refresh_interval=10,
            warmup_steps=100,
            beta=0.95,
        )
        results[policy] = hist

    print_last("Baseline full Adam", baseline_hist)
    for policy, hist in results.items():
        print_last(f"{policy.title()} Top-K simulated Adam", hist)

    print_checkpoints("Baseline", baseline_hist)
    for policy, hist in results.items():
        print_checkpoints(f"{policy.title()} Top-K", hist)
        print_sparse_summary(f"{policy.title()} Top-K", hist)

    print("\nHow to read this:")
    for hint in (
        " refresh=0 => a real sparse/approximate logged step",
        " cos near 1.0 => approximate update points like the true gradient",
        " pred_expl > 0 => predictor beats zero as a gradient guess",
        " top20_resid high => prediction error is heavy-tailed/concentrated",
        " top20_grad high => raw gradient mass is heavy-tailed/concentrated",
        " surprise > baselines => surprise is doing more than ordinary top-k/random",
    ):
        print(hint)
484
+
485
+
486
# Script entry point: run the experiment when executed directly.
if __name__ == "__main__":
    main()
experiments/surprise_topk_gradient_prototype-v4.py ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Surprise / Predicted-Magnitude Top-K Gradient Prototype, v4
3
+
4
+ Goal
5
+ ----
6
+ Test the practical version of the hypothesis:
7
+
8
+ The gradient/update signal is heavy-tailed, and the high-mass blocks are
9
+ predictable enough that we can choose where to spend backward/update compute.
10
+
11
+ What v4 adds
12
+ ------------
13
+ A new policy:
14
+
15
+ predicted_magnitude
16
+
17
+ This selects active blocks using only a historical EMA of block gradient norms,
18
+ not the current gradient. This is much closer to something that could eventually
19
+ save backward computation, because the active set is known before using the
20
+ current full gradient.
21
+
22
+ Policies compared
23
+ -----------------
24
+ 1. surprise
25
+ Select blocks whose current gradient is least predictable from EMA gradient.
26
+ This is mostly a diagnostic/oracle because it uses the current gradient.
27
+
28
+ 2. magnitude
29
+ Select blocks with largest current gradient norm.
30
+ This is an oracle upper-bound for simple top-k block sparsification.
31
+
32
+ 3. predicted_magnitude
33
+ Select blocks with largest EMA-predicted gradient norm.
34
+ This is the important practical test.
35
+
36
+ 4. random
37
+ Control baseline.
38
+
39
+ Important caveat
40
+ ----------------
41
+ This still computes the full gradient every step. That is intentional. We are
42
+ measuring whether the active-set prediction would have worked. Actual speedup
43
+ would require skipping or restricting backward computation for inactive blocks.
44
+
45
+ Run
46
+ ---
47
+ python3 surprise_topk_gradient_prototype.py
48
+ """
49
+
50
+ from __future__ import annotations
51
+
52
+ import math
53
+ import random
54
+ from dataclasses import dataclass
55
+ from typing import Dict, List, Literal, Tuple
56
+
57
+ import torch
58
+ import torch.nn as nn
59
+ import torch.nn.functional as F
60
+
61
+
62
+ SEED = 7
63
+ random.seed(SEED)
64
+ torch.manual_seed(SEED)
65
+
66
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
67
+ Policy = Literal["surprise", "magnitude", "predicted_magnitude", "random"]
68
+
69
+
70
+ # -----------------------------
71
+ # Toy data: 2-class spiral
72
+ # -----------------------------
73
+
74
def make_spiral(n_per_class: int = 1024, noise: float = 0.12) -> Tuple[torch.Tensor, torch.Tensor]:
    """Generate a shuffled two-arm spiral classification dataset.

    Returns (features, labels): features are (2 * n_per_class, 2) points,
    labels are the matching class ids (0 or 1); both are moved to DEVICE.
    """
    arms = []
    for class_id in range(2):
        radius = torch.linspace(0.0, 1.0, n_per_class)
        # Each arm winds two full turns; the second arm starts rotated by pi.
        angle = class_id * math.pi + radius * 4.0 * math.pi
        angle = angle + torch.randn(n_per_class) * noise
        points = torch.stack([radius * torch.sin(angle), radius * torch.cos(angle)], dim=1)
        labels = torch.full((n_per_class,), class_id, dtype=torch.long)
        arms.append((points, labels))

    X = torch.cat([points for points, _ in arms], dim=0)
    Y = torch.cat([labels for _, labels in arms], dim=0)
    X = 3.0 * X  # mild scale expansion helps the MLP

    order = torch.randperm(X.shape[0])
    return X[order].to(DEVICE), Y[order].to(DEVICE)
95
+
96
+
97
+ # -----------------------------
98
+ # Model
99
+ # -----------------------------
100
+
101
class TinyMLP(nn.Module):
    """Small fixed-depth ReLU MLP mapping 2-D inputs to 2 class logits."""

    def __init__(self, width: int = 128):
        super().__init__()
        # Assemble layers in the exact same order as a literal Sequential
        # so parameter-initialization RNG draws are unchanged.
        stack: List[nn.Module] = [nn.Linear(2, width), nn.ReLU()]
        stack += [nn.Linear(width, width), nn.ReLU()]
        stack += [nn.Linear(width, width), nn.ReLU()]
        stack.append(nn.Linear(width, 2))
        self.net = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits of shape (batch, 2)."""
        return self.net(x)
116
+
117
+
118
def linear_layers(model: nn.Module) -> List[nn.Linear]:
    """Collect every nn.Linear submodule of *model* in traversal order."""
    found: List[nn.Linear] = []
    for module in model.modules():
        if isinstance(module, nn.Linear):
            found.append(module)
    return found
120
+
121
+
122
+ # -----------------------------
123
+ # Sparse gradient machinery
124
+ # -----------------------------
125
+
126
@dataclass(frozen=True)
class BlockRef:
    """Immutable reference to one sparsity "block".

    A block is a single output row of the layer_index-th nn.Linear's weight
    matrix (plus, implicitly, that row's bias element).
    """

    # Index into the list returned by linear_layers(model).
    layer_index: int
    # Output-row index within that layer's weight matrix.
    row_index: int
130
+
131
+
132
class SparseGradientBuilder:
    """
    Builds approximate gradients after a full backward pass.

    Block = one output row of a Linear weight matrix, plus its bias element.

    Approx gradient:
        active blocks   -> true gradient
        inactive blocks -> EMA-predicted gradient

    Selection policies:
        surprise            -> largest current residual: ||g - pred_g||
        magnitude           -> largest current gradient norm: ||g||
        predicted_magnitude -> largest historical EMA gradient norm
        random              -> random active blocks
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy = "predicted_magnitude",
        grad_beta: float = 0.95,
        mass_beta: float = 0.95,
        active_fraction: float = 0.2,
        refresh_interval: int = 10,
        warmup_steps: int = 100,
        eps: float = 1e-12,
    ):
        # grad_beta: EMA decay for the per-block predicted gradient tensors.
        # mass_beta: EMA decay for the per-block gradient-norm tracker used
        #            by the predicted_magnitude policy.
        self.model = model
        self.layers = linear_layers(model)
        self.policy = policy
        self.grad_beta = grad_beta
        self.mass_beta = mass_beta
        self.active_fraction = active_fraction
        self.refresh_interval = refresh_interval
        self.warmup_steps = warmup_steps
        self.eps = eps

        # Enumerate every (layer, output-row) pair as one selectable block.
        self.blocks: List[BlockRef] = []
        for li, layer in enumerate(self.layers):
            for row in range(layer.weight.shape[0]):
                self.blocks.append(BlockRef(li, row))

        # EMA predictors of the full weight/bias gradients, keyed by layer index.
        self.pred_w: Dict[int, torch.Tensor] = {}
        self.pred_b: Dict[int, torch.Tensor] = {}

        for li, layer in enumerate(self.layers):
            self.pred_w[li] = torch.zeros_like(layer.weight.data)
            if layer.bias is not None:
                self.pred_b[li] = torch.zeros_like(layer.bias.data)

        # Per-block scalar trackers, initialized to ones so that every block
        # is a plausible candidate before any gradients have been observed.
        n = len(self.blocks)
        self.current_gradient_mass = torch.ones(n, device=DEVICE)
        self.current_residual_mass = torch.ones(n, device=DEVICE)
        self.predicted_gradient_mass = torch.ones(n, device=DEVICE)
        # NOTE(review): selection_score is initialized here but never read
        # or updated elsewhere in this class — confirm before removing.
        self.selection_score = torch.ones(n, device=DEVICE)

        # Active-set stability diagnostic.
        self.prev_active = torch.zeros(n, dtype=torch.bool, device=DEVICE)

    def _is_refresh_step(self, step: int) -> bool:
        """True when this step uses the full (dense) gradient for all blocks."""
        return step < self.warmup_steps or step % self.refresh_interval == 0

    def _choose_active_blocks(self, step: int) -> torch.Tensor:
        """Return a boolean mask over blocks selecting this step's active set.

        On refresh/warmup steps every block is active; otherwise the top-k
        blocks (k = active_fraction * n) under the configured policy.
        """
        n = len(self.blocks)

        if self._is_refresh_step(step):
            return torch.ones(n, dtype=torch.bool, device=DEVICE)

        k = max(1, int(self.active_fraction * n))
        active = torch.zeros(n, dtype=torch.bool, device=DEVICE)

        if self.policy == "surprise":
            # Uses last step's residual score when choosing before the current
            # gradient is processed. After full gradient is computed below, this
            # becomes an oracle-ish diagnostic of residual concentration.
            idx = torch.topk(self.current_residual_mass, k=k).indices
        elif self.policy == "magnitude":
            # Uses last observed current_gradient_mass from previous step, not the
            # current one, at selection time. Still a strong baseline.
            idx = torch.topk(self.current_gradient_mass, k=k).indices
        elif self.policy == "predicted_magnitude":
            # This is the important practical policy: choose from historical EMA.
            idx = torch.topk(self.predicted_gradient_mass, k=k).indices
        elif self.policy == "random":
            idx = torch.randperm(n, device=DEVICE)[:k]
        else:
            raise ValueError(f"Unknown policy: {self.policy}")

        active[idx] = True
        return active

    @staticmethod
    def _topk_mask(values: torch.Tensor, fraction: float) -> torch.Tensor:
        """Boolean mask marking the top `fraction` of entries in *values*."""
        n = values.numel()
        k = max(1, int(fraction * n))
        mask = torch.zeros(n, dtype=torch.bool, device=values.device)
        mask[torch.topk(values, k=k).indices] = True
        return mask

    @staticmethod
    def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
        """Jaccard similarity |a & b| / |a | b| of two boolean masks (0.0 if both empty)."""
        inter = (a & b).sum().float()
        union = (a | b).sum().float()
        return float((inter / torch.clamp(union, min=1.0)).item())

    @torch.no_grad()
    def build_and_install_approx_grads(self, step: int) -> Dict[str, float]:
        """Replace .grad on every Linear with the block-sparse approximation.

        Must be called after loss.backward() and before optimizer.step().
        Active blocks keep their true gradient; inactive blocks receive the
        EMA-predicted gradient. Also updates the EMA predictors and mass
        trackers, and returns a dict of diagnostics for logging.
        """
        active = self._choose_active_blocks(step)
        is_refresh = self._is_refresh_step(step)

        # Flattened per-block vectors, accumulated for global diagnostics.
        true_parts = []
        approx_parts = []
        pred_parts = []

        # Fresh gradient buffers to be installed at the end.
        approx_w: Dict[int, torch.Tensor] = {}
        approx_b: Dict[int, torch.Tensor] = {}
        for li, layer in enumerate(self.layers):
            approx_w[li] = torch.zeros_like(layer.weight.grad)
            if layer.bias is not None:
                approx_b[li] = torch.zeros_like(layer.bias.grad)

        # Temporary arrays for current-step measurements.
        new_gradient_mass = torch.zeros_like(self.current_gradient_mass)
        new_residual_mass = torch.zeros_like(self.current_residual_mass)

        for block_id, block in enumerate(self.blocks):
            li = block.layer_index
            row = block.row_index
            layer = self.layers[li]
            is_active = bool(active[block_id].item())

            # True and EMA-predicted gradients for this block's weight row.
            g_w = layer.weight.grad[row].detach().clone()
            p_w = self.pred_w[li][row].detach().clone()

            if layer.bias is not None:
                g_b = layer.bias.grad[row].detach().clone()
                p_b = self.pred_b[li][row].detach().clone()
            else:
                g_b = None
                p_b = None

            # Flatten weight row + bias element into one block vector.
            true_vec_items = [g_w.flatten()]
            pred_vec_items = [p_w.flatten()]
            if g_b is not None:
                true_vec_items.append(g_b.view(1))
                pred_vec_items.append(p_b.view(1))

            true_vec_block = torch.cat(true_vec_items)
            pred_vec_block = torch.cat(pred_vec_items)
            residual_vec = true_vec_block - pred_vec_block

            grad_norm = torch.norm(true_vec_block)
            residual_norm = torch.norm(residual_vec)

            new_gradient_mass[block_id] = grad_norm
            new_residual_mass[block_id] = residual_norm

            # Active blocks pass through the true gradient; inactive blocks
            # fall back to the EMA prediction.
            if is_active:
                a_w = g_w
                a_b = g_b
            else:
                a_w = p_w
                a_b = p_b

            approx_w[li][row] = a_w
            if layer.bias is not None:
                approx_b[li][row] = a_b

            # Prototype choice: update predictors from true gradient because the
            # full gradient is available for measurement. A speedup version would
            # update fully only on refresh steps and partially on active blocks.
            self.pred_w[li][row].mul_(self.grad_beta).add_(g_w, alpha=1.0 - self.grad_beta)
            if layer.bias is not None:
                self.pred_b[li][row].mul_(self.grad_beta).add_(g_b, alpha=1.0 - self.grad_beta)

            approx_vec_items = [a_w.flatten()]
            if a_b is not None:
                approx_vec_items.append(a_b.view(1))

            true_parts.append(true_vec_block)
            pred_parts.append(pred_vec_block)
            approx_parts.append(torch.cat(approx_vec_items))

        # Install approximate gradients for Adam.
        for li, layer in enumerate(self.layers):
            layer.weight.grad.copy_(approx_w[li])
            if layer.bias is not None:
                layer.bias.grad.copy_(approx_b[li])

        true_vec = torch.cat(true_parts)
        pred_vec = torch.cat(pred_parts)
        approx_vec = torch.cat(approx_parts)

        true_norm = torch.norm(true_vec)
        pred_error_norm = torch.norm(true_vec - pred_vec)

        # Global alignment of the installed gradient with the true gradient,
        # and the fraction of true-gradient energy explained by the predictor.
        cosine = F.cosine_similarity(true_vec, approx_vec, dim=0).item()
        pred_explained = 1.0 - (
            pred_error_norm.pow(2) / (true_norm.pow(2) + self.eps)
        ).item()

        # Oracle masks for diagnostics after seeing the true current gradient.
        oracle_magnitude_mask = self._topk_mask(new_gradient_mass, self.active_fraction)
        oracle_residual_mask = self._topk_mask(new_residual_mass, self.active_fraction)
        predicted_magnitude_mask = self._topk_mask(self.predicted_gradient_mass, self.active_fraction)

        active_vs_oracle_mag = self._jaccard(active, oracle_magnitude_mask)
        active_vs_oracle_resid = self._jaccard(active, oracle_residual_mask)
        predmag_vs_oracle_mag = self._jaccard(predicted_magnitude_mask, oracle_magnitude_mask)
        active_stability = self._jaccard(active, self.prev_active)

        self.prev_active = active.clone()

        # Concentration diagnostics: share of total mass held by the top 20%.
        k20 = max(1, int(0.2 * len(self.blocks)))

        sorted_residual = torch.sort(new_residual_mass.detach(), descending=True).values
        top20_residual_mass = (sorted_residual[:k20].sum() / (sorted_residual.sum() + self.eps)).item()

        sorted_gradient = torch.sort(new_gradient_mass.detach(), descending=True).values
        top20_gradient_mass = (sorted_gradient[:k20].sum() / (sorted_gradient.sum() + self.eps)).item()

        # Update mass trackers AFTER diagnostics, so predicted_magnitude really
        # uses only history at selection time.
        self.current_gradient_mass = new_gradient_mass
        self.current_residual_mass = new_residual_mass
        self.predicted_gradient_mass.mul_(self.mass_beta).add_(new_gradient_mass, alpha=1.0 - self.mass_beta)

        return {
            "is_refresh": float(is_refresh),
            "active_fraction": float(active.float().mean().item()),
            "cosine_true_vs_approx": cosine,
            "pred_explained_fraction": pred_explained,
            "top20_residual_mass": top20_residual_mass,
            "top20_gradient_mass": top20_gradient_mass,
            "active_vs_oracle_mag": active_vs_oracle_mag,
            "active_vs_oracle_resid": active_vs_oracle_resid,
            "predmag_vs_oracle_mag": predmag_vs_oracle_mag,
            "active_stability": active_stability,
            "true_grad_norm": float(true_norm.item()),
            "pred_error_norm": float(pred_error_norm.item()),
        }
374
+
375
+
376
+ # -----------------------------
377
+ # Metrics and training
378
+ # -----------------------------
379
+
380
def accuracy(model: nn.Module, X: torch.Tensor, y: torch.Tensor) -> float:
    """Fraction of rows of X whose argmax class prediction equals y."""
    model.eval()
    with torch.no_grad():
        predicted = model(X).argmax(dim=1)
    hits = predicted == y
    return hits.float().mean().item()
385
+
386
+
387
def train_baseline(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a fresh TinyMLP with dense Adam; return (model, logged history).

    History rows carry step, batch loss, and full-dataset accuracy, logged
    every 97 steps (a prime stride, to avoid aliasing with other periods)
    plus the final step.
    """
    model = TinyMLP().to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        batch_idx = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        loss = F.cross_entropy(model(X[batch_idx]), y[batch_idx])

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if step % 97 == 0 or step == steps - 1:
            history.append(
                {
                    "step": step,
                    "loss": float(loss.item()),
                    "accuracy": accuracy(model, X, y),
                }
            )

    return model, history
417
+
418
+
419
def train_sparse_policy(
    X: torch.Tensor,
    y: torch.Tensor,
    policy: Policy,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
    active_fraction: float = 0.2,
    refresh_interval: int = 10,
    warmup_steps: int = 100,
    grad_beta: float = 0.95,
    mass_beta: float = 0.95,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a fresh TinyMLP with Adam on block-sparse approximate gradients.

    Identical to train_baseline except that, after each full backward pass,
    SparseGradientBuilder overwrites .grad with the policy's block-sparse
    approximation before the optimizer step. Returns (model, history) where
    history rows also include the builder's diagnostics.
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    builder = SparseGradientBuilder(
        model,
        policy=policy,
        grad_beta=grad_beta,
        mass_beta=mass_beta,
        active_fraction=active_fraction,
        refresh_interval=refresh_interval,
        warmup_steps=warmup_steps,
    )

    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        idx = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[idx], y[idx]

        loss = F.cross_entropy(model(xb), yb)

        opt.zero_grad(set_to_none=True)
        loss.backward()

        # Full gradient is computed above; the builder then swaps in the
        # block-sparse approximation and records diagnostics.
        diagnostics = builder.build_and_install_approx_grads(step)
        opt.step()

        # Same logging cadence as train_baseline (every 97 steps + last step).
        if step % 97 == 0 or step == steps - 1:
            history.append({
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
                **diagnostics,
            })

    return model, history
468
+
469
+
470
def sparse_rows(history: List[Dict[str, float]]) -> List[Dict[str, float]]:
    """Keep only history rows logged on non-refresh (sparse) steps.

    Rows without an "is_refresh" key are treated as sparse.
    """
    kept: List[Dict[str, float]] = []
    for row in history:
        if row.get("is_refresh", 0.0) == 0.0:
            kept.append(row)
    return kept
472
+
473
+
474
def avg_sparse_metric(history: List[Dict[str, float]], key: str) -> float:
    """Mean of *key* over sparse-step rows; NaN when no sparse rows exist."""
    values = [row[key] for row in sparse_rows(history)]
    if not values:
        return float("nan")
    return sum(values) / len(values)
480
+
481
+
482
def print_last(label: str, history: List[Dict[str, float]]) -> None:
    """Print every field of the final history row, floats at 4 decimals."""
    print(f"\n{label}")
    final_row = history[-1]
    for key, value in final_row.items():
        if isinstance(value, float):
            print(f" {key:28s}: {value:.4f}")
        else:
            print(f" {key:28s}: {value}")
489
+
490
+
491
def print_sparse_summary(label: str, history: List[Dict[str, float]]) -> None:
    """Print sparse-step averages of key diagnostics plus final loss/accuracy."""
    last = history[-1]
    print(f"\n{label} sparse-step averages from logged checkpoints")
    metric_keys = (
        "cosine_true_vs_approx",
        "pred_explained_fraction",
        "top20_residual_mass",
        "top20_gradient_mass",
        "active_vs_oracle_mag",
        "active_vs_oracle_resid",
        "predmag_vs_oracle_mag",
        "active_stability",
    )
    for key in metric_keys:
        print(f" avg {key:24s}: {avg_sparse_metric(history, key):.4f}")
    print(f" final accuracy : {last['accuracy']:.4f}")
    print(f" final loss : {last['loss']:.4f}")
507
+
508
+
509
def print_checkpoints(label: str, history: List[Dict[str, float]]) -> None:
    """Print up to ~8 evenly spaced checkpoint rows from *history*.

    Rows carrying sparse diagnostics get an extended suffix with alignment
    and stability metrics.
    """
    print(f"\n{label} checkpoints:")
    stride = max(1, len(history) // 8)
    for row in history[::stride]:
        suffix = ""
        if "cosine_true_vs_approx" in row:
            suffix = (
                f" refresh={int(row['is_refresh'])}"
                f" active={row['active_fraction']:.2f}"
                f" cos={row['cosine_true_vs_approx']:.3f}"
                f" pred_expl={row['pred_explained_fraction']:.3f}"
                f" top20_grad={row['top20_gradient_mass']:.3f}"
                f" jacc_mag={row['active_vs_oracle_mag']:.3f}"
                f" stable={row['active_stability']:.3f}"
            )
        line = (
            f"step={row['step']:4d} "
            f"loss={row['loss']:.4f} "
            f"acc={row['accuracy']:.3f}"
            f"{suffix}"
        )
        print(line)
530
+
531
+
532
def main() -> None:
    """Run the dense baseline and every sparse policy, then print reports.

    All runs share one spiral dataset; each policy trains its own fresh
    TinyMLP via train_sparse_policy with identical hyperparameters.
    """
    X, y = make_spiral()

    baseline_model, baseline_hist = train_baseline(X, y)

    results = {}
    for policy in ["surprise", "magnitude", "predicted_magnitude", "random"]:
        _, hist = train_sparse_policy(
            X,
            y,
            policy=policy,
            active_fraction=0.2,
            refresh_interval=10,
            warmup_steps=100,
            grad_beta=0.95,
            mass_beta=0.95,
        )
        results[policy] = hist

    # Final-row dumps for the baseline and each policy.
    print_last("Baseline full Adam", baseline_hist)
    for policy, hist in results.items():
        print_last(f"{policy.title().replace('_', ' ')} Top-K simulated Adam", hist)

    # Checkpoint tables and sparse-step averages.
    print_checkpoints("Baseline", baseline_hist)
    for policy, hist in results.items():
        print_checkpoints(f"{policy.title().replace('_', ' ')} Top-K", hist)
        print_sparse_summary(f"{policy.title().replace('_', ' ')} Top-K", hist)

    print("\nHow to read this:")
    print(" predicted_magnitude => the practical policy to watch")
    print(" magnitude => oracle-ish upper bound using recent/current mass")
    print(" cos near 1.0 => approximate update points like the true gradient")
    print(" top20_grad high => raw gradient mass is heavy-tailed/concentrated")
    print(" jacc_mag high => selected blocks match current oracle top-k blocks")
    print(" stable high => active set is stable over time")
    print(" pred_mag close to mag => historical mass is enough to select useful blocks")


if __name__ == "__main__":
    main()
experiments/surprise_topk_gradient_prototype-v5.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Predicted-Magnitude Sparse Gradient Prototype, v5
3
+
4
+ Goal
5
+ ----
6
+ Test the practical version of the hypothesis more harshly:
7
+
8
+ The gradient/update signal is heavy-tailed, and the high-mass blocks are
9
+ predictable enough that we can update only those blocks most of the time.
10
+
11
+ v5 adds
12
+ -------
13
+ 1. inactive_mode="ema"
14
+ Inactive blocks receive the EMA-predicted gradient.
15
+
16
+ 2. inactive_mode="zero"
17
+ Inactive blocks receive zero gradient. This is the stricter and more
18
+ compute-relevant test.
19
+
20
+ 3. active_fraction sweep
21
+ Tests 20%, 10%, 5%, and 2% active blocks.
22
+
23
+ 4. focused policy comparison
24
+ - predicted_magnitude: practical policy, chooses active blocks from history
25
+ - magnitude: oracle-ish policy, chooses from recently observed mass
26
+ - random: control
27
+
28
+ Important caveat
29
+ ----------------
30
+ This still computes the full gradient every step. That is intentional. We are
31
+ measuring whether the selected active set would have preserved useful learning.
32
+ Actual speedup would require restricting/skipping backward computation for
33
+ inactive blocks with custom structured backward logic.
34
+
35
+ Run
36
+ ---
37
+     python3 surprise_topk_gradient_prototype-v5.py
38
+ """
39
+
40
+ from __future__ import annotations
41
+
42
+ import math
43
+ import random
44
+ from dataclasses import dataclass
45
+ from typing import Dict, List, Literal, Tuple
46
+
47
+ import torch
48
+ import torch.nn as nn
49
+ import torch.nn.functional as F
50
+
51
+
52
+ SEED = 7
53
+ random.seed(SEED)
54
+ torch.manual_seed(SEED)
55
+
56
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
57
+ Policy = Literal["magnitude", "predicted_magnitude", "random"]
58
+ InactiveMode = Literal["ema", "zero"]
59
+
60
+
61
+ # -----------------------------
62
+ # Toy data: 2-class spiral
63
+ # -----------------------------
64
+
65
def make_spiral(n_per_class: int = 1024, noise: float = 0.12) -> Tuple[torch.Tensor, torch.Tensor]:
    """Build a shuffled two-class spiral dataset, scaled by 3x, on DEVICE.

    Each class traces two full turns (4*pi) offset by pi, with Gaussian
    angular noise. Returns (X, Y) with X of shape (2*n_per_class, 2) and
    Y the integer class labels.
    """
    features: List[torch.Tensor] = []
    labels: List[torch.Tensor] = []

    for class_id in range(2):
        radius = torch.linspace(0.0, 1.0, n_per_class)
        angle = class_id * math.pi + radius * 4.0 * math.pi
        # Angular jitter keeps the two arms from being perfectly separable.
        angle = angle + torch.randn(n_per_class) * noise

        points = torch.stack([radius * torch.sin(angle), radius * torch.cos(angle)], dim=1)
        features.append(points)
        labels.append(torch.full((n_per_class,), class_id, dtype=torch.long))

    X = 3.0 * torch.cat(features, dim=0)
    Y = torch.cat(labels, dim=0)

    # Shuffle once so mini-batch sampling sees mixed classes.
    order = torch.randperm(X.shape[0])
    return X[order].to(DEVICE), Y[order].to(DEVICE)
86
+
87
+
88
+ # -----------------------------
89
+ # Model
90
+ # -----------------------------
91
+
92
class TinyMLP(nn.Module):
    """Small fixed-depth ReLU MLP mapping 2-D inputs to 2 class logits."""

    def __init__(self, width: int = 128):
        super().__init__()
        # Assemble layers in the exact same order as a literal Sequential
        # so parameter-initialization RNG draws are unchanged.
        stack: List[nn.Module] = [nn.Linear(2, width), nn.ReLU()]
        stack += [nn.Linear(width, width), nn.ReLU()]
        stack += [nn.Linear(width, width), nn.ReLU()]
        stack.append(nn.Linear(width, 2))
        self.net = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw logits of shape (batch, 2)."""
        return self.net(x)
107
+
108
+
109
def linear_layers(model: nn.Module) -> List[nn.Linear]:
    """Collect every nn.Linear submodule of *model* in traversal order."""
    found: List[nn.Linear] = []
    for module in model.modules():
        if isinstance(module, nn.Linear):
            found.append(module)
    return found
111
+
112
+
113
+ # -----------------------------
114
+ # Sparse gradient machinery
115
+ # -----------------------------
116
+
117
@dataclass(frozen=True)
class BlockRef:
    """Immutable reference to one sparsity "block".

    A block is a single output row of the layer_index-th nn.Linear's weight
    matrix (plus, implicitly, that row's bias element).
    """

    # Index into the list returned by linear_layers(model).
    layer_index: int
    # Output-row index within that layer's weight matrix.
    row_index: int
121
+
122
+
123
class SparseGradientBuilder:
    """
    Builds approximate gradients after a full backward pass.

    Block = one output row of a Linear weight matrix, plus its bias element.

    Approx gradient:
        active blocks   -> true gradient
        inactive blocks -> either EMA-predicted gradient or zero gradient

    Selection policies:
        magnitude           -> largest recently observed gradient norm
        predicted_magnitude -> largest historical EMA gradient norm
        random              -> random active blocks
    """

    def __init__(
        self,
        model: nn.Module,
        policy: Policy = "predicted_magnitude",
        inactive_mode: InactiveMode = "zero",
        grad_beta: float = 0.95,
        mass_beta: float = 0.95,
        active_fraction: float = 0.2,
        refresh_interval: int = 10,
        warmup_steps: int = 100,
        eps: float = 1e-12,
    ):
        # inactive_mode: what inactive blocks receive — "ema" (predicted
        # gradient) or "zero" (the stricter, compute-relevant test).
        # grad_beta / mass_beta: EMA decays for the gradient predictors and
        # the per-block gradient-norm tracker, respectively.
        self.model = model
        self.layers = linear_layers(model)
        self.policy = policy
        self.inactive_mode = inactive_mode
        self.grad_beta = grad_beta
        self.mass_beta = mass_beta
        self.active_fraction = active_fraction
        self.refresh_interval = refresh_interval
        self.warmup_steps = warmup_steps
        self.eps = eps

        # Enumerate every (layer, output-row) pair as one selectable block.
        self.blocks: List[BlockRef] = []
        for li, layer in enumerate(self.layers):
            for row in range(layer.weight.shape[0]):
                self.blocks.append(BlockRef(li, row))

        # EMA predictors of the full weight/bias gradients, keyed by layer index.
        self.pred_w: Dict[int, torch.Tensor] = {}
        self.pred_b: Dict[int, torch.Tensor] = {}

        for li, layer in enumerate(self.layers):
            self.pred_w[li] = torch.zeros_like(layer.weight.data)
            if layer.bias is not None:
                self.pred_b[li] = torch.zeros_like(layer.bias.data)

        # Per-block scalar trackers, initialized to ones so every block is a
        # plausible candidate before any gradients have been observed.
        n = len(self.blocks)
        self.current_gradient_mass = torch.ones(n, device=DEVICE)
        self.predicted_gradient_mass = torch.ones(n, device=DEVICE)
        # Previous active set, kept for the stability diagnostic.
        self.prev_active = torch.zeros(n, dtype=torch.bool, device=DEVICE)

    def _is_refresh_step(self, step: int) -> bool:
        """True when this step uses the full (dense) gradient for all blocks."""
        return step < self.warmup_steps or step % self.refresh_interval == 0

    def _choose_active_blocks(self, step: int) -> torch.Tensor:
        """Return a boolean mask over blocks selecting this step's active set.

        On refresh/warmup steps every block is active; otherwise the top-k
        blocks (k = active_fraction * n) under the configured policy.
        """
        n = len(self.blocks)

        if self._is_refresh_step(step):
            return torch.ones(n, dtype=torch.bool, device=DEVICE)

        k = max(1, int(self.active_fraction * n))
        active = torch.zeros(n, dtype=torch.bool, device=DEVICE)

        if self.policy == "magnitude":
            # Uses the previous step's observed gradient mass, not the current one.
            idx = torch.topk(self.current_gradient_mass, k=k).indices
        elif self.policy == "predicted_magnitude":
            # Practical policy: uses historical EMA only.
            idx = torch.topk(self.predicted_gradient_mass, k=k).indices
        elif self.policy == "random":
            idx = torch.randperm(n, device=DEVICE)[:k]
        else:
            raise ValueError(f"Unknown policy: {self.policy}")

        active[idx] = True
        return active

    @staticmethod
    def _topk_mask(values: torch.Tensor, fraction: float) -> torch.Tensor:
        """Boolean mask marking the top `fraction` of entries in *values*."""
        n = values.numel()
        k = max(1, int(fraction * n))
        mask = torch.zeros(n, dtype=torch.bool, device=values.device)
        mask[torch.topk(values, k=k).indices] = True
        return mask

    @staticmethod
    def _jaccard(a: torch.Tensor, b: torch.Tensor) -> float:
        """Jaccard similarity |a & b| / |a | b| of two boolean masks (0.0 if both empty)."""
        inter = (a & b).sum().float()
        union = (a | b).sum().float()
        return float((inter / torch.clamp(union, min=1.0)).item())

    @torch.no_grad()
    def build_and_install_approx_grads(self, step: int) -> Dict[str, float]:
        """Replace .grad on every Linear with the block-sparse approximation.

        Must be called after loss.backward() and before optimizer.step().
        Active blocks keep their true gradient; inactive blocks receive the
        EMA prediction or zero, depending on inactive_mode. Also updates the
        EMA predictors and mass trackers, and returns logging diagnostics.
        """
        active = self._choose_active_blocks(step)
        is_refresh = self._is_refresh_step(step)

        # Flattened per-block vectors, accumulated for global diagnostics.
        true_parts = []
        approx_parts = []
        pred_parts = []

        # Fresh gradient buffers to be installed at the end.
        approx_w: Dict[int, torch.Tensor] = {}
        approx_b: Dict[int, torch.Tensor] = {}
        for li, layer in enumerate(self.layers):
            approx_w[li] = torch.zeros_like(layer.weight.grad)
            if layer.bias is not None:
                approx_b[li] = torch.zeros_like(layer.bias.grad)

        new_gradient_mass = torch.zeros_like(self.current_gradient_mass)

        for block_id, block in enumerate(self.blocks):
            li = block.layer_index
            row = block.row_index
            layer = self.layers[li]
            is_active = bool(active[block_id].item())

            # True and EMA-predicted gradients for this block's weight row.
            g_w = layer.weight.grad[row].detach().clone()
            p_w = self.pred_w[li][row].detach().clone()

            if layer.bias is not None:
                g_b = layer.bias.grad[row].detach().clone()
                p_b = self.pred_b[li][row].detach().clone()
            else:
                g_b = None
                p_b = None

            # Flatten weight row + bias element into one block vector.
            true_vec_items = [g_w.flatten()]
            pred_vec_items = [p_w.flatten()]
            if g_b is not None:
                true_vec_items.append(g_b.view(1))
                pred_vec_items.append(p_b.view(1))

            true_vec_block = torch.cat(true_vec_items)
            pred_vec_block = torch.cat(pred_vec_items)
            grad_norm = torch.norm(true_vec_block)
            new_gradient_mass[block_id] = grad_norm

            # Active blocks pass through the true gradient; inactive blocks
            # fall back per inactive_mode.
            if is_active:
                a_w = g_w
                a_b = g_b
            else:
                if self.inactive_mode == "ema":
                    a_w = p_w
                    a_b = p_b
                elif self.inactive_mode == "zero":
                    a_w = torch.zeros_like(g_w)
                    a_b = torch.zeros_like(g_b) if g_b is not None else None
                else:
                    raise ValueError(f"Unknown inactive_mode: {self.inactive_mode}")

            approx_w[li][row] = a_w
            if layer.bias is not None:
                approx_b[li][row] = a_b

            # Prototype choice: update predictors from true gradient because the
            # full gradient is available for measurement. A speedup version would
            # update predictors fully only on refresh steps and partially on active blocks.
            self.pred_w[li][row].mul_(self.grad_beta).add_(g_w, alpha=1.0 - self.grad_beta)
            if layer.bias is not None:
                self.pred_b[li][row].mul_(self.grad_beta).add_(g_b, alpha=1.0 - self.grad_beta)

            approx_vec_items = [a_w.flatten()]
            if a_b is not None:
                approx_vec_items.append(a_b.view(1))

            true_parts.append(true_vec_block)
            pred_parts.append(pred_vec_block)
            approx_parts.append(torch.cat(approx_vec_items))

        # Install approximate gradients for Adam.
        for li, layer in enumerate(self.layers):
            layer.weight.grad.copy_(approx_w[li])
            if layer.bias is not None:
                layer.bias.grad.copy_(approx_b[li])

        true_vec = torch.cat(true_parts)
        pred_vec = torch.cat(pred_parts)
        approx_vec = torch.cat(approx_parts)

        true_norm = torch.norm(true_vec)
        pred_error_norm = torch.norm(true_vec - pred_vec)

        # Global alignment / scale of the installed gradient vs the true one,
        # and the fraction of true-gradient energy explained by the predictor.
        cosine = F.cosine_similarity(true_vec, approx_vec, dim=0).item()
        approx_norm_ratio = float((torch.norm(approx_vec) / (true_norm + self.eps)).item())
        pred_explained = 1.0 - (
            pred_error_norm.pow(2) / (true_norm.pow(2) + self.eps)
        ).item()

        oracle_magnitude_mask = self._topk_mask(new_gradient_mass, self.active_fraction)
        predicted_magnitude_mask = self._topk_mask(self.predicted_gradient_mass, self.active_fraction)

        active_vs_oracle_mag = self._jaccard(active, oracle_magnitude_mask)
        predmag_vs_oracle_mag = self._jaccard(predicted_magnitude_mask, oracle_magnitude_mask)
        active_stability = self._jaccard(active, self.prev_active)
        self.prev_active = active.clone()

        # Concentration diagnostic: share of total mass held by the top 20%.
        k20 = max(1, int(0.2 * len(self.blocks)))
        sorted_gradient = torch.sort(new_gradient_mass.detach(), descending=True).values
        top20_gradient_mass = (sorted_gradient[:k20].sum() / (sorted_gradient.sum() + self.eps)).item()

        # Update mass trackers AFTER diagnostics, so predicted_magnitude really
        # uses only history at selection time.
        self.current_gradient_mass = new_gradient_mass
        self.predicted_gradient_mass.mul_(self.mass_beta).add_(new_gradient_mass, alpha=1.0 - self.mass_beta)

        return {
            "is_refresh": float(is_refresh),
            "active_fraction": float(active.float().mean().item()),
            "cosine_true_vs_approx": cosine,
            "approx_norm_ratio": approx_norm_ratio,
            "pred_explained_fraction": pred_explained,
            "top20_gradient_mass": top20_gradient_mass,
            "active_vs_oracle_mag": active_vs_oracle_mag,
            "predmag_vs_oracle_mag": predmag_vs_oracle_mag,
            "active_stability": active_stability,
            "true_grad_norm": float(true_norm.item()),
            "pred_error_norm": float(pred_error_norm.item()),
        }
346
+
347
+
348
+ # -----------------------------
349
+ # Metrics and training
350
+ # -----------------------------
351
+
352
def accuracy(model: nn.Module, X: torch.Tensor, y: torch.Tensor) -> float:
    """Fraction of rows of X whose argmax class prediction equals y."""
    model.eval()
    with torch.no_grad():
        predicted = model(X).argmax(dim=1)
    hits = predicted == y
    return hits.float().mean().item()
357
+
358
+
359
def train_baseline(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a fresh TinyMLP with dense Adam; return (model, logged history).

    History rows carry step, batch loss, and full-dataset accuracy, logged
    every 97 steps (a prime stride, to avoid aliasing) plus the final step.
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        # Sample a random mini-batch with replacement.
        idx = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[idx], y[idx]

        loss = F.cross_entropy(model(xb), yb)

        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

        if step % 97 == 0 or step == steps - 1:
            history.append({
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
            })

    return model, history
389
+
390
+
391
def train_sparse_policy(
    X: torch.Tensor,
    y: torch.Tensor,
    policy: Policy,
    inactive_mode: InactiveMode,
    active_fraction: float,
    steps: int = 2000,
    batch_size: int = 256,
    lr: float = 1e-3,
    refresh_interval: int = 10,
    warmup_steps: int = 100,
    grad_beta: float = 0.95,
    mass_beta: float = 0.95,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a fresh TinyMLP with Adam on block-sparse approximate gradients.

    Identical to train_baseline except that, after each full backward pass,
    SparseGradientBuilder overwrites .grad with the block-sparse
    approximation (inactive blocks get EMA or zero per inactive_mode) before
    the optimizer step. History rows also include builder diagnostics.
    """
    model = TinyMLP().to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    builder = SparseGradientBuilder(
        model,
        policy=policy,
        inactive_mode=inactive_mode,
        grad_beta=grad_beta,
        mass_beta=mass_beta,
        active_fraction=active_fraction,
        refresh_interval=refresh_interval,
        warmup_steps=warmup_steps,
    )

    history: List[Dict[str, float]] = []

    for step in range(steps):
        model.train()
        idx = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        xb, yb = X[idx], y[idx]

        loss = F.cross_entropy(model(xb), yb)

        opt.zero_grad(set_to_none=True)
        loss.backward()

        # Full gradient is computed above; the builder then swaps in the
        # block-sparse approximation and records diagnostics.
        diagnostics = builder.build_and_install_approx_grads(step)
        opt.step()

        # Same logging cadence as train_baseline (every 97 steps + last step).
        if step % 97 == 0 or step == steps - 1:
            history.append({
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
                **diagnostics,
            })

    return model, history
442
+
443
+
444
def sparse_rows(history: List[Dict[str, float]]) -> List[Dict[str, float]]:
    """Keep only rows logged on sparse (non-refresh) steps.

    Rows without an "is_refresh" key are treated as sparse.
    """
    kept: List[Dict[str, float]] = []
    for row in history:
        if row.get("is_refresh", 0.0) == 0.0:
            kept.append(row)
    return kept
446
+
447
+
448
def avg_sparse_metric(history: List[Dict[str, float]], key: str) -> float:
    """Mean of `key` over sparse (non-refresh) rows; NaN when there are none."""
    values = [row[key] for row in history if row.get("is_refresh", 0.0) == 0.0]
    if not values:
        return float("nan")
    return sum(values) / len(values)
454
+
455
+
456
def summarize_sparse_run(
    policy: Policy,
    inactive_mode: InactiveMode,
    active_fraction: float,
    history: List[Dict[str, float]],
) -> Dict[str, float | str]:
    """Collapse one sparse run into a single row for the summary table."""
    final = history[-1]
    summary: Dict[str, float | str] = {
        "policy": policy,
        "inactive_mode": inactive_mode,
        "active_fraction": active_fraction,
        "final_accuracy": final["accuracy"],
        "final_loss": final["loss"],
    }
    # Averages are taken over sparse (non-refresh) steps only.
    for out_key, metric_key in [
        ("avg_cosine", "cosine_true_vs_approx"),
        ("avg_norm_ratio", "approx_norm_ratio"),
        ("avg_top20_grad_mass", "top20_gradient_mass"),
        ("avg_jacc_mag", "active_vs_oracle_mag"),
        ("avg_predmag_jacc", "predmag_vs_oracle_mag"),
        ("avg_stability", "active_stability"),
    ]:
        summary[out_key] = avg_sparse_metric(history, metric_key)
    return summary
476
+
477
+
478
def print_baseline(history: List[Dict[str, float]]) -> None:
    """Print every field of the baseline run's final logged row."""
    print("\nBaseline full Adam")
    final = history[-1]
    for key, value in final.items():
        if isinstance(value, float):
            print(f" {key:24s}: {value:.4f}")
        else:
            print(f" {key:24s}: {value}")
485
+
486
+
487
+ def print_summary_table(rows: List[Dict[str, float | str]]) -> None:
488
+ print("\nSparse run summary")
489
+ header = (
490
+ f"{'policy':>20s} {'mode':>6s} {'active':>7s} "
491
+ f"{'acc':>7s} {'loss':>9s} {'cos':>7s} {'norm':>7s} "
492
+ f"{'top20':>7s} {'jacc':>7s} {'stable':>7s}"
493
+ )
494
+ print(header)
495
+ print("-" * len(header))
496
+
497
+ for row in rows:
498
+ print(
499
+ f"{str(row['policy']):>20s} "
500
+ f"{str(row['inactive_mode']):>6s} "
501
+ f"{float(row['active_fraction']):7.2f} "
502
+ f"{float(row['final_accuracy']):7.4f} "
503
+ f"{float(row['final_loss']):9.4f} "
504
+ f"{float(row['avg_cosine']):7.3f} "
505
+ f"{float(row['avg_norm_ratio']):7.3f} "
506
+ f"{float(row['avg_top20_grad_mass']):7.3f} "
507
+ f"{float(row['avg_jacc_mag']):7.3f} "
508
+ f"{float(row['avg_stability']):7.3f}"
509
+ )
510
+
511
+
512
def main() -> None:
    """Train the dense baseline, sweep sparse policies, and print a summary."""
    X, y = make_spiral()

    baseline_model, baseline_hist = train_baseline(X, y)
    print_baseline(baseline_hist)

    active_fractions = [0.20, 0.10, 0.05, 0.02]

    # Keep this matrix focused; 4 fractions x 4 configurations = 16 sparse runs.
    experiment_plan: List[Tuple[Policy, InactiveMode, float]] = []
    for fraction in active_fractions:
        for plan_policy, plan_mode in [
            ("predicted_magnitude", "ema"),
            ("predicted_magnitude", "zero"),
            ("magnitude", "zero"),
            ("random", "zero"),
        ]:
            experiment_plan.append((plan_policy, plan_mode, fraction))

    summary_rows: List[Dict[str, float | str]] = []

    for policy, inactive_mode, active_fraction in experiment_plan:
        print(
            f"\nRunning policy={policy}, inactive_mode={inactive_mode}, "
            f"active_fraction={active_fraction:.2f}"
        )
        _, hist = train_sparse_policy(
            X,
            y,
            policy=policy,
            inactive_mode=inactive_mode,
            active_fraction=active_fraction,
            refresh_interval=10,
            warmup_steps=100,
            grad_beta=0.95,
            mass_beta=0.95,
        )
        summary_rows.append(summarize_sparse_run(policy, inactive_mode, active_fraction, hist))

    print_summary_table(summary_rows)

    print("\nHow to read this:")
    print(" predicted_magnitude + zero is the main practical test.")
    print(" magnitude + zero is an oracle-ish upper bound using recent observed mass.")
    print(" random + zero is the control.")
    print(" acc close to baseline means sparse updates preserved learning.")
    print(" cos near 1.0 means sparse update direction matches full gradient direction.")
    print(" norm much below 1.0 means the sparse update is much smaller than full gradient.")
    print(" top20 near 0.7+ means gradient mass is concentrated/heavy-tailed.")
    print(" jacc above random means active-set prediction finds the true important blocks.")


if __name__ == "__main__":
    main()
experiments/surprise_topk_gradient_prototype.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Surprise Top-K Gradient Prototype
3
+
4
+ This is a deliberately small, readable PyTorch experiment for testing the idea:
5
+
6
+ gradient_t ≈ predicted_gradient_t + sparse_surprising_residual_t
7
+
8
+ The code compares:
9
+ 1. Baseline full SGD
10
+ 2. SurpriseTopK training, where only high-surprise parameter blocks use the true
11
+ gradient on cheap steps; low-surprise blocks use a predicted/stale gradient.
12
+
13
+ Important caveat:
14
+ This prototype still computes full gradients on every step so we can evaluate the
15
+ approximation honestly. It simulates reduced backward/update entropy; it does not
16
+ yet provide real wall-clock acceleration. Real acceleration would require structured
17
+ partial backward passes, custom kernels, or graph-level masking.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import math
23
+ import random
24
+ from dataclasses import dataclass
25
+ from typing import Dict, List, Tuple
26
+
27
+ import torch
28
+ import torch.nn as nn
29
+ import torch.nn.functional as F
30
+
31
+
32
+ # -----------------------------
33
+ # Reproducibility
34
+ # -----------------------------
35
+
36
# Seed both Python's and torch's RNGs so runs are repeatable.
SEED = 7
random.seed(SEED)
torch.manual_seed(SEED)

# All tensors and models in this script are created on DEVICE.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
41
+
42
+
43
+ # -----------------------------
44
+ # Toy data: 2-class spiral
45
+ # -----------------------------
46
+
47
def make_spiral(n_per_class: int = 512, noise: float = 0.2) -> Tuple[torch.Tensor, torch.Tensor]:
    """Create a small nonlinear classification problem without external datasets.

    Two interleaved spiral arms (one per class), with Gaussian angular noise.
    Returns shuffled features (N, 2) and integer labels (N,), both on DEVICE.
    """
    feature_chunks = []
    label_chunks = []

    for class_id in range(2):
        radius = torch.linspace(0.0, 1.0, n_per_class)
        angle = class_id * math.pi + radius * 4.0 * math.pi
        angle = angle + torch.randn(n_per_class) * noise

        features = torch.stack([radius * torch.sin(angle), radius * torch.cos(angle)], dim=1)
        labels = torch.full((n_per_class,), class_id, dtype=torch.long)

        feature_chunks.append(features)
        label_chunks.append(labels)

    X = torch.cat(feature_chunks, dim=0)
    Y = torch.cat(label_chunks, dim=0)

    # Shuffle so the two classes are interleaved before batching.
    perm = torch.randperm(X.shape[0])
    return X[perm].to(DEVICE), Y[perm].to(DEVICE)
68
+
69
+
70
+ # -----------------------------
71
+ # Model
72
+ # -----------------------------
73
+
74
class TinyMLP(nn.Module):
    """Three-layer tanh MLP mapping 2-D points to two class logits."""

    def __init__(self, width: int = 64):
        super().__init__()
        layers = [
            nn.Linear(2, width),
            nn.Tanh(),
            nn.Linear(width, width),
            nn.Tanh(),
            nn.Linear(width, 2),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)
87
+
88
+
89
def linear_layers(model: nn.Module) -> List[nn.Linear]:
    """All nn.Linear submodules of `model`, in module-traversal order."""
    found: List[nn.Linear] = []
    for module in model.modules():
        if isinstance(module, nn.Linear):
            found.append(module)
    return found
91
+
92
+
93
+ # -----------------------------
94
+ # Block bookkeeping
95
+ # -----------------------------
96
+
97
@dataclass
class BlockRef:
    """Identifies one sparsity block: a single output row of a Linear layer.

    When the layer has a bias, that row's bias element belongs to the same block.
    """
    layer_index: int
    row_index: int
102
+
103
+
104
class SurpriseTopKUpdater:
    """
    Applies a predicted-gradient + top-surprise true-gradient update.

    Unit of sparsity:
        One output row of each Linear layer.

    Why rows?
        Row blocks correspond roughly to neurons/features, and structured blocks are
        much closer to real hardware speedup than individual unstructured weights.
    """

    def __init__(
        self,
        model: nn.Module,
        lr: float = 0.05,
        beta: float = 0.9,
        active_fraction: float = 0.2,
        refresh_interval: int = 10,
        use_error_feedback: bool = True,
        eps: float = 1e-12,
    ):
        # lr: SGD-style step size; beta: EMA factor for the gradient predictor;
        # active_fraction: fraction of blocks updated exactly on cheap steps;
        # refresh_interval: every N-th step updates all blocks exactly;
        # eps: numerical guard for the norm ratios below.
        self.model = model
        self.layers = linear_layers(model)
        self.lr = lr
        self.beta = beta
        self.active_fraction = active_fraction
        self.refresh_interval = refresh_interval
        self.use_error_feedback = use_error_feedback
        self.eps = eps

        # One BlockRef per output row across all Linear layers.
        self.blocks: List[BlockRef] = []
        for li, layer in enumerate(self.layers):
            for row in range(layer.weight.shape[0]):
                self.blocks.append(BlockRef(li, row))

        # Predicted gradients, same shape as parameters.
        self.pred_w: Dict[int, torch.Tensor] = {}
        self.pred_b: Dict[int, torch.Tensor] = {}

        # Error-feedback buffers accumulate information we did not apply.
        self.err_w: Dict[int, torch.Tensor] = {}
        self.err_b: Dict[int, torch.Tensor] = {}

        # Surprise scores per block. Higher means more worth computing/updating exactly.
        self.scores = torch.ones(len(self.blocks), device=DEVICE)

        for li, layer in enumerate(self.layers):
            self.pred_w[li] = torch.zeros_like(layer.weight.data)
            self.err_w[li] = torch.zeros_like(layer.weight.data)
            if layer.bias is not None:
                self.pred_b[li] = torch.zeros_like(layer.bias.data)
                self.err_b[li] = torch.zeros_like(layer.bias.data)

    def _block_grad_vector(self, li: int, row: int) -> torch.Tensor:
        """Flattened observed gradient for block (li, row): weight row + bias element."""
        layer = self.layers[li]
        parts = [layer.weight.grad[row].flatten()]
        if layer.bias is not None:
            parts.append(layer.bias.grad[row].view(1))
        return torch.cat(parts)

    def _block_pred_vector(self, li: int, row: int) -> torch.Tensor:
        """Flattened predicted gradient for block (li, row), same layout as the grad vector."""
        layer = self.layers[li]
        parts = [self.pred_w[li][row].flatten()]
        if layer.bias is not None:
            parts.append(self.pred_b[li][row].view(1))
        return torch.cat(parts)

    def _choose_active_blocks(self, step: int) -> torch.Tensor:
        """Return boolean mask over blocks."""
        n_blocks = len(self.blocks)

        # Full refresh: observe/update everything.
        if step % self.refresh_interval == 0:
            return torch.ones(n_blocks, dtype=torch.bool, device=DEVICE)

        # Otherwise: only the top-k highest-surprise blocks are active.
        k = max(1, int(self.active_fraction * n_blocks))
        active = torch.zeros(n_blocks, dtype=torch.bool, device=DEVICE)
        top_idx = torch.topk(self.scores, k=k).indices
        active[top_idx] = True
        return active

    @torch.no_grad()
    def step(self, step: int) -> Dict[str, float]:
        """
        Apply one optimizer step after loss.backward().

        Returns diagnostics comparing the approximate update to the true gradient.
        Also clears all parameter .grad tensors before returning.
        """
        active = self._choose_active_blocks(step)

        # Per-block flattened vectors, concatenated at the end for diagnostics.
        true_flat = []
        applied_flat = []
        pred_flat = []

        active_count = int(active.sum().item())
        total_count = len(self.blocks)

        for block_id, block in enumerate(self.blocks):
            li = block.layer_index
            row = block.row_index
            layer = self.layers[li]
            is_active = bool(active[block_id].item())

            g_w = layer.weight.grad[row].clone()
            g_b = layer.bias.grad[row].clone() if layer.bias is not None else None

            # Error feedback makes skipped/prediction error come back later instead
            # of disappearing forever.
            if self.use_error_feedback:
                g_w_eff = g_w + self.err_w[li][row]
                g_b_eff = g_b + self.err_b[li][row] if g_b is not None else None
            else:
                g_w_eff = g_w
                g_b_eff = g_b

            # Snapshot the predictor before it is (possibly) EMA-updated below.
            p_w = self.pred_w[li][row].clone()
            p_b = self.pred_b[li][row].clone() if layer.bias is not None else None

            if is_active:
                # Use the exact observed gradient for high-surprise blocks.
                applied_w = g_w_eff
                applied_b = g_b_eff

                # Update predictor only where we pretend we actually observed the gradient.
                self.pred_w[li][row].mul_(self.beta).add_(g_w, alpha=1.0 - self.beta)
                if layer.bias is not None:
                    self.pred_b[li][row].mul_(self.beta).add_(g_b, alpha=1.0 - self.beta)

                # Update surprise score: how wrong was the current predictor?
                # NOTE(review): _block_pred_vector reads the predictor AFTER the EMA
                # update above, so the score measures the post-update residual rather
                # than the pre-update prediction error — confirm this is intended.
                pred_vec = self._block_pred_vector(li, row)
                grad_vec = self._block_grad_vector(li, row)
                residual_norm = torch.norm(grad_vec - pred_vec)
                grad_norm = torch.norm(grad_vec)
                self.scores[block_id] = residual_norm / (grad_norm + self.eps)
            else:
                # Use the predicted/stale gradient for low-surprise blocks.
                applied_w = p_w
                applied_b = p_b

            # Update error-feedback buffers.
            if self.use_error_feedback:
                self.err_w[li][row] = g_w_eff - applied_w
                if layer.bias is not None:
                    self.err_b[li][row] = g_b_eff - applied_b

            # Apply update.
            layer.weight.data[row].add_(applied_w, alpha=-self.lr)
            if layer.bias is not None:
                layer.bias.data[row].add_(applied_b, alpha=-self.lr)

            # Diagnostics.
            true_parts = [g_w.flatten()]
            app_parts = [applied_w.flatten()]
            pred_parts = [p_w.flatten()]

            if layer.bias is not None:
                true_parts.append(g_b.view(1))
                app_parts.append(applied_b.view(1))
                pred_parts.append(p_b.view(1))

            true_flat.append(torch.cat(true_parts))
            applied_flat.append(torch.cat(app_parts))
            pred_flat.append(torch.cat(pred_parts))

        true_vec = torch.cat(true_flat)
        applied_vec = torch.cat(applied_flat)
        pred_vec = torch.cat(pred_flat)

        cosine = F.cosine_similarity(true_vec, applied_vec, dim=0).item()
        # Fraction of true-gradient energy explained by the (pre-update) predictor.
        pred_explained = 1.0 - (
            torch.norm(true_vec - pred_vec).pow(2) / (torch.norm(true_vec).pow(2) + self.eps)
        ).item()

        # Heavy-tail diagnostic: how much surprise mass lives in the top 20% blocks?
        k20 = max(1, int(0.2 * total_count))
        sorted_scores = torch.sort(self.scores.detach(), descending=True).values
        top20_mass = (sorted_scores[:k20].sum() / (sorted_scores.sum() + self.eps)).item()

        # Clear gradients manually.
        for p in self.model.parameters():
            p.grad = None

        return {
            "active_fraction": active_count / total_count,
            "cosine_true_vs_applied": cosine,
            "pred_explained_fraction": pred_explained,
            "top20_surprise_mass": top20_mass,
        }
293
+
294
+
295
+ # -----------------------------
296
+ # Training loops
297
+ # -----------------------------
298
+
299
def accuracy(model: nn.Module, X: torch.Tensor, y: torch.Tensor) -> float:
    """Classification accuracy of `model` over the whole dataset (X, y)."""
    model.eval()
    with torch.no_grad():
        hard_labels = model(X).argmax(dim=1)
        hits = (hard_labels == y).float()
        return hits.mean().item()
304
+
305
+
306
def train_baseline(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 600,
    batch_size: int = 128,
    lr: float = 0.05,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a TinyMLP with plain SGD, logging loss/accuracy every 25 steps."""
    model = TinyMLP().to(DEVICE)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    history = []

    for step in range(steps):
        sample_idx = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        batch_x, batch_y = X[sample_idx], y[sample_idx]

        logits = model(batch_x)
        loss = F.cross_entropy(logits, batch_y)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if step % 25 == 0 or step == steps - 1:
            history.append({
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
            })

    return model, history
336
+
337
+
338
def train_surprise_topk(
    X: torch.Tensor,
    y: torch.Tensor,
    steps: int = 600,
    batch_size: int = 128,
    lr: float = 0.05,
    active_fraction: float = 0.2,
    refresh_interval: int = 10,
    beta: float = 0.9,
) -> Tuple[nn.Module, List[Dict[str, float]]]:
    """Train a TinyMLP where SurpriseTopKUpdater applies the parameter updates.

    The updater consumes .grad after each backward pass, applies the mixed
    exact/predicted update with error feedback, and clears the gradients.
    """
    model = TinyMLP().to(DEVICE)
    updater = SurpriseTopKUpdater(
        model,
        lr=lr,
        beta=beta,
        active_fraction=active_fraction,
        refresh_interval=refresh_interval,
        use_error_feedback=True,
    )

    history = []

    for step in range(steps):
        sample_idx = torch.randint(0, X.shape[0], (batch_size,), device=DEVICE)
        batch_x, batch_y = X[sample_idx], y[sample_idx]

        logits = model(batch_x)
        loss = F.cross_entropy(logits, batch_y)
        loss.backward()

        diagnostics = updater.step(step)

        if step % 25 == 0 or step == steps - 1:
            snapshot = {
                "step": step,
                "loss": float(loss.item()),
                "accuracy": accuracy(model, X, y),
                **diagnostics,
            }
            history.append(snapshot)

    return model, history
380
+
381
+
382
+ # -----------------------------
383
+ # Main experiment
384
+ # -----------------------------
385
+
386
def print_last(label: str, history: List[Dict[str, float]]) -> None:
    """Print the final logged row under `label`; floats formatted to 4 decimals."""
    print(f"\n{label}")
    final = history[-1]
    for key, value in final.items():
        if isinstance(value, float):
            print(f" {key:28s}: {value:.4f}")
        else:
            print(f" {key:28s}: {value}")
394
+
395
+
396
def main() -> None:
    """Run baseline SGD and Surprise Top-K side by side and print diagnostics."""
    X, y = make_spiral(n_per_class=768, noise=0.18)

    baseline_model, baseline_hist = train_baseline(X, y)

    surprise_model, surprise_hist = train_surprise_topk(
        X,
        y,
        active_fraction=0.2,
        refresh_interval=10,
        beta=0.9,
    )

    print_last("Baseline full SGD", baseline_hist)
    print_last("Surprise Top-K simulated training", surprise_hist)

    print("\nA few Surprise Top-K checkpoints:")
    stride = max(1, len(surprise_hist) // 8)
    for row in surprise_hist[::stride]:
        print(
            f"step={row['step']:4d} "
            f"loss={row['loss']:.4f} "
            f"acc={row['accuracy']:.3f} "
            f"active={row['active_fraction']:.2f} "
            f"cos={row['cosine_true_vs_applied']:.3f} "
            f"pred_expl={row['pred_explained_fraction']:.3f} "
            f"top20_mass={row['top20_surprise_mass']:.3f}"
        )


if __name__ == "__main__":
    main()
triton_sparse.py → experiments/triton_sparse.py RENAMED
File without changes
triton_v2.py → experiments/triton_v2.py RENAMED
File without changes
experiments/uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
paper/main.tex ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \documentclass[11pt]{article}
2
+
3
+ \usepackage[margin=1in]{geometry}
4
+ \usepackage{amsmath,amssymb}
5
+ \usepackage{graphicx}
6
+ \usepackage{microtype}
7
+ \usepackage{booktabs}
8
+ \usepackage{tabularx}
9
+ \usepackage{hyperref}
10
+
11
+ \hypersetup{
12
+ colorlinks=true,
13
+ linkcolor=blue,
14
+ citecolor=blue,
15
+ urlcolor=blue,
16
+ }
17
+
18
+ \title{%
19
+ \textbf{Zero-Copy Sparse Backpropagation}: \\[0.3em]
20
+ \large Temporal Gradient Tracking for
21
+ Faster, Regularized LLM Training
22
+ }
23
+
24
+ \author{
25
+ Daniel Owen van Dommelen\\
26
+ \textit{Independent Research - WORKING DRAFT}\\
27
+ \texttt{theapemachine@gmail.com}
28
+ }
29
+
30
+ \date{\today}
31
+
32
+ \begin{document}
33
+ \maketitle
34
+
35
+ \begin{abstract}
36
+ We describe \emph{Predictive Chunked Sparsity}: fixed top-$k$ row-chunks for
37
+ sparse $dW$, selected by an EMA of past chunk-gradient norms, with contiguous
38
+ slices for PyTorch-style GEMMs. On \textbf{Apple MPS} (full 6-layer runs:
39
+ $B{=}8$, $T{=}256$, chunk 64, $10\%$ active, 2000 steps), sparse training is
40
+ \textbf{slower} at $d_{\text{model}}{=}512$ ($\sim$1.22$\times$ higher ms/step than
41
+ dense for both $G_X$ modes) but \textbf{faster} at $d{=}2048$ ($\sim$1.18$\times$
42
+ and $\sim$1.221$\times$ speedup for full-$G_X$ and sparse-$G_X$ respectively,
43
+ with validation loss reported in Table~\ref{tab:mps-e2e}).
44
+
45
+ On \textbf{NVIDIA T4}, an isolated single-FFN timing harness (100 iters, fp32,
46
+ same $B,T$, chunk 64, $10\%$ active) shows full-$G_X$ totals from
47
+ 1.02$\times$ at $d{=}256$ to 1.35$\times$ at $d{=}2048$
48
+ (Table~\ref{tab:t4-ffn-micro}). A fused \textbf{Triton} backward passes numeric
49
+ checks (Table~\ref{tab:triton-correctness}); isolated backward on T4 improves
50
+ over dense for $d\ge 512$ but can trail PyTorch at $d{=}256$
51
+ (Table~\ref{tab:triton-backward}). Short \textbf{T4 end-to-end} training (100
52
+ steps) shows modest PyLoop gains at $d{=}512$/1024 and Triton autotune/noise
53
+ hurting at small scale (Table~\ref{tab:t4-e2e}). EMA--oracle chunk overlap on one
54
+ seed is in Table~\ref{tab:ema-overlap}; multi-seed long runs were pending at
55
+ draft time.
56
+ \end{abstract}
57
+
58
+ \section{Introduction}
59
+ Training transformers is dominated by dense matmuls. Some work reports
60
+ heavy-tailed gradient coordinates; whether that yields wall-clock savings depends
61
+ on implementation and hardware. Dynamic sparsity often hits irregular memory
62
+ access and, for variable masks, possible host--device coordination for shapes.
63
+ We use \emph{fixed-cardinality} chunk masks, an EMA scorer, cosine annealing,
64
+ and strided views (and optionally Triton) so active tiles map to dense GEMMs.
65
+ Contributions are \textbf{(1)} the algorithmic recipe, \textbf{(2)}
66
+ reproducible tables for MPS full training, T4 microbenchmarks, Triton
67
+ correctness and speed, short T4 E2E, chunk-size timing, and \textbf{(3)} honest
68
+ limits: speedups are width-, backend-, and workload-dependent.
69
+
70
+ \section{Methodology: Predictive Chunked Sparsity}
71
+ Linear $W\in\mathbb{R}^{O\times I}$ is split into $N$ row chunks of size $C$.
72
+ Binary mask $A\in\{0,1\}^N$ picks active chunks; inactive $dW$ is zeroed into
73
+ the optimizer. EMA on observed chunk norms $M_c^{(t)}=\beta
74
+ M_c^{(t-1)}+(1-\beta)\|G_{W_c}\|_2$ (active); $M_c^{(t)}=\gamma M_c^{(t-1)}$
75
+ (inactive). Top-$k$ chunks from $M^{(t-1)}$ fix $A$ at step $t$. Cosine schedule
76
+ $S(t)$ warms up fully dense then anneals toward $S_{\text{target}}$. With AdamW,
77
+ $g{=}0$ on inactive weights yields decaying moments (``phantom momentum'')---
78
+ standard Adam side effect, not a separate contribution.
79
+
80
+ \section{Systems}
81
+ Fixed $k$ avoids mask-derived index tensor sizes. Chunk rows are contiguous
82
+ slices (\texttt{gy\_flat[:, s:e]}). PyTorch normally implements that as a view;
83
+ exact behavior is version-dependent. A Python loop over active chunks issues
84
+ multiple kernel launches; Triton fusion targets that overhead (see
85
+ Table~\ref{tab:triton-backward}).
86
+
87
+ \section{Experiments and Results}
88
+ All numbers below are from recorded runs; GPU, hyperparameters, and seed are
89
+ stated per table. We do not claim universal ranking of backends.
90
+
91
+ \subsection{Full training on Apple MPS (author runs)}
92
+ Six layers, $B{=}8$, $T{=}256$, chunk\_size${=}64$, $10\%$ active chunks, 2000
93
+ optimization steps. Times are total wall for 2000 steps; ms/step derived.
94
+
95
+ \begin{table}[t]
96
+ \centering
97
+ \caption{MPS full training (single-seed author configuration per run).}
98
+ \label{tab:mps-e2e}
99
+ \begin{tabular}{l l r r r}
100
+ \toprule
101
+ $d_{\text{model}}$ & Run & Time (s) & ms/step & Val.\ loss \\
102
+ \midrule
103
+ 512 & \texttt{dense\_baseline} & 74.77 & 99.70 & 5.3142 \\
104
+ 512 & \texttt{sparse\_full\_dX} & 91.04 & 121.38 & 5.4141 \\
105
+ 512 & \texttt{sparse\_sparse\_dX} & 93.33 & 124.44 & 5.5467 \\
106
+ \midrule
107
+ 2048 & \texttt{dense\_baseline} & 1035.84 & 591.91 & 6.0264 \\
108
+ 2048 & \texttt{sparse\_full\_dX} & 875.51 & 500.29 & 5.9807 \\
109
+ 2048 & \texttt{sparse\_sparse\_dX} & 847.22 & 484.13 & 6.0231 \\
110
+ \bottomrule
111
+ \end{tabular}
112
+ \end{table}
113
+
114
+ At $d{=}512$, sparse ms/step is $\sim$1.22$\times$ (\texttt{sparse\_full\_dX})
115
+ and $\sim$1.25$\times$ (\texttt{sparse\_sparse\_dX}) vs.\ dense---\emph{slower}.
116
+ At $d{=}2048$, sparse is $\sim$1.18$\times$ and $\sim$1.22$\times$
117
+ \emph{faster}. Validation loss at $d{=}2048$ is best for
118
+ \texttt{sparse\_full\_dX} in this table; at $d{=}512$ dense is best.
119
+
120
+ \subsection{Isolated FFN layer microbenchmark (T4)}
121
+ One FFN block, $M{=}2048$, $B{=}8$, $T{=}256$, chunk\_size${=}64$, $10\%$ active,
122
+ fp32, 100 iterations. Components: forward, $dX$, $dW$ dense vs.\ sparse;
123
+ \emph{full\_$G_X$} total = sum with dense $dX$.
124
+
125
+ \begin{table}[t]
126
+ \centering
127
+ \caption{T4: per--FFN-layer times (ms). Spd. $=$ Tot.\ den.\,/{}Tot.\ sp.f.;
128
+ sparse total uses dense $dX$ (full\_dX).}
129
+ \label{tab:t4-ffn-micro}
130
+ \resizebox{\linewidth}{!}{%
131
+ \footnotesize
132
+ \begin{tabular}{r r r r r r r r r r}
133
+ \toprule
134
+ $d_{\text{model}}$ & FFN dim & Params & Fwd & $dX$ & $dW_{\mathrm{d}}$ &
135
+ $dW_{\mathrm{s}}$ & Tot.\ den. & Tot.\ sp.f. & Spd. \\
136
+ \midrule
137
+ 256 & 1024 & 0.3M & 0.27 & 0.21 & 0.27 & 0.26 & 0.75 & 0.74 & 1.02$\times$ \\
138
+ 384 & 1536 & 0.6M & 0.52 & 0.69 & 0.61 & 0.18 & 1.82 & 1.39 & 1.31$\times$ \\
139
+ 512 & 2048 & 1.0M & 1.00 & 1.01 & 0.97 & 0.26 & 2.99 & 2.28 & 1.31$\times$ \\
140
+ 768 & 3072 & 2.4M & 2.16 & 2.25 & 2.05 & 0.40 & 6.46 & 4.81 & 1.34$\times$ \\
141
+ 1024 & 4096 & 4.2M & 3.69 & 3.90 & 3.35 & 0.59 & 10.95 & 8.18 & 1.34$\times$ \\
142
+ 1536 & 6144 & 9.4M & 10.33 & 9.03 & 8.14 & 1.30 & 27.50 & 20.66 & 1.33$\times$ \\
143
+ 2048 & 8192 & 16.8M & 14.76 & 15.57 & 13.19 & 1.93 & 43.51 & 32.26 & 1.35$\times$ \\
144
+ \bottomrule
145
+ \end{tabular}%
146
+ }
147
+ \end{table}
148
+
149
+ If $dW_{\mathrm{dense}}$ were removed from the dense total, a simple
150
+ illustrative ratio (using the measured forward+$dX$ share) implies a ceiling
151
+ around $\sim$1.42--1.48$\times$ for this harness; crossover for net speedup vs.\
152
+ dense full-$G_X$ is near $d_{\text{model}}\approx 384$ in this table.
153
+
154
+ \subsection{Triton numeric checks (T4)}
155
+ Max absolute errors vs.\ reference (fp32 tolerances in experiment script); all
156
+ marked passing in the run log.
157
+
158
+ \begin{table}[t]
159
+ \centering
160
+ \caption{Triton backward vs.\ reference: max abs error.}
161
+ \label{tab:triton-correctness}
162
+ \begin{tabular}{r r r r r r r}
163
+ \toprule
164
+ $d_{\mathrm{in}}$ & $d_{\mathrm{out}}$ & ch. & $\max|dW|$ & $\max|db|$ & $\max|dX|$ & OK \\
165
+ \midrule
166
+ 512 & 2048 & 64 & 0.000320 & 0.000023 & 0.000042 & $\checkmark$ \\
167
+ 1024 & 4096 & 64 & 0.000443 & 0.000021 & 0.000092 & $\checkmark$ \\
168
+ 256 & 1024 & 32 & 0.000275 & 0.000038 & 0.000019 & $\checkmark$ \\
169
+ \bottomrule
170
+ \end{tabular}
171
+ \end{table}
172
+
173
+ \subsection{Isolated backward: Dense vs.\ PyLoop vs.\ Triton (T4)}
174
+ $M{=}2048$, chunk\_size${=}64$, $10\%$ active, full\_$G_X$ mode, 50 iterations
175
+ post-warmup. Times are full backward ms for the timed region (as recorded).
176
+
177
+ \begin{table}[t]
178
+ \centering
179
+ \caption{T4 isolated backward (ms). Triton/Dense $=$ dense/time\_triton.}
180
+ \label{tab:triton-backward}
181
+ \resizebox{\linewidth}{!}{%
182
+ \footnotesize
183
+ \begin{tabular}{r r r r r r r r}
184
+ \toprule
185
+ $d_{\text{model}}$ & FFN & Active ch. & Dense & PyLoop & Triton &
186
+ T/Dense & T/PyLoop \\
187
+ \midrule
188
+ 256 & 1024 & 1 & 0.39 & 0.40 & 0.46 & 0.85$\times$ & 0.88$\times$ \\
189
+ 512 & 2048 & 3 & 1.96 & 1.30 & 1.16 & 1.69$\times$ & 1.12$\times$ \\
190
+ 768 & 3072 & 4 & 4.29 & 2.52 & 2.51 & 1.70$\times$ & 1.00$\times$ \\
191
+ 1024 & 4096 & 6 & 7.29 & 4.37 & 4.30 & 1.70$\times$ & 1.02$\times$ \\
192
+ 1536 & 6144 & 9 & 17.32 & 10.04 & 9.78 & 1.77$\times$ & 1.03$\times$ \\
193
+ 2048 & 8192 & 12 & 29.14 & 17.20 & 16.89 & 1.73$\times$ & 1.02$\times$ \\
194
+ \bottomrule
195
+ \end{tabular}%
196
+ }
197
+ \end{table}
198
+
199
+ \noindent\textbf{Triton with both $dW$ and $dX$ sparse} (same harness family;
200
+ user-reported row):
201
+
202
+ \begin{table}[h]
203
+ \centering
204
+ \begin{tabular}{r r r r}
205
+ \toprule
206
+ $d_{\text{model}}$ & Dense (ms) & Triton\_all (ms) & Speedup \\
207
+ \midrule
208
+ 512 & 1.96 & 0.41 & 4.83$\times$ \\
209
+ 1024 & 7.06 & 1.07 & 6.58$\times$ \\
210
+ 2048 & 29.00 & 3.71 & 7.81$\times$ \\
211
+ \bottomrule
212
+ \end{tabular}
213
+ \end{table}
214
+
215
+ At $d{=}256$, Triton is slower than dense in Table~\ref{tab:triton-backward}
216
+ (0.85$\times$); at $d{=}512$, PyTorch single-kernel launches can still be hard
217
+ to beat for only three active chunks.
218
+
219
+ \subsection{End-to-end training on T4 (100 steps)}
220
+ Six layers, 8 heads, $B{=}8$, $T{=}256$, chunk\_size${=}64$, $10\%$ active,
221
+ seed${=}42$, AdamW lr$=$5e-4, full\_$G_X$. $d{=}2048$ did not fit 16GB T4.
222
+
223
+ \begin{table}[t]
224
+ \centering
225
+ \caption{T4 E2E (100 steps); ``vs Dense'' is dense/ms\_mode.}
226
+ \label{tab:t4-e2e}
227
+ \begin{tabular}{r l r r r}
228
+ \toprule
229
+ $d_{\text{model}}$ & Mode & ms/step & vs.\ Dense & Val.\ loss \\
230
+ \midrule
231
+ 512 & dense & 184.6 & 1.00$\times$ & 5.6954 \\
232
+ 512 & pyloop & 179.0 & 1.03$\times$ & 5.8683 \\
233
+ 512 & triton & 196.0 & 0.94$\times$ & 5.8683 \\
234
+ \midrule
235
+ 1024 & dense & 451.5 & 1.00$\times$ & 5.5300 \\
236
+ 1024 & pyloop & 435.6 & 1.04$\times$ & 5.4803 \\
237
+ 1024 & triton & 441.0 & 1.02$\times$ & 5.4800 \\
238
+ \bottomrule
239
+ \end{tabular}
240
+ \end{table}
241
+
242
+ Triton E2E at $d{=}512$ is slower than dense here; autotune and short-run
243
+ overhead dominate at small scale in the author's log.
244
+
245
+ \subsection{EMA vs.\ oracle chunk overlap (T4)}
246
+ $d{=}512$, 6 layers, chunk\_size${=}64$, $10\%$ active, 350 steps, seed${=}42$;
247
+ first check step ${=}250$ post-anneal schedule. Jaccard/Recall vs.\ dense-oracle
248
+ top-$k$ (as implemented in experiment).
249
+
250
+ \begin{table}[t]
251
+ \centering
252
+ \caption{Predictor overlap (single seed; multi-seed long runs were pending).}
253
+ \label{tab:ema-overlap}
254
+ \begin{tabular}{r r r}
255
+ \toprule
256
+ Step & Jaccard & Recall \\
257
+ \midrule
258
+ 250 & 0.6000 & 0.7500 \\
259
+ 275 & 0.6552 & 0.7917 \\
260
+ 300 & 0.7778 & 0.8750 \\
261
+ 325 & 0.6000 & 0.7500 \\
262
+ \bottomrule
263
+ \end{tabular}
264
+ \end{table}
265
+
266
+ \subsection{Chunk size vs.\ step time (T4, PyLoop)}
267
+ $d{=}512$, 6 layers, $10\%$ active, seed${=}42$, 50 training steps (warmup;
268
+ loss not converged---timing only).
269
+
270
+ \begin{table}[t]
271
+ \centering
272
+ \caption{ms/step vs.\ chunk size (PyLoop backend).}
273
+ \label{tab:chunk-size}
274
+ \begin{tabular}{r r}
275
+ \toprule
276
+ Chunk size & ms/step \\
277
+ \midrule
278
+ 16 & 601.4 \\
279
+ 32 & 453.0 \\
280
+ 64 & 321.5 \\
281
+ 128 & 251.3 \\
282
+ 256 & 219.8 \\
283
+ \bottomrule
284
+ \end{tabular}
285
+ \end{table}
286
+
287
+ Larger chunks $\Rightarrow$ fewer Python iterations per layer in this backend.
288
+
289
+ \subsection{Pending experiments (snapshot)}
290
+ At draft time, additional A10G jobs were in flight, e.g.\ internal IDs
291
+ \texttt{69f38371d70108f37ace1cae} (multi-baseline 2000-step suite),
292
+ \texttt{69f395b3d70108f37ace1cee} ($d$ scaling), and
293
+ \texttt{69f3af45d2c8bd8662bd419d} (E2E Triton including $d{=}2048$). Treat these
294
+ only as lab run pointers.
295
+
296
+ \section{Conclusion}
297
+ Chunked EMA sparsity is not uniformly faster: \textbf{MPS} shows a crossover in
298
+ $d_{\text{model}}$ between 512 and 2048 for full training;
299
+ \textbf{T4} microbenchmarks monotonically favor sparse full-$G_X$ totals from
300
+ $d{\approx}384$ upward to 1.35$\times$ at $d{=}2048$ in Table~\ref{tab:t4-ffn-micro},
301
+ while \textbf{T4 E2E} at 100 steps shows small PyLoop wins and Triton not yet
302
+ winning at $d{=}512$. Triton shows large factors when both $dW$ and $dX$ are
303
+ sparse in the isolated harness, subject to training-quality tradeoffs not fully
304
+ tabulated here. Future work: complete multi-seed tables and fused-kernel E2E at
305
+ large $d$.
306
+
307
+ \end{document}
sparse_transformer_v18_fast_knn.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sparse Transformer v18: Fast Chunked Sparse Backward + KNN Sensor Scheduler.
3
+
4
+ This plugs the v16/v17 KNN sensor scheduler into the real chunked sparse backward path.
5
+ It compares dense, EMA-topk sparse, KNN sparse, and random sparse in full_dX and sparse_dX modes.
6
+
7
+ Run:
8
+ python3 sparse_transformer_v18_fast_knn.py --device mps --benchmark_sync
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import math
15
+ import random
16
+ import time
17
+ from typing import Dict, List, Literal, Optional, Tuple
18
+
19
+ import torch
20
+
21
+ torch.set_num_threads(1)
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+
25
+ Scheduler = Literal["dense", "ema_topk", "knn_scheduler", "random"]
26
+ BackwardMode = Literal["dense_baseline", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
27
+
28
+
29
def sync_device(device: str) -> None:
    """Block until all queued kernels on ``device`` have finished.

    No-op for CPU or for accelerators that are unavailable at runtime.
    """
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
34
+
35
+
36
def set_seed(seed: int) -> None:
    """Seed both Python's ``random`` module and torch's global RNG."""
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
39
+
40
+
41
def make_cpu_generator(seed: int) -> torch.Generator:
    """Return a CPU-side ``torch.Generator`` seeded with ``seed``.

    ``Generator.manual_seed`` returns the generator itself, so the call
    can be chained.
    """
    return torch.Generator(device="cpu").manual_seed(seed)
45
+
46
+
47
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Build a deterministic corpus of newline-separated random sentences.

    Each sentence is 4-10 words sampled (with replacement) from a fixed
    vocabulary, terminated by a period.
    """
    rng = random.Random(seed)
    vocab = [
        "ada", "turing", "grace", "lovelace", "gradients", "tokens", "circuits",
        "features", "boldly", "strangely", "matrix", "attention", "kernel", "entropy", "signal",
    ]
    sentences = []
    for _ in range(n_sentences):
        # randint is drawn first (it was the argument expression originally),
        # preserving the RNG call order and therefore the exact output.
        n_words = rng.randint(4, 10)
        sentences.append(" ".join(rng.choices(vocab, k=n_words)) + ".")
    return "\n".join(sentences)
56
+
57
+
58
class CharCorpus:
    """Character-level corpus with a fixed 90/10 train/validation split."""

    def __init__(self, text: str, block_size: int, device: str):
        vocab = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.vocab_size = len(vocab)
        self.block_size = block_size
        self.device = device
        encoded = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        split_at = int(0.9 * len(encoded))
        self.train_data = encoded[:split_at]
        self.val_data = encoded[split_at:]

    def get_batch(self, split: str, batch_size: int, generator: Optional[torch.Generator] = None):
        """Sample a batch of (input, next-char target) windows from one split."""
        source = self.train_data if split == "train" else self.val_data
        starts = torch.randint(len(source) - self.block_size - 1, (batch_size,), generator=generator)
        inputs = torch.stack([s_i for s_i in (source[s : s + self.block_size] for s in starts)])
        targets = torch.stack([source[s + 1 : s + self.block_size + 1] for s in starts])
        return inputs.to(self.device), targets.to(self.device)
75
+
76
+
77
class ChunkedMaskedLinear(torch.autograd.Function):
    """Linear op with dense forward and chunk-sparse backward.

    Only the output chunks listed in ``active_chunks`` receive weight/bias
    gradients. When ``sparse_dx`` is True the input gradient is also limited
    to contributions flowing through those active chunks.
    """

    @staticmethod
    def forward(ctx, x, weight, bias, active_chunks, chunk_size: int, sparse_dx: bool):
        # Forward is an ordinary dense linear; sparsity only changes backward.
        ctx.save_for_backward(x, weight, active_chunks)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = bool(sparse_dx)
        ctx.chunk_size = int(chunk_size)
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y):
        x, weight, active_chunks = ctx.saved_tensors
        cs = ctx.chunk_size
        x2d = x.reshape(-1, x.shape[-1])
        gy2d = grad_y.reshape(-1, grad_y.shape[-1])
        grad_w = torch.zeros_like(weight)
        if ctx.has_bias:
            grad_b = torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype)
        else:
            grad_b = None
        # Full dX costs one dense matmul; sparse dX is built up chunk by chunk.
        if ctx.sparse_dx:
            grad_x2d = torch.zeros_like(x2d)
        else:
            grad_x2d = gy2d @ weight
        for chunk in active_chunks.tolist():
            lo = int(chunk) * cs
            hi = lo + cs
            gy_rows = gy2d[:, lo:hi]
            grad_w[lo:hi, :] = gy_rows.transpose(0, 1) @ x2d
            if grad_b is not None:
                grad_b[lo:hi] = gy_rows.sum(dim=0)
            if ctx.sparse_dx:
                grad_x2d += gy_rows @ weight[lo:hi, :]
        return grad_x2d.reshape(x.shape), grad_w, grad_b, None, None, None
107
+
108
+
109
class SparseLinear(nn.Linear):
    """``nn.Linear`` whose backward can be restricted to active output chunks.

    With ``sparse_enabled`` False or no chunk mask installed this behaves
    exactly like a plain dense linear layer.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        self.sparse_enabled = False  # toggled per-step by the training loop
        self.sparse_dx = False       # additionally sparsify the input gradient
        self.active_chunks: Optional[torch.Tensor] = None  # local chunk ids
        self.chunk_size = 64

    def forward(self, x):
        use_sparse = self.sparse_enabled and self.active_chunks is not None
        if use_sparse:
            return ChunkedMaskedLinear.apply(
                x, self.weight, self.bias, self.active_chunks, self.chunk_size, self.sparse_dx
            )
        return F.linear(x, self.weight, self.bias)
121
+
122
+
123
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention using SparseLinear projections.

    Both the fused qkv projection and the output projection are SparseLinear,
    so they participate in the chunk-sparse backward pass.
    """

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular causal mask, shaped to broadcast over (batch, head).
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.c_attn(x)  # [B, T, 3C]
        q, k, v = qkv.split(C, dim=2)
        # Reshape to [B, n_head, T, head_dim] for per-head attention.
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        # Future positions are masked to -inf before the softmax.
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)
148
+
149
+
150
class FeedForward(nn.Module):
    """GPT-style MLP block (4x expansion, GELU) built on SparseLinear layers."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # expand -> GELU -> project -> dropout
        return self.dropout(self.c_proj(F.gelu(self.c_fc(x))))
159
+
160
+
161
class Block(nn.Module):
    """Pre-LayerNorm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
173
+
174
+
175
class MiniGPT(nn.Module):
    """Minimal GPT: token + learned position embeddings, N blocks, linear LM head.

    ``forward`` returns ``(logits, loss)``; loss is None when no targets are given.
    """

    def __init__(self, vocab_size: int, block_size: int, n_layer: int, n_head: int, n_embd: int, dropout: float):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        # Note: the LM head is a plain dense nn.Linear, not SparseLinear.
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        pos = torch.arange(T, device=idx.device)
        # Broadcast position embeddings over the batch dimension.
        x = self.tok_emb(idx) + self.pos_emb(pos)[None, :, :]
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
195
+
196
+
197
def get_sparse_linears(model):
    """Collect every SparseLinear module in ``model`` (module-walk order)."""
    found = []
    for module in model.modules():
        if isinstance(module, SparseLinear):
            found.append(module)
    return found
199
+
200
+
201
class FastChunkScheduler:
    """Chooses which output chunks are active each training step.

    Chunk ids are global across all SparseLinear modules in the model; each
    module keeps a mapping from those global ids back to its local chunk
    indices. The scheduler maintains an EMA of per-chunk gradient mass and,
    for the "knn_scheduler" policy, a chunk-to-chunk similarity matrix built
    from warmup-time gradient-mass snapshots.
    """

    def __init__(self, model, scheduler: Scheduler, target_fraction: float, chunk_size: int, device: str,
                 mass_beta: float = 0.95, similarity_history: int = 128, min_similarity_history: int = 8,
                 knn_k: int = 3):
        self.scheduler = scheduler
        self.target_fraction = target_fraction        # final active fraction after annealing
        self.chunk_size = chunk_size
        self.device = device
        self.mass_beta = mass_beta                    # EMA decay for predicted gradient mass
        self.similarity_history = similarity_history  # max warmup snapshots retained
        self.min_similarity_history = min_similarity_history
        self.knn_k = knn_k                            # neighbours used for KNN imputation
        self.linears = get_sparse_linears(model)
        self.module_to_chunk_ids = {}   # module -> global chunk ids
        self.module_to_local_ids = {}   # module -> local chunk ids (0..n_chunks-1)
        offset = 0
        for m in self.linears:
            m.chunk_size = chunk_size
            # Chunking requires out_features to divide evenly into chunks.
            assert m.out_features % chunk_size == 0
            n_chunks = m.out_features // chunk_size
            ids = torch.arange(offset, offset + n_chunks, device=device)
            self.module_to_chunk_ids[m] = ids
            self.module_to_local_ids[m] = torch.arange(n_chunks, device=device)
            offset += n_chunks
        self.n_chunks = offset
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)  # EMA of observed grad mass
        self.mass_history = []     # warmup-only snapshots feeding build_similarity
        self.similarity = None     # [n_chunks, n_chunks] correlation, or None until enough history
        self.active_chunks = torch.zeros(self.n_chunks, dtype=torch.bool, device=device)
        self.sensor_scores = torch.zeros(self.n_chunks, device=device)

    def current_fraction(self, step: int, warmup_steps: int, anneal_steps: int) -> float:
        """Active fraction for this step: 1.0 in warmup, cosine-annealed down
        to ``target_fraction`` over ``anneal_steps``, then constant."""
        if self.scheduler == "dense" or step < warmup_steps:
            return 1.0
        if anneal_steps > 0 and step < warmup_steps + anneal_steps:
            progress = (step - warmup_steps) / anneal_steps
            cosine_mult = 0.5 * (1.0 + math.cos(math.pi * progress))
            return self.target_fraction + (1.0 - self.target_fraction) * cosine_mult
        return self.target_fraction

    def choose_active(self, step: int, warmup_steps: int, anneal_steps: int):
        """Select the active chunk set for this step and push per-module masks."""
        frac = self.current_fraction(step, warmup_steps, anneal_steps)
        if frac >= 0.999 or self.scheduler == "dense":
            self.active_chunks.fill_(True)
            self.install_local_masks()
            return self.active_chunks
        k = max(1, int(frac * self.n_chunks))
        self.active_chunks.fill_(False)
        if self.scheduler == "random":
            idx = torch.randperm(self.n_chunks, device=self.device)[:k]
        elif self.scheduler == "ema_topk":
            # Tiny noise breaks ties between equal scores.
            scores = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            idx = torch.topk(scores, k=k).indices
        elif self.scheduler == "knn_scheduler":
            # Fall back to the EMA scores until sensor scores are populated.
            base = self.sensor_scores if torch.count_nonzero(self.sensor_scores).item() else self.predicted_mass
            scores = base + 1e-9 * torch.rand_like(base)
            idx = torch.topk(scores, k=k).indices
        else:
            raise ValueError(f"Unknown scheduler: {self.scheduler}")
        self.active_chunks[idx] = True
        self.install_local_masks()
        return self.active_chunks

    def install_local_masks(self):
        """Translate the global active mask into per-module local chunk id lists."""
        for m, global_ids in self.module_to_chunk_ids.items():
            local = self.module_to_local_ids[m]
            m.active_chunks = local[self.active_chunks[global_ids]]

    @torch.no_grad()
    def update_from_active_gradients(self, step: int, warmup_steps: int):
        """Refresh per-chunk mass statistics from the gradients just computed.

        Only active chunks carry real gradients this step; their EMA is
        updated (or initialized on first sight). During warmup the per-step
        mass snapshots also feed the similarity matrix.
        """
        current_mass = torch.zeros_like(self.predicted_mass)
        for m, ids in self.module_to_chunk_ids.items():
            if m.weight.grad is None:
                continue
            # L2 mass of the weight (and bias) gradient per chunk.
            w_sq = m.weight.grad.square().view(len(ids), self.chunk_size, -1).sum(dim=(1, 2))
            if m.bias is not None and m.bias.grad is not None:
                w_sq += m.bias.grad.square().view(len(ids), self.chunk_size).sum(dim=1)
            current_mass[ids] = torch.sqrt(w_sq + 1e-30)
        observed = self.active_chunks
        # Chunks observed for the first time adopt the measurement directly;
        # previously seen chunks blend it into the EMA.
        never_seen = observed & (self.predicted_mass == 0)
        already_seen = observed & ~never_seen
        self.predicted_mass[never_seen] = current_mass[never_seen]
        self.predicted_mass[already_seen] = self.mass_beta * self.predicted_mass[already_seen] + (1 - self.mass_beta) * current_mass[already_seen]
        if step < warmup_steps:
            self.mass_history.append(current_mass.detach().clone())
            if len(self.mass_history) > self.similarity_history:
                self.mass_history = self.mass_history[-self.similarity_history:]
            if len(self.mass_history) >= self.min_similarity_history:
                self.similarity = self.build_similarity()
        self.sensor_scores = self.knn_scores(self.active_chunks, current_mass) if self.scheduler == "knn_scheduler" else self.predicted_mass.clone()

    def build_similarity(self):
        """Correlation matrix of chunk mass over the warmup history.

        Negative correlations are clipped to zero, self-similarity is zeroed,
        and similarity is restricted to chunk pairs within the same module.
        """
        H = torch.stack(self.mass_history, dim=0)
        H = H - H.mean(dim=0, keepdim=True)
        H = H / (H.std(dim=0, keepdim=True) + 1e-6)
        S = (H.T @ H) / max(1, H.shape[0] - 1)
        S = torch.clamp(S, min=0.0)
        S.fill_diagonal_(0.0)
        allowed = torch.zeros_like(S, dtype=torch.bool)
        for _, ids in self.module_to_chunk_ids.items():
            allowed[ids[:, None], ids[None, :]] = True
        return torch.where(allowed, S, torch.zeros_like(S))

    def knn_scores(self, active_mask, current_mass):
        """Score every chunk: active chunks by their fresh mass, inactive ones
        imputed from the k most similar active chunks (similarity-weighted
        mean of their current mass); falls back to the EMA otherwise."""
        if self.similarity is None:
            return self.predicted_mass.clone()
        scores = self.predicted_mass.clone()
        scores[active_mask] = current_mass[active_mask]
        active_idx = torch.nonzero(active_mask, as_tuple=False).flatten()
        inactive_idx = torch.nonzero(~active_mask, as_tuple=False).flatten()
        if active_idx.numel() == 0:
            return scores
        S = self.similarity
        for i in inactive_idx.tolist():
            weights = S[i, active_idx]
            if weights.sum() <= 1e-12:
                continue  # no similar active chunk; keep the EMA score
            kk = min(self.knn_k, weights.numel())
            top = torch.topk(weights, k=kk)
            w = top.values
            aidx = active_idx[top.indices]
            scores[i] = (w * current_mass[aidx]).sum() / (w.sum() + 1e-12)
        return scores
324
+
325
+
326
class ChunkedAdam:
    """Adam-style optimizer that only updates active chunks of SparseLinear
    parameters; every other parameter receives a full dense update.

    Notes:
    - No bias-correction terms are applied (plain EMA moments).
    - Betas/eps are fixed at (0.9, 0.999) / 1e-8.
    - Chunking slices along dim 0 (out_features for weights, length for bias).
    """

    def __init__(self, model, lr=3e-4, chunk_size=64):
        self.model = model
        self.lr = lr
        self.chunk_size = chunk_size
        self.state = {}  # param -> {"m": first moment, "v": second moment}
        # Map each sparse parameter back to its module so step() can read
        # the module's currently installed active_chunks.
        self.param_to_sparse_module = {}
        for m in get_sparse_linears(model):
            if m.weight is not None:
                self.param_to_sparse_module[m.weight] = m
            if m.bias is not None:
                self.param_to_sparse_module[m.bias] = m

    def zero_grad(self):
        # Drop gradients entirely rather than zeroing them in place.
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self):
        for p in self.model.parameters():
            if p.grad is None:
                continue
            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            exp_avg = self.state[p]["m"]
            exp_avg_sq = self.state[p]["v"]
            sparse_module = self.param_to_sparse_module.get(p)
            active_chunks = getattr(sparse_module, "active_chunks", None) if sparse_module else None
            if active_chunks is None:
                # Dense update: non-sparse params, or sparse module with no mask.
                exp_avg.mul_(0.9).add_(p.grad, alpha=0.1)
                exp_avg_sq.mul_(0.999).addcmul_(p.grad, p.grad, value=0.001)
                p.sub_(exp_avg / (torch.sqrt(exp_avg_sq) + 1e-8), alpha=self.lr)
            else:
                # Sparse update: only the rows of the active chunks are touched,
                # in parameters, moments, and the applied step alike.
                for local_c in active_chunks.tolist():
                    start = int(local_c) * self.chunk_size
                    end = start + self.chunk_size
                    p_chunk = p[start:end]
                    g_chunk = p.grad[start:end]
                    m_chunk = exp_avg[start:end]
                    v_chunk = exp_avg_sq[start:end]
                    m_chunk.mul_(0.9).add_(g_chunk, alpha=0.1)
                    v_chunk.mul_(0.999).addcmul_(g_chunk, g_chunk, value=0.001)
                    p_chunk.sub_(m_chunk / (torch.sqrt(v_chunk) + 1e-8), alpha=self.lr)
369
+
370
+
371
def evaluate(model, corpus, batch_size, seed):
    """Compute the validation loss on one deterministically sampled batch.

    Uses a fixed-seed CPU generator so every call (and every run) evaluates
    on the same batch. The model is restored to train mode even if the
    forward pass raises, which the original did not guarantee.
    """
    model.eval()
    try:
        with torch.no_grad():
            x, y = corpus.get_batch("val", batch_size, generator=make_cpu_generator(seed))
            _, loss = model(x, y)
    finally:
        model.train()
    return float(loss.item())
378
+
379
+
380
def run_one(label, scheduler, mode, args):
    """Train one (scheduler, backward-mode) configuration from scratch.

    Returns ``{"val": validation loss, "ms": ms per measured step}``. The
    timer restarts at ``warmup_steps + anneal_steps`` so only the fully-sparse
    phase is measured (when that point falls inside the run).
    """
    set_seed(42)
    corpus = CharCorpus(make_synthetic_corpus(), args.block_size, args.device)
    model = MiniGPT(corpus.vocab_size, args.block_size, args.n_layer, args.n_head, args.n_embd, 0.0).to(args.device)
    sched = FastChunkScheduler(model, scheduler, args.active_fraction, args.chunk_size, args.device)
    opt = ChunkedAdam(model, lr=args.lr, chunk_size=args.chunk_size)
    measured_steps = args.steps
    if args.benchmark_sync:
        sync_device(args.device)
    t0 = time.perf_counter()
    for step in range(args.steps):
        if step == args.warmup_steps + args.anneal_steps:
            # Restart the clock once annealing finishes.
            if args.benchmark_sync:
                sync_device(args.device)
            t0 = time.perf_counter()
            measured_steps = args.steps - step
        if scheduler == "dense" or mode == "dense_baseline":
            # Dense baseline: disable all sparse paths.
            for m in get_sparse_linears(model):
                m.sparse_enabled = False
                m.active_chunks = None
        else:
            sched.choose_active(step, args.warmup_steps, args.anneal_steps)
            for m in get_sparse_linears(model):
                m.sparse_enabled = True
                m.sparse_dx = mode == "sparse_dW_sparse_dX"
        # Per-step seeded generator: every configuration sees identical data.
        x, y = corpus.get_batch("train", args.batch_size, generator=make_cpu_generator(step))
        opt.zero_grad()
        _, loss = model(x, y)
        loss.backward()
        if scheduler != "dense" and mode != "dense_baseline":
            sched.update_from_active_gradients(step, args.warmup_steps)
        opt.step()
    if args.benchmark_sync:
        sync_device(args.device)
    elapsed = time.perf_counter() - t0
    return {"val": evaluate(model, corpus, args.batch_size, 12345), "ms": 1000 * elapsed / max(1, measured_steps)}
416
+
417
+
418
def main():
    """Run all scheduler/backward-mode configurations and print a comparison table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=16)
    parser.add_argument("--n_embd", type=int, default=1024)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.10)
    parser.add_argument("--warmup_steps", type=int, default=25)
    parser.add_argument("--anneal_steps", type=int, default=150)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--device", type=str, default="mps")
    parser.add_argument("--benchmark_sync", action="store_true")
    args = parser.parse_args()
    # (label, scheduler policy, backward mode) triples; dense runs first so
    # its ms/step can serve as the speedup baseline for the rest.
    runs = [
        ("dense", "dense", "dense_baseline"),
        ("ema_full_dX", "ema_topk", "sparse_dW_full_dX"),
        ("knn_full_dX", "knn_scheduler", "sparse_dW_full_dX"),
        ("random_full_dX", "random", "sparse_dW_full_dX"),
        ("ema_sparse_dX", "ema_topk", "sparse_dW_sparse_dX"),
        ("knn_sparse_dX", "knn_scheduler", "sparse_dW_sparse_dX"),
        ("random_sparse_dX", "random", "sparse_dW_sparse_dX"),
    ]
    print("\nFast chunked sparse backward with KNN scheduler")
    print(f"device={args.device} steps={args.steps} d={args.n_embd} layers={args.n_layer}")
    print(f"batch={args.batch_size} block={args.block_size} chunk={args.chunk_size}")
    print(f"active={args.active_fraction} warmup={args.warmup_steps} anneal={args.anneal_steps}\n")
    print(f"{'run':>18s} | {'val':>8s} | {'ms/step':>8s} | {'speedup':>8s}")
    print("-" * 58)
    dense_ms = None
    for label, scheduler, mode in runs:
        result = run_one(label, scheduler, mode, args)
        if label == "dense":
            dense_ms = result["ms"]
        speedup = dense_ms / result["ms"] if dense_ms else float("nan")
        print(f"{label:>18s} | {result['val']:8.4f} | {result['ms']:8.2f} | {speedup:8.3f}")


if __name__ == "__main__":
    main()
sparse_transformer_v18_fast_knn_triton.py ADDED
@@ -0,0 +1,1044 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Sparse Transformer v19: Triton-backed v18 KNN Scheduler.
4
+
5
+ This is the CUDA/Triton version of the fast chunked sparse-backward loop.
6
+
7
+ It combines:
8
+ - chunked sparse Linear backward
9
+ - Triton fused active-chunk dW + dBias
10
+ - optional Triton sparse dX
11
+ - EMA / KNN sensor scheduler / random support
12
+ - chunked sparse Adam update
13
+
14
+ Core modes:
15
+ dense
16
+ ema_full_dX
17
+ knn_full_dX
18
+ random_full_dX
19
+ ema_sparse_dX
20
+ knn_sparse_dX
21
+ random_sparse_dX
22
+
23
+ Safe mode:
24
+ knn_full_dX
25
+ Forward dense, dW/db sparse, dX full.
26
+
27
+ Aggressive mode:
28
+ knn_sparse_dX
29
+ Forward dense, dW/db sparse, dX sparse through active chunks.
30
+
31
+ Run:
32
+ python3 sparse_transformer_v18_fast_knn_triton.py --device cuda --benchmark_sync
33
+
34
+ Useful:
35
+ python3 sparse_transformer_v18_fast_knn_triton.py --device cuda --steps 500 --n_embd 1024 --benchmark_sync
36
+ python3 sparse_transformer_v18_fast_knn_triton.py --device cuda --steps 500 --n_embd 2048 --benchmark_sync
37
+
38
+ Notes:
39
+ - This script needs CUDA + Triton.
40
+ - No autotune. Fixed configs reduce compile noise and keep comparisons stable.
41
+ - dW+dBias is fused.
42
+ - Uses block_ptr/tiled loads. On T4 this is not Hopper TMA; do not call it TMA.
43
+ """
44
+
45
+ from __future__ import annotations
46
+
47
+ import argparse
48
+ import math
49
+ import random
50
+ import time
51
+ from typing import Dict, List, Literal, Optional, Tuple
52
+
53
+ import torch
54
+
55
+ torch.set_num_threads(1)
56
+ import torch.nn as nn
57
+ import torch.nn.functional as F
58
+
59
+ try:
60
+ import triton
61
+ import triton.language as tl
62
+ TRITON_AVAILABLE = True
63
+ except Exception:
64
+ triton = None
65
+ tl = None
66
+ TRITON_AVAILABLE = False
67
+
68
+
69
+ Scheduler = Literal["dense", "ema_topk", "knn_scheduler", "random"]
70
+ BackwardMode = Literal["dense_baseline", "sparse_dW_full_dX", "sparse_dW_sparse_dX"]
71
+ KernelBackend = Literal["triton", "torch"]
72
+
73
+
74
+ # ================================================================
75
+ # Utilities
76
+ # ================================================================
77
+
78
def sync_device(device: str) -> None:
    """Block until all queued kernels on ``device`` have finished.

    No-op for CPU or for accelerators that are unavailable at runtime.
    """
    if device == "cuda":
        if torch.cuda.is_available():
            torch.cuda.synchronize()
    elif device == "mps" and hasattr(torch, "mps"):
        torch.mps.synchronize()
83
+
84
+
85
def set_seed(seed: int) -> None:
    """Seed python and torch RNGs, including all CUDA devices when present."""
    random.seed(seed)
    torch.manual_seed(seed)
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)
90
+
91
+
92
def make_cpu_generator(seed: int) -> torch.Generator:
    """Build a CPU-side ``torch.Generator`` seeded with ``seed``.

    ``Generator.manual_seed`` returns the generator itself, enabling the chain.
    """
    return torch.Generator(device="cpu").manual_seed(seed)
96
+
97
+
98
+ # ================================================================
99
+ # Data
100
+ # ================================================================
101
+
102
def make_synthetic_corpus(n_sentences: int = 12000, seed: int = 7) -> str:
    """Generate a deterministic corpus of newline-separated random sentences.

    Each sentence is 4-10 words drawn (with replacement) from a fixed
    vocabulary and terminated with a period.
    """
    rng = random.Random(seed)
    vocab = [
        "ada", "turing", "grace", "lovelace", "gradients",
        "tokens", "circuits", "features", "boldly", "strangely",
        "matrix", "attention", "kernel", "entropy", "signal",
    ]
    sentences = []
    for _ in range(n_sentences):
        # randint first (it was the argument expression), preserving RNG order.
        n_words = rng.randint(4, 10)
        sentences.append(" ".join(rng.choices(vocab, k=n_words)) + ".")
    return "\n".join(sentences)
113
+
114
+
115
class CharCorpus:
    """Character-level corpus with a fixed 90/10 train/validation split.

    Builds char<->id vocab tables from the text and keeps the encoded data
    split into ``train_data`` / ``val_data``.
    """

    def __init__(self, text: str, block_size: int, device: str):
        chars = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for ch, i in self.stoi.items()}
        self.vocab_size = len(chars)
        self.block_size = block_size
        self.device = device

        data = torch.tensor([self.stoi[ch] for ch in text], dtype=torch.long)
        self.train_data = data[: int(0.9 * len(data))]
        self.val_data = data[int(0.9 * len(data)) :]

    def get_batch(
        self,
        split: str,
        batch_size: int,
        generator: Optional[torch.Generator] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Sample ``batch_size`` windows of (input, next-char target) pairs.

        Targets are the inputs shifted by one position; sampling on CPU with
        an optional generator keeps batches reproducible across runs.
        """
        data = self.train_data if split == "train" else self.val_data
        ix = torch.randint(len(data) - self.block_size - 1, (batch_size,), generator=generator)
        x = torch.stack([data[i : i + self.block_size] for i in ix])
        y = torch.stack([data[i + 1 : i + self.block_size + 1] for i in ix])
        return x.to(self.device), y.to(self.device)
139
+
140
+
141
+ # ================================================================
142
+ # Triton sparse Linear backward kernels
143
+ # ================================================================
144
+
145
if TRITON_AVAILABLE:
    @triton.jit
    def _triton_sparse_bwd_dW_db_kernel(
        X_ptr, dY_ptr, dW_ptr, dB_ptr, chunk_ids_ptr,
        M: tl.constexpr, d_in: tl.constexpr, d_out: tl.constexpr, num_active: tl.constexpr,
        stride_xm: tl.constexpr, stride_xk: tl.constexpr,
        stride_dym: tl.constexpr, stride_dyn: tl.constexpr,
        stride_dwn: tl.constexpr, stride_dwk: tl.constexpr,
        HAS_BIAS: tl.constexpr,
        CS: tl.constexpr, BK: tl.constexpr, BM: tl.constexpr,
    ):
        """
        One program computes one [CS, BK] dW tile for one active chunk.
        Bias for that chunk is fused into k_block_id == 0.
        Grid: (num_active, ceil(d_in / BK))
        """
        chunk_linear_id = tl.program_id(0)
        k_block_id = tl.program_id(1)

        # Translate the dense program index into the sparse chunk's row offset.
        chunk_idx = tl.load(chunk_ids_ptr + chunk_linear_id)
        chunk_start = chunk_idx * CS
        k_offset = k_block_id * BK

        # dY is read transposed: strides are swapped so the tile is [CS, BM].
        dy_block_ptr = tl.make_block_ptr(
            base=dY_ptr,
            shape=(d_out, M),
            strides=(stride_dyn, stride_dym),
            offsets=(chunk_start, 0),
            block_shape=(CS, BM),
            order=(1, 0),
        )

        x_block_ptr = tl.make_block_ptr(
            base=X_ptr,
            shape=(M, d_in),
            strides=(stride_xm, stride_xk),
            offsets=(0, k_offset),
            block_shape=(BM, BK),
            order=(1, 0),
        )

        acc_dw = tl.zeros((CS, BK), dtype=tl.float32)
        # Only the first k-block of each chunk accumulates the bias gradient,
        # so dB is written exactly once per chunk.
        compute_bias = HAS_BIAS and (k_block_id == 0)
        acc_db = tl.zeros((CS,), dtype=tl.float32)

        # March over the M (token) dimension in BM-sized tiles.
        for _ in range(0, M, BM):
            dy_t = tl.load(dy_block_ptr, boundary_check=(0, 1))  # [CS, BM]
            x = tl.load(x_block_ptr, boundary_check=(0, 1))      # [BM, BK]

            acc_dw = tl.dot(dy_t, x, acc=acc_dw)

            if compute_bias:
                acc_db += tl.sum(dy_t, axis=1)

            dy_block_ptr = tl.advance(dy_block_ptr, (0, BM))
            x_block_ptr = tl.advance(x_block_ptr, (BM, 0))

        dw_block_ptr = tl.make_block_ptr(
            base=dW_ptr,
            shape=(d_out, d_in),
            strides=(stride_dwn, stride_dwk),
            offsets=(chunk_start, k_offset),
            block_shape=(CS, BK),
            order=(1, 0),
        )
        tl.store(dw_block_ptr, acc_dw.to(dW_ptr.dtype.element_ty), boundary_check=(0, 1))

        if compute_bias:
            rn = chunk_start + tl.arange(0, CS)
            tl.store(dB_ptr + rn, acc_db.to(dB_ptr.dtype.element_ty), mask=rn < d_out)


    @triton.jit
    def _triton_sparse_bwd_dX_kernel(
        dY_ptr, W_ptr, dX_ptr, chunk_ids_ptr,
        M: tl.constexpr, d_in: tl.constexpr, d_out: tl.constexpr, num_active: tl.constexpr,
        stride_dym: tl.constexpr, stride_dyn: tl.constexpr,
        stride_wn: tl.constexpr, stride_wk: tl.constexpr,
        stride_dxm: tl.constexpr, stride_dxk: tl.constexpr,
        CS: tl.constexpr, BM: tl.constexpr, BK: tl.constexpr,
    ):
        """
        One program computes one [BM, BK] tile of dX by accumulating over active chunks.
        Grid: (ceil(M/BM), ceil(d_in/BK)).
        """
        pid_m = tl.program_id(0)
        pid_k = tl.program_id(1)

        m_offset = pid_m * BM
        k_offset = pid_k * BK

        acc = tl.zeros((BM, BK), dtype=tl.float32)

        # dX[m, k] = sum over active chunks c of dY[m, c] @ W[c, k];
        # inactive chunks are simply skipped.
        for i in range(0, num_active):
            chunk_idx = tl.load(chunk_ids_ptr + i)
            chunk_start = chunk_idx * CS

            dy_block_ptr = tl.make_block_ptr(
                base=dY_ptr,
                shape=(M, d_out),
                strides=(stride_dym, stride_dyn),
                offsets=(m_offset, chunk_start),
                block_shape=(BM, CS),
                order=(1, 0),
            )

            w_block_ptr = tl.make_block_ptr(
                base=W_ptr,
                shape=(d_out, d_in),
                strides=(stride_wn, stride_wk),
                offsets=(chunk_start, k_offset),
                block_shape=(CS, BK),
                order=(1, 0),
            )

            dy = tl.load(dy_block_ptr, boundary_check=(0, 1))  # [BM, CS]
            w = tl.load(w_block_ptr, boundary_check=(0, 1))    # [CS, BK]
            acc = tl.dot(dy, w, acc=acc)

        dx_block_ptr = tl.make_block_ptr(
            base=dX_ptr,
            shape=(M, d_in),
            strides=(stride_dxm, stride_dxk),
            offsets=(m_offset, k_offset),
            block_shape=(BM, BK),
            order=(1, 0),
        )
        tl.store(dx_block_ptr, acc.to(dX_ptr.dtype.element_ty), boundary_check=(0, 1))
273
+
274
+
275
def triton_sparse_bwd_dW_db(
    x_flat: torch.Tensor,
    gy_flat: torch.Tensor,
    active_chunks: torch.Tensor,
    chunk_size: int,
    d_out: int,
    has_bias: bool,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Launch the fused dW + dBias kernel over the active output chunks.

    Returns ``(dW, dB)``; rows of inactive chunks stay zero and ``dB`` is
    None when ``has_bias`` is False.
    """
    if not TRITON_AVAILABLE:
        raise RuntimeError("Triton is not available")

    M, d_in = x_flat.shape
    num_active = int(active_chunks.numel())

    dW = torch.zeros((d_out, d_in), device=x_flat.device, dtype=x_flat.dtype)
    dB = torch.zeros((d_out,), device=x_flat.device, dtype=x_flat.dtype) if has_bias else None

    if num_active == 0:
        # Nothing active: all-zero gradients, no launch.
        return dW, dB

    chunk_ids = active_chunks.to(torch.int32).contiguous()

    # Fixed configs.
    CS = int(chunk_size)
    BK = 64
    BM = 64

    grid = (num_active, triton.cdiv(d_in, BK))

    # When there is no bias, dW is passed as a placeholder pointer for dB;
    # HAS_BIAS=False guards every store through it inside the kernel.
    _triton_sparse_bwd_dW_db_kernel[grid](
        x_flat, gy_flat, dW, dB if has_bias else dW, chunk_ids,
        M, d_in, d_out, num_active,
        x_flat.stride(0), x_flat.stride(1),
        gy_flat.stride(0), gy_flat.stride(1),
        dW.stride(0), dW.stride(1),
        HAS_BIAS=has_bias,
        CS=CS, BK=BK, BM=BM,
        num_warps=4,
    )

    return dW, dB
316
+
317
+
318
def triton_sparse_bwd_dX(
    gy_flat: torch.Tensor,
    weight: torch.Tensor,
    active_chunks: torch.Tensor,
    chunk_size: int,
    M: int,
    d_in: int,
) -> torch.Tensor:
    """Compute the chunk-sparse input gradient dX = sum_c gy[:, c] @ W[c].

    Only the active output chunks contribute to dX (an approximation of the
    dense dX = gy @ W restricted to the selected chunks).

    Args:
        gy_flat: [M, d_out] flattened upstream gradient.
        weight: [d_out, d_in] weight matrix (must be contiguous for strides).
        active_chunks: 1-D tensor of active chunk indices into the output dim.
        chunk_size: rows of `weight` per chunk.
        M: number of flattened rows of the input.
        d_in: input feature dimension.

    Returns:
        dX of shape [M, d_in]; all-zero when no chunks are active.

    Raises:
        RuntimeError: if Triton is not importable on this system.
    """
    if not TRITON_AVAILABLE:
        raise RuntimeError("Triton is not available")

    num_active = int(active_chunks.numel())
    d_out = gy_flat.shape[1]
    dX = torch.zeros((M, d_in), device=gy_flat.device, dtype=gy_flat.dtype)

    if num_active == 0:
        return dX

    chunk_ids = active_chunks.to(torch.int32).contiguous()

    CS = int(chunk_size)
    BM = 64
    BK = 64

    # One program per (row tile, d_in tile); each loops over active chunks.
    grid = (triton.cdiv(M, BM), triton.cdiv(d_in, BK))

    _triton_sparse_bwd_dX_kernel[grid](
        gy_flat, weight, dX, chunk_ids,
        M, d_in, d_out, num_active,
        gy_flat.stride(0), gy_flat.stride(1),
        weight.stride(0), weight.stride(1),
        dX.stride(0), dX.stride(1),
        CS=CS, BM=BM, BK=BK,
        num_warps=4,
    )

    return dX
355
+
356
+
357
+ # ================================================================
358
+ # Sparse Linear autograd
359
+ # ================================================================
360
+
361
class ChunkedMaskedLinearTorch(torch.autograd.Function):
    """Dense forward, chunk-sparse backward, implemented with plain PyTorch ops.

    Weight/bias gradients are computed only for the active output chunks;
    dX is either dense (default) or restricted to active chunks (sparse_dx).
    """

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor],
        active_chunks: torch.Tensor,
        chunk_size: int,
        sparse_dx: bool,
    ) -> torch.Tensor:
        # Forward is an ordinary dense linear; sparsity only shapes backward.
        ctx.save_for_backward(x, weight, active_chunks)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = bool(sparse_dx)
        ctx.chunk_size = int(chunk_size)
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):
        x, weight, active_chunks = ctx.saved_tensors
        cs = ctx.chunk_size

        xf = x.reshape(-1, x.shape[-1])
        gyf = grad_y.reshape(-1, grad_y.shape[-1])

        grad_w = torch.zeros_like(weight)
        grad_b = (
            torch.zeros(weight.shape[0], device=weight.device, dtype=weight.dtype)
            if ctx.has_bias
            else None
        )

        # Dense dX unless the caller requested the chunk-sparse approximation.
        grad_x_flat = torch.zeros_like(xf) if ctx.sparse_dx else gyf @ weight

        for chunk in active_chunks.tolist():
            lo = int(chunk) * cs
            hi = lo + cs
            gy_c = gyf[:, lo:hi]

            grad_w[lo:hi, :] = gy_c.transpose(0, 1) @ xf
            if grad_b is not None:
                grad_b[lo:hi] = gy_c.sum(dim=0)
            if ctx.sparse_dx:
                grad_x_flat += gy_c @ weight[lo:hi, :]

        return grad_x_flat.reshape(x.shape), grad_w, grad_b, None, None, None
410
+
411
+
412
class ChunkedMaskedLinearTriton(torch.autograd.Function):
    """Dense forward; backward delegates to the Triton chunk-sparse kernels.

    Mirrors ChunkedMaskedLinearTorch's contract but computes dW/dB (and
    optionally the sparse dX) via triton_sparse_bwd_dW_db / triton_sparse_bwd_dX.
    """

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        weight: torch.Tensor,
        bias: Optional[torch.Tensor],
        active_chunks: torch.Tensor,
        chunk_size: int,
        sparse_dx: bool,
    ) -> torch.Tensor:
        # Forward stays dense; only the backward pass is chunk-sparse.
        ctx.save_for_backward(x, weight, active_chunks)
        ctx.has_bias = bias is not None
        ctx.sparse_dx = bool(sparse_dx)
        ctx.chunk_size = int(chunk_size)
        return F.linear(x, weight, bias)

    @staticmethod
    def backward(ctx, grad_y: torch.Tensor):
        x, weight, active_chunks = ctx.saved_tensors
        chunk_size = ctx.chunk_size

        x_shape = x.shape
        # Triton kernels index via explicit strides, so flatten and make
        # the inputs contiguous before launching.
        x_flat = x.reshape(-1, x.shape[-1]).contiguous()
        gy_flat = grad_y.reshape(-1, grad_y.shape[-1]).contiguous()

        grad_w, grad_b = triton_sparse_bwd_dW_db(
            x_flat=x_flat,
            gy_flat=gy_flat,
            active_chunks=active_chunks,
            chunk_size=chunk_size,
            d_out=weight.shape[0],
            has_bias=ctx.has_bias,
        )

        if ctx.sparse_dx:
            grad_x_flat = triton_sparse_bwd_dX(
                gy_flat=gy_flat,
                weight=weight.contiguous(),
                active_chunks=active_chunks,
                chunk_size=chunk_size,
                M=x_flat.shape[0],
                d_in=x_flat.shape[1],
            )
        else:
            # Full dX is a plain dense matmul; no kernel needed.
            grad_x_flat = gy_flat @ weight

        grad_x = grad_x_flat.reshape(x_shape)
        return grad_x, grad_w, grad_b, None, None, None
461
+
462
+
463
class SparseLinear(nn.Linear):
    """nn.Linear whose backward pass can be restricted to active output chunks.

    The scheduler toggles `sparse_enabled` / `active_chunks`; when disabled
    (or no chunks are installed) this behaves exactly like nn.Linear.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__(in_features, out_features, bias=bias)
        self.sparse_enabled = False                    # scheduler switch
        self.sparse_dx = False                         # sparse dX approximation
        self.active_chunks: Optional[torch.Tensor] = None  # local chunk ids
        self.chunk_size = 64
        self.kernel_backend: KernelBackend = "triton"

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.sparse_enabled or self.active_chunks is None:
            return F.linear(x, self.weight, self.bias)

        fn = (
            ChunkedMaskedLinearTriton
            if self.kernel_backend == "triton"
            else ChunkedMaskedLinearTorch
        )
        return fn.apply(
            x, self.weight, self.bias, self.active_chunks, self.chunk_size, self.sparse_dx
        )
484
+
485
+
486
+ # ================================================================
487
+ # MiniGPT
488
+ # ================================================================
489
+
490
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with SparseLinear projections."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.c_attn = SparseLinear(n_embd, 3 * n_embd)
        self.c_proj = SparseLinear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        # Lower-triangular buffer used to mask out future positions.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("mask", causal.view(1, 1, block_size, block_size))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        q, k, v = self.c_attn(x).split(C, dim=2)

        # [B, T, C] -> [B, n_head, T, head_dim]
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float("-inf"))
        probs = self.dropout(F.softmax(scores, dim=-1))

        out = (probs @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(out)
521
+
522
+
523
class FeedForward(nn.Module):
    """Standard GPT MLP: expand 4x, GELU, project back, dropout."""

    def __init__(self, n_embd: int, dropout: float):
        super().__init__()
        self.c_fc = SparseLinear(n_embd, 4 * n_embd)
        self.c_proj = SparseLinear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
532
+
533
+
534
class Block(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, n_embd: int, n_head: int, block_size: int, dropout: float):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = FeedForward(n_embd, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))
546
+
547
+
548
class MiniGPT(nn.Module):
    """Small GPT-style decoder-only language model over character tokens."""

    def __init__(
        self,
        vocab_size: int,
        block_size: int,
        n_layer: int,
        n_head: int,
        n_embd: int,
        dropout: float,
    ):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        layers = [Block(n_embd, n_head, block_size, dropout) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*layers)
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None):
        _, T = idx.shape
        positions = torch.arange(T, device=idx.device)
        h = self.tok_emb(idx) + self.pos_emb(positions)[None, :, :]
        h = self.ln_f(self.blocks(h))
        logits = self.lm_head(h)

        if targets is None:
            return logits, None
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
580
+
581
+
582
def get_sparse_linears(model: nn.Module) -> List[SparseLinear]:
    """Collect every SparseLinear submodule of *model* in traversal order."""
    found: List[SparseLinear] = []
    for module in model.modules():
        if isinstance(module, SparseLinear):
            found.append(module)
    return found
584
+
585
+
586
+ # ================================================================
587
+ # Scheduler
588
+ # ================================================================
589
+
590
class FastChunkScheduler:
    """Chooses which output chunks of every SparseLinear are active each step.

    Chunks get globally unique ids: each SparseLinear owns a contiguous range
    of `out_features // chunk_size` ids. The scheduler tracks an EMA of
    per-chunk gradient mass ("predicted_mass"), and for the KNN scheduler a
    chunk-by-chunk similarity matrix built from warmup gradient-mass history.
    """

    def __init__(
        self,
        model: nn.Module,
        scheduler: Scheduler,
        target_fraction: float,
        chunk_size: int,
        device: str,
        mass_beta: float = 0.95,
        similarity_history: int = 128,
        min_similarity_history: int = 8,
        knn_k: int = 3,
    ):
        self.model = model
        self.scheduler = scheduler
        self.target_fraction = target_fraction  # steady-state active fraction
        self.chunk_size = chunk_size
        self.device = device
        self.mass_beta = mass_beta  # EMA decay for predicted gradient mass
        self.similarity_history = similarity_history  # max warmup snapshots kept
        self.min_similarity_history = min_similarity_history  # needed before KNN works
        self.knn_k = knn_k  # neighbours used to impute inactive-chunk scores

        self.linears = get_sparse_linears(model)
        self.module_to_chunk_ids: Dict[nn.Module, torch.Tensor] = {}
        self.module_to_local_ids: Dict[nn.Module, torch.Tensor] = {}

        # Assign each module a contiguous range of global chunk ids.
        offset = 0
        for m in self.linears:
            m.chunk_size = chunk_size
            n_chunks = m.out_features // chunk_size
            assert m.out_features % chunk_size == 0, (
                f"out_features {m.out_features} not divisible by chunk_size {chunk_size}"
            )

            ids = torch.arange(offset, offset + n_chunks, device=device)
            local = torch.arange(n_chunks, device=device)

            self.module_to_chunk_ids[m] = ids
            self.module_to_local_ids[m] = local
            offset += n_chunks

        self.n_chunks = offset
        self.predicted_mass = torch.zeros(self.n_chunks, device=device)
        self.mass_history: List[torch.Tensor] = []  # warmup-only snapshots
        self.similarity: Optional[torch.Tensor] = None  # [n_chunks, n_chunks]

        self.active_chunks = torch.zeros(self.n_chunks, dtype=torch.bool, device=device)
        self.sensor_scores = torch.zeros(self.n_chunks, device=device)

    def current_fraction(self, step: int, warmup_steps: int, anneal_steps: int) -> float:
        """Active fraction for this step: 1.0 during warmup, cosine-annealed
        down to target_fraction over `anneal_steps`, then constant."""
        if self.scheduler == "dense":
            return 1.0
        if step < warmup_steps:
            return 1.0
        if anneal_steps > 0 and step < warmup_steps + anneal_steps:
            progress = (step - warmup_steps) / anneal_steps
            cosine_mult = 0.5 * (1.0 + math.cos(math.pi * progress))
            return self.target_fraction + (1.0 - self.target_fraction) * cosine_mult
        return self.target_fraction

    def choose_active(self, step: int, warmup_steps: int, anneal_steps: int) -> torch.Tensor:
        """Select the active chunk set for this step and install it on the
        SparseLinear modules. Returns the global boolean active mask."""
        frac = self.current_fraction(step, warmup_steps, anneal_steps)

        # At (near) full fraction everything is active regardless of policy.
        if frac >= 0.999 or self.scheduler == "dense":
            self.active_chunks.fill_(True)
            self.install_local_masks()
            return self.active_chunks

        k = max(1, int(frac * self.n_chunks))
        self.active_chunks.fill_(False)

        if self.scheduler == "random":
            idx = torch.randperm(self.n_chunks, device=self.device)[:k]

        elif self.scheduler == "ema_topk":
            # Tiny random jitter breaks ties between equal-mass chunks.
            scores = self.predicted_mass + 1e-9 * torch.rand_like(self.predicted_mass)
            idx = torch.topk(scores, k=k).indices

        elif self.scheduler == "knn_scheduler":
            base = self.sensor_scores
            # Fall back to the EMA before any KNN scores exist.
            if torch.count_nonzero(base).item() == 0:
                base = self.predicted_mass
            scores = base + 1e-9 * torch.rand_like(base)
            idx = torch.topk(scores, k=k).indices

        else:
            raise ValueError(f"Unknown scheduler: {self.scheduler}")

        self.active_chunks[idx] = True
        self.install_local_masks()
        return self.active_chunks

    def install_local_masks(self) -> None:
        """Translate the global active mask into per-module local chunk ids."""
        for m, global_ids in self.module_to_chunk_ids.items():
            local = self.module_to_local_ids[m]
            m.active_chunks = local[self.active_chunks[global_ids]]

    @torch.no_grad()
    def update_from_active_gradients(self, step: int, warmup_steps: int) -> torch.Tensor:
        """After backward(): refresh per-chunk gradient-mass statistics.

        Computes the L2 gradient mass per chunk from the (chunk-sparse)
        grads, EMA-updates predicted_mass for observed chunks, records
        warmup history for the similarity matrix, and refreshes the
        scores the next choose_active() will rank by.
        """
        current_mass = torch.zeros_like(self.predicted_mass)

        for m, ids in self.module_to_chunk_ids.items():
            if m.weight.grad is None:
                continue

            # Per-chunk squared-grad sum over the chunk's weight rows (+ bias).
            w_sq = m.weight.grad.square().view(len(ids), self.chunk_size, -1).sum(dim=(1, 2))
            if m.bias is not None and m.bias.grad is not None:
                w_sq += m.bias.grad.square().view(len(ids), self.chunk_size).sum(dim=1)

            current_mass[ids] = torch.sqrt(w_sq + 1e-30)

        observed = self.active_chunks
        # First observation seeds the EMA directly instead of decaying a zero.
        never_seen = observed & (self.predicted_mass == 0)
        already_seen = observed & ~never_seen

        self.predicted_mass[never_seen] = current_mass[never_seen]
        self.predicted_mass[already_seen] = (
            self.mass_beta * self.predicted_mass[already_seen]
            + (1.0 - self.mass_beta) * current_mass[already_seen]
        )

        # Similarity is learned only during warmup, when all chunks are
        # active so every snapshot is a complete observation.
        if step < warmup_steps:
            self.mass_history.append(current_mass.detach().clone())
            if len(self.mass_history) > self.similarity_history:
                self.mass_history = self.mass_history[-self.similarity_history :]
            if len(self.mass_history) >= self.min_similarity_history:
                self.similarity = self.build_similarity()

        if self.scheduler == "knn_scheduler":
            self.sensor_scores = self.knn_scores(self.active_chunks, current_mass)
        else:
            self.sensor_scores = self.predicted_mass.clone()

        return current_mass

    def build_similarity(self) -> torch.Tensor:
        """Correlation-style similarity between chunks over warmup history.

        Returns a [n_chunks, n_chunks] matrix, clamped to non-negative,
        zero diagonal, and restricted to chunk pairs within the same module.
        """
        H = torch.stack(self.mass_history, dim=0)
        # Standardize each chunk's mass series before correlating.
        H = H - H.mean(dim=0, keepdim=True)
        H = H / (H.std(dim=0, keepdim=True) + 1e-6)

        S = (H.T @ H) / max(1, H.shape[0] - 1)
        S = torch.clamp(S, min=0.0)
        S.fill_diagonal_(0.0)

        # Only allow similarity edges between chunks of the same module.
        allowed = torch.zeros_like(S, dtype=torch.bool)
        for _, ids in self.module_to_chunk_ids.items():
            allowed[ids[:, None], ids[None, :]] = True

        return torch.where(allowed, S, torch.zeros_like(S))

    def knn_scores(self, active_mask: torch.Tensor, current_mass: torch.Tensor) -> torch.Tensor:
        """Score every chunk: active chunks get their fresh mass, inactive
        chunks get a similarity-weighted average of their top-k most
        similar active neighbours (falling back to the EMA otherwise)."""
        if self.similarity is None:
            return self.predicted_mass.clone()

        scores = self.predicted_mass.clone()
        scores[active_mask] = current_mass[active_mask]

        active_idx = torch.nonzero(active_mask, as_tuple=False).flatten()
        inactive_idx = torch.nonzero(~active_mask, as_tuple=False).flatten()

        if active_idx.numel() == 0:
            return scores

        S = self.similarity
        for i in inactive_idx.tolist():
            weights = S[i, active_idx]
            if weights.sum() <= 1e-12:
                continue  # no similar active neighbour; keep EMA score

            kk = min(self.knn_k, weights.numel())
            top = torch.topk(weights, k=kk)
            w = top.values
            aidx = active_idx[top.indices]
            scores[i] = (w * current_mass[aidx]).sum() / (w.sum() + 1e-12)

        return scores
767
+
768
+
769
+ # ================================================================
770
+ # Chunked Adam
771
+ # ================================================================
772
+
773
class ChunkedAdam:
    """Adam-style optimizer (fixed betas, no bias correction) that updates
    only the active chunks of SparseLinear parameters.

    Parameters without an installed chunk mask get a full dense update.
    """

    def __init__(self, model: nn.Module, lr: float = 3e-4, chunk_size: int = 64):
        self.model = model
        self.lr = lr
        self.chunk_size = chunk_size
        self.state: Dict[torch.nn.Parameter, Dict[str, torch.Tensor]] = {}

        # Map each SparseLinear weight/bias back to its owning module so
        # step() can look up the module's current active chunk ids.
        self.param_to_sparse_module: Dict[torch.nn.Parameter, SparseLinear] = {}
        for module in get_sparse_linears(model):
            if module.weight is not None:
                self.param_to_sparse_module[module.weight] = module
            if module.bias is not None:
                self.param_to_sparse_module[module.bias] = module

    def zero_grad(self):
        for p in self.model.parameters():
            p.grad = None

    @torch.no_grad()
    def step(self):
        def adam_update(pv, gv, mv, vv):
            # In-place Adam update on (possibly sliced) views of the state.
            mv.mul_(0.9).add_(gv, alpha=0.1)
            vv.mul_(0.999).addcmul_(gv, gv, value=0.001)
            pv.sub_(mv / (torch.sqrt(vv) + 1e-8), alpha=self.lr)

        for p in self.model.parameters():
            grad = p.grad
            if grad is None:
                continue

            if p not in self.state:
                self.state[p] = {"m": torch.zeros_like(p), "v": torch.zeros_like(p)}
            st = self.state[p]

            owner = self.param_to_sparse_module.get(p)
            chunks = getattr(owner, "active_chunks", None) if owner else None

            if chunks is None:
                adam_update(p, grad, st["m"], st["v"])
                continue

            # Only the active chunks' slices of parameter and state move.
            for c in chunks.tolist():
                lo = int(c) * self.chunk_size
                hi = lo + self.chunk_size
                adam_update(p[lo:hi], grad[lo:hi], st["m"][lo:hi], st["v"][lo:hi])
824
+
825
+ # ================================================================
826
+ # Training
827
+ # ================================================================
828
+
829
def evaluate(model: nn.Module, corpus: CharCorpus, batch_size: int, seed: int) -> float:
    """Compute the validation loss on one deterministic batch.

    Args:
        model: the MiniGPT (or compatible) model; returns (logits, loss).
        corpus: data source providing get_batch("val", ...).
        batch_size: number of sequences in the eval batch.
        seed: seed for the CPU generator, so eval is reproducible across runs.

    Returns:
        The scalar validation loss as a Python float.
    """
    model.eval()
    try:
        with torch.no_grad():
            x, y = corpus.get_batch("val", batch_size, generator=make_cpu_generator(seed))
            _, loss = model(x, y)
        return float(loss.item())
    finally:
        # Restore training mode even if the forward pass raises, so a failed
        # eval can't silently leave dropout/etc. disabled for later training.
        model.train()
836
+
837
+
838
def run_one(
    scheduler_name: Scheduler,
    mode: BackwardMode,
    kernel_backend: KernelBackend,
    device: str,
    steps: int,
    batch_size: int,
    block_size: int,
    n_layer: int,
    n_head: int,
    n_embd: int,
    chunk_size: int,
    active_fraction: float,
    warmup_steps: int,
    anneal_steps: int,
    benchmark_sync: bool,
) -> Dict[str, float]:
    """Train one MiniGPT configuration and report val loss + ms/step.

    Timing is restarted once the schedule reaches steady state
    (warmup + anneal), so the reported ms/step measures only the
    target-fraction regime. With benchmark_sync, the device is
    synchronized around the timed region for accurate wall clock.
    """
    set_seed(42)

    corpus = CharCorpus(make_synthetic_corpus(), block_size, device)
    model = MiniGPT(corpus.vocab_size, block_size, n_layer, n_head, n_embd, 0.0).to(device)

    for m in get_sparse_linears(model):
        m.chunk_size = chunk_size
        m.kernel_backend = kernel_backend

    sched = FastChunkScheduler(
        model=model,
        scheduler=scheduler_name,
        target_fraction=active_fraction,
        chunk_size=chunk_size,
        device=device,
    )
    opt = ChunkedAdam(model, lr=3e-4, chunk_size=chunk_size)

    measured_steps = steps

    if benchmark_sync:
        sync_device(device)
    t0 = time.perf_counter()

    for step in range(steps):
        # Restart the timer at steady state so warmup/anneal don't pollute
        # the ms/step measurement.
        if step == warmup_steps + anneal_steps:
            if benchmark_sync:
                sync_device(device)
            t0 = time.perf_counter()
            measured_steps = steps - step

        if scheduler_name == "dense" or mode == "dense_baseline":
            # Dense run: disable sparsity on every layer.
            for m in get_sparse_linears(model):
                m.sparse_enabled = False
                m.active_chunks = None
        else:
            sched.choose_active(step, warmup_steps=warmup_steps, anneal_steps=anneal_steps)
            for m in get_sparse_linears(model):
                m.sparse_enabled = True
                m.sparse_dx = mode == "sparse_dW_sparse_dX"

        x, y = corpus.get_batch("train", batch_size, generator=make_cpu_generator(step))

        opt.zero_grad()
        _, loss = model(x, y)
        loss.backward()

        # Feed the fresh chunk gradients back into the scheduler stats.
        if scheduler_name != "dense" and mode != "dense_baseline":
            sched.update_from_active_gradients(step=step, warmup_steps=warmup_steps)

        opt.step()

    if benchmark_sync:
        sync_device(device)
    elapsed = time.perf_counter() - t0

    val_loss = evaluate(model, corpus, batch_size, seed=12345)
    return {"val": val_loss, "ms": 1000.0 * elapsed / max(1, measured_steps)}
913
+
914
+
915
def run_correctness_smoke(
    device: str,
    chunk_size: int = 64,
    dtype: torch.dtype = torch.float32,
) -> None:
    """Check the Triton sparse backward kernels against a dense reference.

    Prints per-shape max-abs errors for dW, dB and dX; skips (with a
    message) when CUDA or Triton is unavailable.
    """
    if device != "cuda":
        print("Skipping Triton correctness smoke: device is not cuda")
        return
    if not TRITON_AVAILABLE:
        print("Skipping Triton correctness smoke: Triton not available")
        return

    print("\nTriton sparse Linear correctness smoke")
    print("-" * 60)

    torch.manual_seed(123)

    for d_in, d_out in ((512, 2048), (1024, 4096)):
        M = 512
        n_chunks = d_out // chunk_size
        n_active = max(1, int(0.1 * n_chunks))
        active = torch.randperm(n_chunks, device=device)[:n_active].sort().values

        x = torch.randn(M, d_in, device=device, dtype=dtype)
        w = torch.randn(d_out, d_in, device=device, dtype=dtype)
        gy = torch.randn(M, d_out, device=device, dtype=dtype)

        # Dense per-chunk reference computed with plain matmuls.
        ref_dw = torch.zeros_like(w)
        ref_db = torch.zeros(d_out, device=device, dtype=dtype)
        ref_dx = torch.zeros_like(x)
        for c in active.tolist():
            lo = c * chunk_size
            hi = lo + chunk_size
            ref_dw[lo:hi] = gy[:, lo:hi].transpose(0, 1) @ x
            ref_db[lo:hi] = gy[:, lo:hi].sum(0)
            ref_dx += gy[:, lo:hi] @ w[lo:hi]

        tri_dw, tri_db = triton_sparse_bwd_dW_db(x, gy, active, chunk_size, d_out, True)
        tri_dx = triton_sparse_bwd_dX(gy, w, active, chunk_size, M, d_in)

        dw_err = float((tri_dw - ref_dw).abs().max().item())
        db_err = float((tri_db - ref_db).abs().max().item())
        dx_err = float((tri_dx - ref_dx).abs().max().item())

        print(f"d_in={d_in:4d} d_out={d_out:5d}: dW={dw_err:.6f} dB={db_err:.6f} dX={dx_err:.6f}")
962
+
963
+
964
def main() -> None:
    """CLI entry point: optional kernel smoke test, then a benchmark sweep
    over scheduler/backward-mode combinations, printing val loss, ms/step
    and speedup relative to the dense baseline."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=500)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--block_size", type=int, default=256)
    parser.add_argument("--n_layer", type=int, default=4)
    parser.add_argument("--n_head", type=int, default=16)
    parser.add_argument("--n_embd", type=int, default=1024)
    parser.add_argument("--chunk_size", type=int, default=64)
    parser.add_argument("--active_fraction", type=float, default=0.10)
    parser.add_argument("--warmup_steps", type=int, default=25)
    parser.add_argument("--anneal_steps", type=int, default=150)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--kernel_backend", type=str, default="triton", choices=["triton", "torch"])
    parser.add_argument("--benchmark_sync", action="store_true")
    parser.add_argument("--skip_correctness", action="store_true")
    args = parser.parse_args()

    # Triton kernels require a CUDA device and an importable triton package.
    if args.kernel_backend == "triton":
        if args.device != "cuda":
            raise RuntimeError("--kernel_backend triton requires --device cuda")
        if not TRITON_AVAILABLE:
            raise RuntimeError("Triton is not available")

    if not args.skip_correctness:
        run_correctness_smoke(device=args.device, chunk_size=args.chunk_size)

    # (label, scheduler policy, backward mode) combinations to benchmark.
    runs: List[Tuple[str, Scheduler, BackwardMode]] = [
        ("dense", "dense", "dense_baseline"),
        ("ema_full_dX", "ema_topk", "sparse_dW_full_dX"),
        ("knn_full_dX", "knn_scheduler", "sparse_dW_full_dX"),
        ("random_full_dX", "random", "sparse_dW_full_dX"),
        ("ema_sparse_dX", "ema_topk", "sparse_dW_sparse_dX"),
        ("knn_sparse_dX", "knn_scheduler", "sparse_dW_sparse_dX"),
        ("random_sparse_dX", "random", "sparse_dW_sparse_dX"),
    ]

    print("\nTriton-backed fast chunked sparse backward with KNN scheduler")
    print(f"device={args.device} backend={args.kernel_backend} triton_available={TRITON_AVAILABLE}")
    print(f"steps={args.steps} d={args.n_embd} layers={args.n_layer}")
    print(f"batch={args.batch_size} block={args.block_size} chunk={args.chunk_size}")
    print(f"active={args.active_fraction} warmup={args.warmup_steps} anneal={args.anneal_steps}\n")
    print(f"{'run':>18s} | {'val':>8s} | {'ms/step':>8s} | {'speedup':>8s}")
    print("-" * 58)

    dense_ms: Optional[float] = None

    for label, scheduler, mode in runs:
        result = run_one(
            scheduler_name=scheduler,
            mode=mode,
            kernel_backend=args.kernel_backend,  # type: ignore[arg-type]
            device=args.device,
            steps=args.steps,
            batch_size=args.batch_size,
            block_size=args.block_size,
            n_layer=args.n_layer,
            n_head=args.n_head,
            n_embd=args.n_embd,
            chunk_size=args.chunk_size,
            active_fraction=args.active_fraction,
            warmup_steps=args.warmup_steps,
            anneal_steps=args.anneal_steps,
            benchmark_sync=args.benchmark_sync,
        )

        # The dense run comes first and provides the speedup baseline.
        if label == "dense":
            dense_ms = result["ms"]

        speedup = dense_ms / result["ms"] if dense_ms is not None else float("nan")

        print(
            f"{label:>18s} | "
            f"{result['val']:8.4f} | "
            f"{result['ms']:8.2f} | "
            f"{speedup:8.3f}"
        )


if __name__ == "__main__":
    main()