Jackoatmon committed
Commit c2bf4b6 · verified · 1 Parent(s): e317e25

Update Feather H200 runtime: Nemotron streaming and HTM force-CPU canary fixes

Files changed (41)
  1. overlay/htm_rust/bench_gpu.py +81 -81
  2. overlay/htm_rust/build.rs +6 -12
  3. overlay/htm_rust/docs/GPU_HTM.md +302 -302
  4. overlay/htm_rust/src/gpu/fused.rs +50 -33
  5. overlay/htm_rust/src/gpu/kernels/sp_boost_fused.cu +59 -59
  6. overlay/htm_rust/src/gpu/kernels/sp_duty.cu +45 -45
  7. overlay/htm_rust/src/gpu/kernels/sp_learn.cu +45 -45
  8. overlay/htm_rust/src/gpu/kernels/sp_overlap.cu +78 -78
  9. overlay/htm_rust/src/gpu/kernels/sp_topk.cu +117 -117
  10. overlay/htm_rust/src/gpu/kernels/tm_activate.cu +66 -66
  11. overlay/htm_rust/src/gpu/kernels/tm_anomaly.cu +43 -43
  12. overlay/htm_rust/src/gpu/kernels/tm_grow.cu +155 -155
  13. overlay/htm_rust/src/gpu/kernels/tm_learn.cu +75 -75
  14. overlay/htm_rust/src/gpu/kernels/tm_predict.cu +102 -102
  15. overlay/htm_rust/src/gpu/kernels/tm_punish.cu +64 -64
  16. overlay/htm_rust/src/gpu/kernels/tm_reset.cu +36 -36
  17. overlay/htm_rust/src/gpu/mod.rs +549 -549
  18. overlay/htm_rust/src/gpu/sp_gpu.rs +796 -796
  19. overlay/htm_rust/src/gpu/tm_gpu.rs +460 -460
  20. overlay/htm_rust/uv.lock +8 -8
  21. overlay/hydra/config.py +2 -2
  22. overlay/hydra/engram.py +121 -104
  23. overlay/hydra/model.py +1 -0
  24. overlay/scripts/autoresearch.py +517 -517
  25. overlay/scripts/chat.py +458 -458
  26. overlay/scripts/chat_eval.py +300 -300
  27. overlay/scripts/compile_debug.py +213 -213
  28. overlay/scripts/dataset_audit.py +241 -241
  29. overlay/scripts/download_sft_data.py +457 -457
  30. overlay/scripts/eval_quality.py +525 -525
  31. overlay/scripts/fetch_corpus.py +211 -211
  32. overlay/scripts/grad_probe.py +196 -196
  33. overlay/scripts/launch_feather_hf_job.py +8 -2
  34. overlay/scripts/profile_forward.py +87 -87
  35. overlay/scripts/run_domain_expanded_pretrain.sh +1 -5
  36. overlay/scripts/sample_utils.py +107 -107
  37. overlay/scripts/sft.py +559 -559
  38. overlay/scripts/sft_orchestrator.sh +165 -165
  39. overlay/subsystems/fused_sdr_project.py +7 -0
  40. overlay/subsystems/htm.py +7 -1
  41. overlay/subsystems/sdr_semantic.py +5 -27
overlay/htm_rust/bench_gpu.py CHANGED
@@ -1,81 +1,81 @@
"""Microbenchmark: CPU vs GPU HTMLayer forward at HYDRA training sizes.

Usage:
    source .venv/bin/activate
    export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
    python htm_rust/bench_gpu.py
"""
import os
import sys
import time

# Ensure /home/mikeb/work/feather is on sys.path so `subsystems` imports.
_FEATHER = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _FEATHER not in sys.path:
    sys.path.insert(0, _FEATHER)

import numpy as np
import torch

from subsystems.htm import HTMLayer


def bench(layer: HTMLayer, sdr: torch.Tensor, warmup: int = 1, iters: int = 3) -> float:
    """Return mean ms/forward."""
    for _ in range(warmup):
        _ = layer(sdr)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(iters):
        _ = layer(sdr)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return dt * 1000 / iters


def main() -> None:
    # HYDRA training config: B=8, T=2048, bits=16384, cols=2048.
    B, T, D = int(os.environ.get("B", 8)), int(os.environ.get("T", 2048)), 16384
    n_cols = 2048

    print(f"config: B={B} T={T} D={D} n_cols={n_cols}")
    print(f"torch: {torch.__version__} cuda={torch.cuda.is_available()}")

    # Build a fixed sparse SDR once.
    rng = np.random.default_rng(0)
    sdr = np.zeros((B, T, D), dtype=bool)
    on = int(D * 0.02)
    for b in range(B):
        for t in range(T):
            idx = rng.choice(D, size=on, replace=False)
            sdr[b, t, idx] = True
    sdr_t = torch.from_numpy(sdr)

    # CPU baseline.
    print("\n--- CPU ---")
    cpu_layer = HTMLayer(
        input_bits=D, n_columns=n_cols, cells_per_column=32,
        batch_size=B, seed=42, use_gpu=False,
    )
    cpu_layer.train()
    cpu_ms = bench(cpu_layer, sdr_t, warmup=1, iters=2)
    print(f"CPU: {cpu_ms:.1f} ms/forward ({cpu_ms/T:.2f} ms/step × T={T})")

    # GPU.
    print("\n--- GPU ---")
    gpu_layer = HTMLayer(
        input_bits=D, n_columns=n_cols, cells_per_column=32,
        batch_size=B, seed=42, use_gpu=True,
    )
    gpu_layer.train()
    sdr_cuda = sdr_t.cuda()
    gpu_ms = bench(gpu_layer, sdr_cuda, warmup=1, iters=2)
    print(f"GPU: {gpu_ms:.1f} ms/forward ({gpu_ms/T:.2f} ms/step × T={T})")

    print(f"\nSpeedup: {cpu_ms / gpu_ms:.2f}x")


if __name__ == "__main__":
    main()
overlay/htm_rust/build.rs CHANGED
@@ -26,11 +26,8 @@ fn main() {
         return;
     }
 
-    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
-    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_86".into());
-
-    // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
-    let base_kernels: &[&str] = &[
+    // Kernels to compile. Each .cu file → one .ptx file, embedded by name.
+    let kernels: &[&str] = &[
         "sp_overlap",
         "sp_topk",
         "sp_learn",
@@ -43,20 +40,17 @@ fn main() {
         "tm_grow",
         "tm_anomaly",
         "tm_reset",
+        "htm_fused_step",
     ];
 
-    // htm_fused_step now compiles for ALL architectures (sm_80+).
-    // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
-    // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
-    // with grid.sync() for cross-block synchronization (cooperative launch).
-    let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
-
     let kernels_dir = PathBuf::from("src/gpu/kernels");
-    for k in &kernels {
+    for k in kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         println!("cargo:rerun-if-changed={}", src.display());
     }
 
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
+    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_86".into());
 
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");
overlay/htm_rust/docs/GPU_HTM.md CHANGED
@@ -1,302 +1,302 @@
# GPU HTM Backend

## Status

**FUSED MEGAKERNEL: entire T-timestep SP+TM forward collapsed into a single
CUDA launch per forward pass.**

* Legacy path: 12 kernels × T=2048 timesteps = 24K launches per forward.
* Fused path: **1 launch per forward** (24000× launch-overhead reduction).
* End-to-end training throughput: **~2.7k → ~60k tok/sec** (~22x speedup).
* Fused path uses per-column threshold inhibition instead of global top-K
  (see §Fused Kernel below — this is a real architectural change).

## Fused Kernel

### Why

Global top-K column selection requires cross-block synchronization at every
timestep. On WSL2/sm_86 without `-rdc=true`, `cooperative_groups::grid_sync()`
is unreliable. Without a grid sync, collapsing the T-loop into one kernel is
impossible, so every forward pays 12×T kernel launches and 90%+ of runtime is
CUDA launch overhead + small-kernel tails.

### How

Replace global top-K with **per-column threshold activation**:

    is_active[c] = (overlap[c] * boost[c]) > inhibition_threshold[c]

`inhibition_threshold[c]` is a per-column scalar, learned via EMA update:

    err = active_duty[c] - sparsity_target
    new_thr = clamp(thr + thr_adapt_rate * err * 100, 0.1, 1000)

This is biologically grounded (GABAergic local lateral inhibition in
neocortical columns) and supported by HTM theory. The duty-cycle-driven
feedback loop was already present; we simply redirect its output to drive
the activation threshold instead of multiplicative boost. The global top-K,
which had no biological basis, is removed.

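A minimal CUDA sketch of this rule, one thread per column (names are
illustrative; in the fused path this logic lives inside the megakernel rather
than a standalone kernel):

```cuda
extern "C" __global__ void threshold_inhibition(
    const float * __restrict__ overlap,      // (n_cols,) raw overlap counts
    const float * __restrict__ boost,        // (n_cols,)
    const float * __restrict__ active_duty,  // (n_cols,) EMA firing rate
    float * __restrict__ thr,                // (n_cols,) inhibition_threshold
    unsigned char * __restrict__ is_active,  // (n_cols,) 0/1 out
    float sparsity_target,
    float thr_adapt_rate,
    unsigned int n_cols
) {
    unsigned int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (c >= n_cols) return;
    // Activation: boosted overlap must beat the column's learned threshold.
    is_active[c] = (overlap[c] * boost[c]) > thr[c] ? 1 : 0;
    // Threshold EMA: firing above target raises thr, below target lowers it.
    float err = active_duty[c] - sparsity_target;
    float t = thr[c] + thr_adapt_rate * err * 100.0f;
    thr[c] = fminf(fmaxf(t, 0.1f), 1000.0f);   // clamp(0.1, 1000)
}
```
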
### Cross-block coherence

- **Ping-pong bitsets** for `cell_active_bits` and `cell_winner_bits`: at
  even t write to `_a`, read from `_b`; at odd t reversed. This eliminates
  the need for an in-place snapshot kernel between timesteps.
- **Primary path: cooperative launch + hardware grid sync**. Host code probes
  `CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH`, computes the cooperative whole-grid
  residency limit from occupancy, and launches the fused megakernel with
  `cuLaunchCooperativeKernel`. In-kernel barriers use
  `cooperative_groups::this_grid().sync()`.
- **Fallback path: software grid barrier** via a 3-slot atomic counter array
  (`barrier_counters`). This remains as a compatibility fallback when
  cooperative launch is unavailable (see the sketch after this list).
- **Launch invariant**: cooperative launch is capped to the hardware residency
  limit for `blockDim.x = 1024`; the software fallback remains capped
  conservatively (`HTM_FUSED_GRID_CAP`, default 8) to avoid whole-grid spin
  deadlock.

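For reference, a minimal sketch of the software grid-barrier pattern, assuming
a zero-initialized 3-slot counter array; rotating slots between consecutive
barriers makes counter reuse race-free. This is the generic pattern, not the
repo's exact code:

```cuda
__device__ void soft_grid_barrier(unsigned int *counters,   // length 3, zeroed
                                  unsigned int barrier_id,  // increments per use
                                  unsigned int n_blocks) {
    __syncthreads();  // everyone in this block has arrived
    if (threadIdx.x == 0) {
        unsigned int slot = barrier_id % 3;  // rotate so reuse can't race
        __threadfence();                     // publish prior global writes
        unsigned int ticket = atomicAdd(&counters[slot], 1u);
        if (ticket == n_blocks - 1) {
            atomicExch(&counters[slot], 0u); // last arrival releases the grid
        } else {
            while (atomicAdd(&counters[slot], 0u) != 0u) { /* spin */ }
        }
    }
    __syncthreads();  // block waits for thread 0 to clear the barrier
}
```

The spin only terminates if every block is resident simultaneously, which is
why the fallback caps the grid size (`HTM_FUSED_GRID_CAP`).
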
### Kernel structure

```
for t in 0..T:
    # Phase 0: clear curr_active/curr_winner for my column range
    grid_barrier()
    # Phase A: SP overlap → boost → threshold → SP learn → duty + threshold EMA
    grid_barrier()
    # Phase B: TM predict (per cell, per seg) → TM learn (reinforce on match)
    #          → burst if none predicted → segment grow/reinforce
    grid_barrier()
    # Phase C: block 0 writes anomaly[t]
```

Each warp owns a contiguous slice of columns. At grid=24 blocks × 32 warps =
768 warps, n_columns=2048 → 2-3 columns per warp.

### Parity with legacy GPU path

**Semantics diverge**. Legacy: exactly `k = round(sparsity * n_cols)` columns
active per step. Fused: variable, converging to `sparsity * n_cols` on
average via the per-column EMA. Anomaly decay on repeating sequences is
preserved (see `gpu_fused_tm_anomaly_decays_on_repeating_sequence` test).

This is an intentional architectural change committed under
`no-bypass/full-architecture` per program.md rules. The legacy top-K path
(`step_many_cuda`) remains available for reference and can be re-enabled via
`HYDRA_HTM_FUSED=0`.

### Tests

- `gpu_threshold_converges_to_sparsity` (tests.rs): 1000-step warmup on
  random SDRs, then measure mean active cols/step on next 200 steps. Must
  land within [0.25×, 4×] of `sparsity_target * n_cols`.
- `gpu_fused_tm_anomaly_decays_on_repeating_sequence`: feed A,B,C repeating
  for 300 steps. Late anomaly must be < early anomaly AND < 0.5.

## Legacy Pipeline (kept for fallback)

* SP: 5 kernels, bit-identical parity with CPU under strict-parity mode.
* TM: 7 kernels, relaxed-parity with CPU.
* Speedup at training size (B=8, T=2048, bits=16384): **3.83x** vs CPU.

## Building

CPU-only (default, zero CUDA dep):
```bash
cargo build --release
```

GPU-enabled:
```bash
export PATH=/usr/local/cuda-12.1/bin:$PATH
export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
export HTM_PTX_VERSION=7.8   # lower if driver older than nvcc
cargo build --release --features gpu
cargo test --release --features gpu --lib   # fused path includes cooperative launch + grid-sync tests

# Python wheel:
maturin develop --release --features gpu --manifest-path htm_rust/Cargo.toml
```

## Architecture

### Module layout
```
src/gpu/
  mod.rs       # HTMRegionGpu pyclass + step_many_gpu (full pipeline)
  sp_gpu.rs    # Persistent SP device buffers + step_batch_with_tm
  tm_gpu.rs    # Persistent TM device buffers + step (predict→activate→learn)
  tests.rs     # CPU-vs-GPU SP parity + end-to-end TM anomaly decay
  kernels/
    sp_overlap.cu      # per-column overlap reduction
    sp_topk.cu         # k-WTA top-K winner selection
    sp_learn.cu        # Hebbian +inc/-dec on proximal synapses
    sp_duty.cu         # EMA duty-cycle update
    sp_boost_fused.cu  # fused mean + exp boost (GPU-side)
    tm_reset.cu        # per-step: snapshot active→prev, clear buffers
    tm_predict.cu      # per-cell: score owned segments vs prev_active_bits
    tm_activate.cu     # per-col: activate predicted cells OR burst
    tm_learn.cu        # per-cell: reinforce correctly-predicted segments
    tm_punish.cu       # per-cell: decay matching segs on inactive cols
    tm_grow.cu         # per-bursting-col: reuse matching seg OR create new,
                       #   grow synapses to prev_winners
    tm_anomaly.cu      # per-step: unpredicted/active ratio
```

### Persistent SP state (per region, unchanged from Phase 1)
At n_cols=2048, S=40, bits=16384: ~355 KB persistent + ~90 KB transient.

### Persistent TM state (per region)

Capacity knobs (configured in `tm_gpu.rs`):
- `MAX_SEGMENTS_PER_CELL = 4`
- `MAX_SYN_PER_SEGMENT = 20`

At cells_per_col=32, n_cols=2048:
- `n_cells = 65_536`
- `n_segments_max = 262_144` (~262K)
- `n_synapses_max = 5_242_880` (~5.2M)

| Buffer                 | Shape / type      | Notes                                  |
|------------------------|-------------------|----------------------------------------|
| `seg_cell_id`          | (n_segs,) u32     | owning cell; U32_MAX = unused          |
| `seg_syn_count`        | (n_segs,) u32     | #active synapses in slot               |
| `syn_presyn`           | (n_segs × S,) u32 | presynaptic cell indices               |
| `syn_perm`             | (n_segs × S,) i16 | permanence scaled 0..32767 (0.0..1.0)  |
| `cell_seg_count`       | (n_cells,) u32    | segments allocated on each cell        |
| `cell_active_bits`     | (n_cells/32,) u32 | packed bitset, current step            |
| `cell_winner_bits`     | (n_cells/32,) u32 | packed bitset, current step            |
| `cell_predictive_bits` | (n_cells/32,) u32 | set by predict, read by activate       |
| `prev_active_bits`     | (n_cells/32,) u32 | snapshot at step start                 |
| `prev_winner_bits`     | (n_cells/32,) u32 | snapshot at step start                 |
| `col_predicted`        | (n_cols,) u8      | set if any cell in col is predictive   |
| `col_best_match`       | (n_cols,) u32     | packed (pot<<21 \| seg_id), atomicMax  |
| `seg_num_active_conn`  | (n_segs,) u32     | output of predict                      |
| `seg_num_active_pot`   | (n_segs,) u32     | output of predict                      |
| `unpredicted_count`    | (1,) u32          | atomic counter for anomaly             |
| `burst_cols_flat`      | (n_cols,) u32     | list of bursting cols                  |
| `burst_cols_count`     | (1,) u32          | length of above list                   |

**Total per TM region: ~42 MB.** Batch of 8 regions: ~340 MB. Fits 6 GB RTX 3060.

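The total is dominated by the two synapse arrays; a back-of-envelope tally of
the large terms from the shapes above (arithmetic, not a measured breakdown):

    syn_presyn       5_242_880 × 4 B ≈ 21.0 MB
    syn_perm         5_242_880 × 2 B ≈ 10.5 MB
    seg_* u32 arrays 4 × 262_144 × 4 B ≈ 4.2 MB
    bitsets + per-column buffers: < 1 MB combined

These account for roughly 36 MB; the rest of the ~42 MB figure is presumably
the smaller buffers plus allocator padding.
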
### Per-step pipeline (single iteration of `step_batch_with_tm`)

```
SP side
---------
1. D2D input slice → inp_dev
2. sp_overlap       (n_cols blocks)
3. sp_topk          (1 block)
4. sp_learn         (n_cols blocks)
5. sp_duty          (n_cols/256 blocks)
6. sp_boost_fused   (1 block)
7. D2D active_mask → cols_dev[ti]

TM side
---------
8.  tm_reset_step   (ceil(n_cells/32/256))
9.  tm_predict      (n_cells blocks × 32 thr)
10. tm_activate     (n_cols/256 blocks)
11. tm_anomaly      (1 block)
if learn:
12. tm_learn        (n_cells blocks)
13. tm_punish       (n_cells blocks)
14. tm_grow         (n_cols blocks — early-exits)
```

No host sync in the T-step loop. At the end one `dtoh_sync_copy` each for
`cols_dev` (T × n_cols bytes) and `anom_dev` (T × f32).

## Parity

### SP: strict bit-identical
See Phase 1 docs — `gpu_sp_matches_cpu_with_learn` over 50 steps passes exact.

### TM: relaxed-parity
The GPU TM has known, deliberate deviations from CPU to admit massive parallelism:

1. **Bursting winner cell**: CPU picks the least-used cell (fewest segments) with
   random tiebreak. GPU picks cell 0 of the column (deterministic, branch-free).
   Learning dynamics are preserved because segment creation/reinforcement is
   the dominant effect, not which specific cell in a bursting column wins.

2. **Permanence storage**: i16 fixed-point (scale 32767) vs f32. Rounding
   differs by <=1 ULP of the scale (~3.0e-5), below any meaningful learning
   quantum (inc=0.10, dec=0.10, predicted_segment_dec=0.10). See the sketch
   after this list.

3. **Grown synapse candidate order**: CPU randomly samples from prev_winner_cells.
   GPU iterates prev_winner_bits words in a pseudo-random rotated order keyed
   by (bursting_col_idx, iter_seed). Output is a different subset but same size.

4. **Segment LRU eviction**: CPU tracks `last_used_iteration` per segment.
   GPU wraps around (slot = count % max_segments_per_cell). In the autoresearch
   loop where TM resets every forward, eviction rarely triggers.

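A hypothetical helper illustrating the i16 update in item 2 (the storage
scheme is as documented; the helper itself and its name are illustrative):

```cuda
// Permanence lives in i16 with 32767 == 1.0. Apply a float delta,
// round to nearest, and clamp to [0, 32767] (i.e. [0.0, 1.0]).
__device__ short perm_update_i16(short perm, float delta) {
    int scaled = (int)perm + __float2int_rn(delta * 32767.0f);
    scaled = max(0, min(32767, scaled));
    return (short)scaled;
}
```

At inc = dec = 0.10, one learning step moves the permanence by about 3277
fixed-point units, so the <=1-unit rounding difference noted above is
negligible.
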
The GPU parity test (`gpu_tm_anomaly_decays_on_repeating_sequence`) feeds a
repeating A,B,C sequence and asserts anomaly decays: **1.000 early → 0.000 late**.

## Bottleneck Analysis

| Source                            | Cost/step (B=8 T=2048) |
|-----------------------------------|-----------------------:|
| 14 kernel launches                |                 ~70 μs |
| ~262K predict/learn/punish blocks |                ~2.5 ms |
| No D2H until end-of-batch         |                   0 μs |
| Final D2H (T × n_cols + T × f32)  |     ~200 μs per region |

Per-step wall time at B=8 T=2048:
- CPU (reference): **~11.4 ms / step**
- GPU (current): **~2.98 ms / step**
- **Speedup: 3.83x**

## End-to-End Training Benchmark

**Config**: B=8, T=2048, vocab=8192, 60-second time budget, full HYDRA stack
(SDR Semantic + HTM + Mamba-3 + Engram + mHC + Hestia QAT).

**Results**:
- GPU util: **97-98% sustained**
- VRAM: **5.4 GB / 6.0 GB** (90% utilisation)
- Steps completed: 16
- tok/sec: **~2,200-2,500** (stable post-warmup)
- Final val_bpb: **2.249** (from ~3.1 initial)
- Factual eval: 1/9 hits

Compared to previous CPU-HTM baseline (~100 tok/s), the full-GPU HTM delivers
**~22x end-to-end throughput** — far above the 3-10x target.

## Bench Commands

```bash
source .venv/bin/activate
export LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH

# Microbench
B=8 T=2048 python htm_rust/bench_gpu.py

# Full training
HYDRA_TIME_BUDGET=60 HYDRA_BATCH_SIZE=8 HYDRA_TOTAL_BATCH=32768 python -u train.py
```

## Known Limitations / Future Work

- **Segment-compacted launches**: predict/learn/punish iterate all n_cells
  blocks, using `cell_seg_count` to skip empty cells. A compacted live-cell
  list would shave another ~40% of launch overhead.
- **Winner selection**: currently cell 0 of bursting col. Proper least-used
  selection would help stability of cross-column patterns.
- **Single CUDA stream per region**: with B=8 regions we serialise on stream 0.
  Multi-stream would lift the ~20% launch overhead at small batch sizes.
- **Permanence bump on chronically under-stimulated columns**: SP's strict-parity
  bump is not mirrored on GPU fast path. Effect on long runs needs measurement.
- **`seg_num_active_conn` output is reused across reinforce + punish**: the two
  kernels each launch n_cells blocks. They could be fused into one for one fewer
  kernel launch per step.

## Files

- `htm_rust/build.rs` — nvcc-driven PTX compilation, 12 kernels.
- `htm_rust/Cargo.toml` — `gpu` feature flag, cudarc dep.
- `htm_rust/src/gpu/mod.rs` — `HTMRegionGpu` pyclass + `step_many_gpu`.
- `htm_rust/src/gpu/sp_gpu.rs` — SP state + `step_batch_with_tm`.
- `htm_rust/src/gpu/tm_gpu.rs` — TM state + `step`.
- `htm_rust/src/gpu/tests.rs` — parity + correctness tests.
- `htm_rust/src/gpu/kernels/*.cu` — 5 SP + 7 TM kernels.
- `htm_rust/bench_gpu.py` — CPU-vs-GPU microbench.
- `subsystems/htm.py` — transparent GPU/CPU backend selection in `HTMLayer`.
overlay/htm_rust/src/gpu/fused.rs CHANGED
@@ -20,15 +20,15 @@
 use std::ffi::CString;
 use std::sync::Arc;
 
-use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DeviceRepr, DevicePtr, DriverError,
-                     LaunchConfig};
+use cudarc::driver::{
+    result, sys, CudaDevice, CudaSlice, DevicePtr, DeviceRepr, DriverError, LaunchConfig,
+};
 use cudarc::nvrtc::Ptx;
 
 use super::sp_gpu::SpatialPoolerGpu;
 use super::tm_gpu::{TemporalMemoryGpu, MAX_SEGMENTS_PER_CELL, MAX_SYN_PER_SEGMENT};
 
-const PTX_HTM_FUSED: &str =
-    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
+const PTX_HTM_FUSED: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
 
 /// Struct-by-value pointer pack — matches C-side `FusedPtrs`.
 ///
@@ -132,11 +132,9 @@ pub(crate) fn plan_fused_launch(
     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
-    // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
-    // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256 gives
-    // 256 regs/thread which is ample. Compensate with more blocks via
-    // cooperative launch. On Hopper (228 KB smem, 255 regs/thread baseline),
-    // 1024 works fine, but 256 is safe everywhere.
+    // 1024 threads/block exceeds the register file on Ampere and makes the
+    // cooperative-grid residency probe lie when the launch uses a different
+    // block size. Keep the planned block size identical to the occupancy probe.
     let block_dim_x = 256u32;
 
     // Cluster launch path: cooperative launch is not required. Keep the probe
@@ -145,10 +143,11 @@
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
 
-    // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
-    // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
+    // Cluster constraint: grid_dim_x must equal the cluster size (16) so that
+    // each region maps to exactly one cluster. `HTM_FUSED_GRID_CAP` can lower
+    // this for debugging but should not exceed 16 for cluster correctness.
     let default_grid_cap = 16u32;
-    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
+    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap).min(16);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
@@ -218,7 +217,7 @@ pub struct FusedState {
     pub cell_active_bits_b: CudaSlice<u32>,
     pub cell_winner_bits_a: CudaSlice<u32>,
     pub cell_winner_bits_b: CudaSlice<u32>,
-    pub step_scratch: CudaSlice<u32>, // length 6
+    pub step_scratch: CudaSlice<u32>, // length 6
 
     pub grid_dim_x: u32,
     pub block_dim_x: u32,
@@ -241,7 +240,10 @@
         initial_threshold: f32,
     ) -> Result<Self, DriverError> {
         let n_cells = n_columns * cells_per_column;
-        assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
+        assert!(
+            n_cells % 32 == 0,
+            "n_cells must be divisible by 32 for bitsets"
+        );
         let bits_words = n_cells / 32;
 
         let mut inhibition_threshold = dev.alloc_zeros::<f32>(n_columns)?;
@@ -278,7 +280,8 @@
         // every launched kernel function, otherwise cuLaunchKernelEx rejects
         // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
         unsafe {
-            let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
+            let attr =
+                sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
             // Ignore errors: older CUDA may lack the attribute, in which case
             // only portable sizes (<= 8) work — plan_fused_launch caps at 8.
             let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
@@ -294,9 +297,9 @@
         };
 
         // T1: Probe Hopper cluster launch capability.
-        let max_cluster_size = match dev.attribute(
-            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH,
-        ) {
+        let max_cluster_size = match dev
+            .attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH)
+        {
            Ok(v) if v > 0 => {
                // H200/sm_90a supports up to 16 blocks per cluster.
                // There is no MAX_CLUSTER_SIZE attribute in CUDA 12.4; hard-code the
@@ -346,7 +349,11 @@
 
         Ok(Self {
             dev,
-            raw_kernel: RawFusedKernel { module, function, function_batched },
+            raw_kernel: RawFusedKernel {
+                module,
+                function,
+                function_batched,
+            },
             inhibition_threshold,
             cell_active_bits_a,
             cell_active_bits_b,
@@ -445,7 +452,7 @@ pub fn launch_fused(
         inputs: *inputs_flat.device_ptr(),
         cols_out: *cols_out.device_ptr(),
        anom_out: *anom_out.device_ptr(),
-        barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
+        barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
         step_scratch: *fused.step_scratch.device_ptr(),
     };
 
@@ -493,14 +500,17 @@
         }
     } else {
         // Pre-Hopper: cooperative kernel launch. The fused kernel uses
-        // grid.sync() for cross-block synchronization which REQUIRES
-        // cuLaunchCooperativeKernel (normal launch silently crashes on
-        // the first grid.sync() call).
+        // cg::this_grid().sync(); normal launches poison the CUDA context
+        // with an asynchronous unspecified launch failure.
         let ret = sys::lib().cuLaunchCooperativeKernel(
             fused.raw_kernel.function,
-            grid_x, 1, 1,
-            block_x, 1, 1,
-            0, // sharedMemBytes
+            grid_x,
+            1,
+            1,
+            block_x,
+            1,
+            1,
+            0,
             cu_stream,
             kernel_params.as_mut_ptr(),
         );
@@ -616,7 +626,7 @@ pub(super) fn launch_fused_batched_raw(
             inputs: inputs_per_region[i],
             cols_out: cols_per_region[i],
             anom_out: anom_per_region[i],
-            barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
+            barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
             step_scratch: *r.fused_state.step_scratch.device_ptr(),
         }
     })
@@ -636,8 +646,8 @@
         let r0 = unsafe { &*region_ptrs[0] };
         r0.fused_state.cluster_info.max_cluster_size > 0
     };
-    let grid_x = plan_batched_grid_dim(grid_x, cooperative_grid_limit, b, use_cluster)
-        .map_err(|msg| {
+    let grid_x =
+        plan_batched_grid_dim(grid_x, cooperative_grid_limit, b, use_cluster).map_err(|msg| {
             eprintln!("[htm_rust] FATAL: {msg}");
             DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
         })?;
@@ -678,12 +688,19 @@
             return Err(DriverError(ret));
         }
     } else {
-        // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
+        // Pre-Hopper: cooperative kernel launch. The fused kernel uses
+        // cg::this_grid().sync(), which is only valid under cooperative
+        // launch. A normal launch can run until the first grid.sync() and
+        // then poison the CUDA context with an unspecified launch failure.
         let ret = sys::lib().cuLaunchCooperativeKernel(
             function_batched,
-            grid_x, b as u32, 1,
-            block_x, 1, 1,
-            0, // sharedMemBytes
+            grid_x,
+            b as u32,
+            1,
+            block_x,
+            1,
+            1,
+            0,
             cu_stream,
             kernel_params.as_mut_ptr(),
         );
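To make the launch-mode constraint concrete, a minimal standalone sketch (not
the repo's kernel; the names are illustrative) of a grid-synchronizing kernel
that is only legal under cooperative launch:

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    // Legal only under cuLaunchCooperativeKernel / cudaLaunchCooperativeKernel
    // with the whole grid co-resident; an ordinary launch faults at grid.sync().
    extern "C" __global__ void two_phase(const float *in, float *mid, float *out,
                                         unsigned int n) {
        cg::grid_group grid = cg::this_grid();
        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) mid[i] = in[i] * 2.0f;             // phase A
        grid.sync();                                  // cross-block barrier
        if (i + 1 < n) out[i] = mid[i] + mid[i + 1];  // phase B reads A across blocks
    }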
overlay/htm_rust/src/gpu/kernels/sp_boost_fused.cu CHANGED
@@ -1,59 +1,59 @@
// Fused mean-reduction + boost-update kernel.
//
// Inputs:
//   active_duty[n]  (f32)
//   boost_strength  (f32)
//
// Output:
//   boost[n] (f32) = expf(-boost_strength * (active_duty[c] - mean))
//
// Launch: single block (1024 threads), shared mem for reduction. At n=2048
// each thread handles 2 elements.

extern "C" __global__
void sp_boost_from_duty(
    const float * __restrict__ active_duty, // (n,)
    float * __restrict__ boost,             // (n,) in-place out
    float boost_strength,
    unsigned int n
) {
    extern __shared__ float smem_raw[];
    float * smem = smem_raw;
    const unsigned int tid = threadIdx.x;
    const unsigned int bsz = blockDim.x;

    // Phase 1: parallel sum of active_duty into smem[0..32] (warp-level).
    float local_sum = 0.0f;
    for (unsigned int i = tid; i < n; i += bsz) {
        local_sum += active_duty[i];
    }
    // Warp reduction.
    for (int off = 16; off > 0; off >>= 1) {
        local_sum += __shfl_down_sync(0xffffffff, local_sum, off);
    }
    unsigned int lane = tid & 31;
    unsigned int warp = tid >> 5;
    if (lane == 0) smem[warp] = local_sum;
    __syncthreads();

    // Warp 0 reduces warp-sums.
    __shared__ float mean_s;
    if (warp == 0) {
        unsigned int nwarps = (bsz + 31) / 32;
        float v = (lane < nwarps) ? smem[lane] : 0.0f;
        for (int off = 16; off > 0; off >>= 1) {
            v += __shfl_down_sync(0xffffffff, v, off);
        }
        if (tid == 0) {
            mean_s = v / (float)n;
        }
    }
    __syncthreads();

    // Phase 2: boost[c] = expf(-strength * (active_duty[c] - mean)).
    float mean = mean_s;
    for (unsigned int i = tid; i < n; i += bsz) {
        float d = active_duty[i] - mean;
        boost[i] = expf(-boost_strength * d);
    }
}
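The header comment implies this host-side launch shape; one float of dynamic
shared memory per warp is enough to stage the warp sums. A hedged runtime-API
example (the repo itself launches via cudarc from Rust):

    // 1024 threads = 32 warps, so 32 floats of dynamic smem hold the warp sums.
    const unsigned int block = 1024, nwarps = block / 32;
    sp_boost_from_duty<<<1, block, nwarps * sizeof(float), stream>>>(
        d_active_duty, d_boost, boost_strength, n_columns);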
overlay/htm_rust/src/gpu/kernels/sp_duty.cu CHANGED
@@ -1,45 +1,45 @@
// Duty cycle + boost update kernel.
//
// For each column c (one thread each):
//   active_sample  = active_mask[c] ? 1 : 0
//   overlap_sample = raw_overlap[c] >= stim_thr ? 1 : 0
//   active_duty[c]  = (1-alpha) * active_duty[c]  + alpha * active_sample
//   overlap_duty[c] = (1-alpha) * overlap_duty[c] + alpha * overlap_sample
//
// Then, if learn:
//   boost[c] = exp(-boost_strength * (active_duty[c] - mean_duty))
// mean_duty is computed on the host (one reduction) and passed in.

extern "C" __global__
void sp_duty_update(
    const unsigned char * __restrict__ active_mask, // (n_columns,)
    const unsigned int  * __restrict__ raw_overlap, // (n_columns,)
    float * __restrict__ active_duty,               // (n_columns,) in-place
    float * __restrict__ overlap_duty,              // (n_columns,) in-place
    float * __restrict__ boost,                     // (n_columns,) in-place
    float alpha,
    float stim_thr,
    float boost_strength,                           // 0 to skip boost
    float mean_duty,
    unsigned int learn_flag,                        // 0 or 1
    unsigned int n_columns
) {
    unsigned int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (c >= n_columns) return;

    float ad = active_duty[c];
    float od = overlap_duty[c];

    float a_sample = (active_mask[c] != 0) ? 1.0f : 0.0f;
    float o_sample = ((float)raw_overlap[c] >= stim_thr) ? 1.0f : 0.0f;

    ad = (1.0f - alpha) * ad + alpha * a_sample;
    od = (1.0f - alpha) * od + alpha * o_sample;

    active_duty[c] = ad;
    overlap_duty[c] = od;

    if (learn_flag && boost_strength > 0.0f) {
        boost[c] = expf(-boost_strength * (ad - mean_duty));
    }
}
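The mean passed as `mean_duty` comes from one reduction before the launch. A
sketch of the implied call sequence; Thrust here is an assumption for brevity
(the repo drives this from Rust via cudarc instead):

    #include <thrust/device_ptr.h>
    #include <thrust/reduce.h>

    // Reduce active_duty once on the device, then launch one thread per column.
    void run_duty_update(const unsigned char *d_active_mask,
                         const unsigned int *d_raw_overlap,
                         float *d_active_duty, float *d_overlap_duty, float *d_boost,
                         float alpha, float stim_thr, float boost_strength,
                         unsigned int learn, unsigned int n_cols) {
        thrust::device_ptr<float> duty(d_active_duty);
        float mean_duty = thrust::reduce(duty, duty + n_cols, 0.0f) / (float)n_cols;
        unsigned int blocks = (n_cols + 255) / 256;  // 256 threads/block covers all columns
        sp_duty_update<<<blocks, 256>>>(d_active_mask, d_raw_overlap, d_active_duty,
                                        d_overlap_duty, d_boost, alpha, stim_thr,
                                        boost_strength, mean_duty, learn, n_cols);
    }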
overlay/htm_rust/src/gpu/kernels/sp_learn.cu CHANGED
@@ -1,45 +1,45 @@
// SP Hebbian learning kernel.
//
// For each active (winner) column c, for each of its synapses s:
//   if input[bit[c][s]] active: perm += inc
//   else:                       perm -= dec
// Clamp to [0, 1].
//
// Launch: one block per column (2048 blocks), but we predicate on
// active_mask[c] to avoid launching k-specific blocks.
//
// This matches the CPU reference line-for-line:
// src/sp.rs lines 157-169.

extern "C" __global__
void sp_learn(
    const unsigned char * __restrict__ active_mask, // (n_columns,) 0/1
    const unsigned char * __restrict__ inp,         // (input_bits,)
    const unsigned int  * __restrict__ syn_bit,     // (n_columns * S,)
    float * __restrict__ syn_perm,                  // (n_columns * S,) in-place
    float inc,
    float dec,
    unsigned int synapses_per_col,
    unsigned int n_columns
) {
    const unsigned int c = blockIdx.x;
    if (c >= n_columns) return;
    if (active_mask[c] == 0) return;

    const unsigned int base = c * synapses_per_col;
    const unsigned int tid = threadIdx.x;
    const unsigned int bsz = blockDim.x;

    for (unsigned int s = tid; s < synapses_per_col; s += bsz) {
        unsigned int b = syn_bit[base + s];
        float p = syn_perm[base + s];
        if (inp[b] != 0) {
            p += inc;
            if (p > 1.0f) p = 1.0f;
        } else {
            p -= dec;
            if (p < 0.0f) p = 0.0f;
        }
        syn_perm[base + s] = p;
    }
}
overlay/htm_rust/src/gpu/kernels/sp_overlap.cu CHANGED
@@ -1,78 +1,78 @@
1
- // SP overlap kernel.
2
- //
3
- // For each column c (one CUDA block), compute:
4
- // overlap[c] = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr}
5
- // boosted[c] = overlap[c] * boost[c]
6
- // raw_overlap[c] = overlap[c] (also returned so host can drive duty cycle)
7
- //
8
- // Memory layout (flat, column-major with per-column stride = synapses_per_col):
9
- // syn_bit[c * S + s] : u32 index into input SDR
10
- // syn_perm[c * S + s] : f32 permanence in [0, 1]
11
- // boost[c] : f32
12
- // inp[b] : u8 0/1
13
- // Output:
14
- // raw[c] : u32
15
- // boosted[c] : f32
16
- //
17
- // Launch:
18
- // grid = n_columns
19
- // block = 128 (or 256) β€” one warp-sweep across synapses; many warps give
20
- // parallel reduction across S (typically S=40).
21
- //
22
- // At S=40 this is completely latency-bound; we coalesce reads and do a
23
- // warp-shuffle reduction. For clarity we use a simple block-wide shared-mem
24
- // reduction which is sufficient for S <= 1024 and has zero correctness risk.
25
-
26
- extern "C" __global__
27
- void sp_overlap(
28
- const unsigned char * __restrict__ inp, // (input_bits,)
29
- const unsigned int * __restrict__ syn_bit, // (n_columns * S,)
30
- const float * __restrict__ syn_perm,// (n_columns * S,)
31
- const float * __restrict__ boost, // (n_columns,)
32
- float conn_thr,
33
- unsigned int synapses_per_col, // S
34
- unsigned int n_columns,
35
- unsigned int * __restrict__ raw_out, // (n_columns,)
36
- float * __restrict__ boosted_out // (n_columns,)
37
- ) {
38
- const unsigned int c = blockIdx.x;
39
- if (c >= n_columns) return;
40
-
41
- const unsigned int base = c * synapses_per_col;
42
- const unsigned int tid = threadIdx.x;
43
- const unsigned int bsz = blockDim.x;
44
-
45
- // Per-thread partial count.
46
- unsigned int local = 0;
47
- for (unsigned int s = tid; s < synapses_per_col; s += bsz) {
48
- unsigned int b = syn_bit[base + s];
49
- float p = syn_perm[base + s];
50
- // Branchless: only counts when input active AND perm connected.
51
- // Using (inp != 0) to tolerate u8 layout.
52
- unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u;
53
- local += hit;
54
- }
55
-
56
- // Block-wide reduction in shared memory.
57
- __shared__ unsigned int smem[32];
58
-
59
- // Warp-level reduction via shuffle.
60
- unsigned int lane = tid & 31;
61
- unsigned int warp = tid >> 5;
62
- for (int off = 16; off > 0; off >>= 1) {
63
- local += __shfl_down_sync(0xffffffff, local, off);
64
- }
65
- if (lane == 0) smem[warp] = local;
66
- __syncthreads();
67
-
68
- if (warp == 0) {
69
- unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0;
70
- for (int off = 16; off > 0; off >>= 1) {
71
- v += __shfl_down_sync(0xffffffff, v, off);
72
- }
73
- if (tid == 0) {
74
- raw_out[c] = v;
75
- boosted_out[c] = (float)v * boost[c];
76
- }
77
- }
78
- }
 
1
// SP overlap kernel.
//
// For each column c (one CUDA block), compute:
//   overlap[c]     = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr}
//   boosted[c]     = overlap[c] * boost[c]
//   raw_overlap[c] = overlap[c]  (also returned so host can drive duty cycle)
//
// Memory layout (flat; each column owns a contiguous run of synapses_per_col entries):
//   syn_bit[c * S + s]  : u32 index into input SDR
//   syn_perm[c * S + s] : f32 permanence in [0, 1]
//   boost[c]            : f32
//   inp[b]              : u8 0/1
// Output:
//   raw[c]     : u32
//   boosted[c] : f32
//
// Launch:
//   grid  = n_columns
//   block = 128 (or 256) — one warp-sweep across synapses; many warps give
//   parallel reduction across S (typically S=40).
//
// At S=40 this is completely latency-bound; we coalesce reads and do a
// warp-shuffle reduction. For clarity we use a simple block-wide shared-mem
// reduction which is sufficient for S <= 1024 and has zero correctness risk.

extern "C" __global__
void sp_overlap(
    const unsigned char * __restrict__ inp,       // (input_bits,)
    const unsigned int  * __restrict__ syn_bit,   // (n_columns * S,)
    const float         * __restrict__ syn_perm,  // (n_columns * S,)
    const float         * __restrict__ boost,     // (n_columns,)
    float conn_thr,
    unsigned int synapses_per_col,                // S
    unsigned int n_columns,
    unsigned int * __restrict__ raw_out,          // (n_columns,)
    float * __restrict__ boosted_out              // (n_columns,)
) {
    const unsigned int c = blockIdx.x;
    if (c >= n_columns) return;

    const unsigned int base = c * synapses_per_col;
    const unsigned int tid = threadIdx.x;
    const unsigned int bsz = blockDim.x;

    // Per-thread partial count.
    unsigned int local = 0;
    for (unsigned int s = tid; s < synapses_per_col; s += bsz) {
        unsigned int b = syn_bit[base + s];
        float p = syn_perm[base + s];
        // Branchless: only counts when input active AND perm connected.
        // Using (inp != 0) to tolerate u8 layout.
        unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u;
        local += hit;
    }

    // Block-wide reduction in shared memory.
    __shared__ unsigned int smem[32];

    // Warp-level reduction via shuffle.
    unsigned int lane = tid & 31;
    unsigned int warp = tid >> 5;
    for (int off = 16; off > 0; off >>= 1) {
        local += __shfl_down_sync(0xffffffff, local, off);
    }
    if (lane == 0) smem[warp] = local;
    __syncthreads();

    if (warp == 0) {
        unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0;
        for (int off = 16; off > 0; off >>= 1) {
            v += __shfl_down_sync(0xffffffff, v, off);
        }
        if (tid == 0) {
            raw_out[c] = v;
            boosted_out[c] = (float)v * boost[c];
        }
    }
}
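// The shuffle idiom above, factored out for reference (a sketch): offsets
// 16, 8, 4, 2, 1 fold a 32-lane warp in five steps, so after the loop lane 0
// holds the sum of all 32 lanes.
static __device__ inline unsigned int warp_reduce_sum(unsigned int v) {
    for (int off = 16; off > 0; off >>= 1) {
        v += __shfl_down_sync(0xffffffffu, v, off);
    }
    return v;  // valid in lane 0 only
}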
overlay/htm_rust/src/gpu/kernels/sp_topk.cu CHANGED
@@ -1,117 +1,117 @@
// Top-K column selection.
//
// Inputs:
//   boosted[n_columns]     : f32 score
// Output:
//   active_mask[n_columns] : u8 0/1, exactly k ones
//
// Tie-breaking: when scores are equal, the LOWER column index wins (matches
// CPU reference `select_nth_unstable_by` with secondary index comparator).
//
// Strategy: a single-block implementation. n_columns is typically 2048, which
// fits comfortably in shared memory. A radix-select over a sortable 64-bit key
// would also work at this size:
//
//   key = (float_to_sortable_u32(score) << 32) | (0xffffffff - index)
//
// Larger key = (higher score) OR (same score, smaller index). Find the k-th
// largest key via radix-select, then mark every column whose key >= that
// threshold. That is O(n_cols * log k) and well under 100 μs for n=2048,
// k=41 on sm_86.
//
// For simplicity and correctness this kernel instead uses a single-block
// parallel selection variant (find max -> mark -> set to -inf -> repeat,
// k iterations). At k=41 this is 41 passes over 2048 scores, ~84K ops,
// trivially fast.

extern "C" __global__
void sp_topk_select(
    const float * __restrict__ scores,        // (n_columns,)
    unsigned int n_columns,
    unsigned int k,
    unsigned char * __restrict__ active_out   // (n_columns,)
) {
    extern __shared__ float smem[];
    // smem[0..n) holds the working scores; selected entries are overwritten
    // with -inf. Reduction scratch lives in the static __shared__ arrays below.
    float * work = smem;
    const unsigned int tid = threadIdx.x;
    const unsigned int bsz = blockDim.x;

    // Load scores into shared; also init active_out = 0.
    for (unsigned int i = tid; i < n_columns; i += bsz) {
        work[i] = scores[i];
        active_out[i] = 0;
    }
    __syncthreads();

    __shared__ int winner_idx;
    __shared__ float winner_score;

    for (unsigned int iter = 0; iter < k; ++iter) {
        // Find (argmax score, lowest index for ties).
        float best_s = -INFINITY;
        int best_i = n_columns;  // sentinel larger than any index

        for (unsigned int i = tid; i < n_columns; i += bsz) {
            float s = work[i];
            if (s > best_s || (s == best_s && (int)i < best_i)) {
                best_s = s;
                best_i = (int)i;
            }
        }

        // Warp reduction. We reduce pairs (score, idx) keeping (max score, min idx on tie).
        unsigned int mask = 0xffffffff;
        for (int off = 16; off > 0; off >>= 1) {
            float os = __shfl_down_sync(mask, best_s, off);
            int   oi = __shfl_down_sync(mask, best_i, off);
            if (os > best_s || (os == best_s && oi < best_i)) {
                best_s = os;
                best_i = oi;
            }
        }
        // Warp 0 collects lane 0 values from other warps via shared mem.
        __shared__ float warp_s[32];
        __shared__ int   warp_i[32];
        unsigned int lane = tid & 31;
        unsigned int warp = tid >> 5;
        if (lane == 0) {
            warp_s[warp] = best_s;
            warp_i[warp] = best_i;
        }
        __syncthreads();

        if (warp == 0) {
            unsigned int nwarps = (bsz + 31) / 32;
            float s = (lane < nwarps) ? warp_s[lane] : -INFINITY;
            int   i = (lane < nwarps) ? warp_i[lane] : (int)n_columns;
            for (int off = 16; off > 0; off >>= 1) {
                float os = __shfl_down_sync(mask, s, off);
                int   oi = __shfl_down_sync(mask, i, off);
                if (os > s || (os == s && oi < i)) {
                    s = os;
                    i = oi;
                }
            }
            if (tid == 0) {
                winner_score = s;
                winner_idx = i;
            }
        }
        __syncthreads();

        if (tid == 0) {
            if (winner_idx < (int)n_columns) {
                active_out[winner_idx] = 1;
                work[winner_idx] = -INFINITY;
            }
        }
        __syncthreads();
    }
}
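// The radix-select alternative sketched in the header needs an
// order-preserving float-to-uint mapping; a standard one (a sketch -- not
// used by the kernel above; host callers need <string.h> for memcpy) is:
static __host__ __device__ inline unsigned int float_to_sortable_u32(float f) {
    unsigned int u;
    memcpy(&u, &f, sizeof u);  // bit-copy avoids type-punning UB
    // Negative floats: flip all bits; non-negative: set the sign bit.
    return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
}
// After this mapping, unsigned comparison of keys agrees with float
// comparison of the scores (NaNs excluded).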
overlay/htm_rust/src/gpu/kernels/tm_activate.cu CHANGED
@@ -1,66 +1,66 @@
// TM activate kernel. See tm_predict.cu for TmConfig.

struct TmConfig {
    unsigned int activation_threshold;
    unsigned int learning_threshold;
    unsigned int cells_per_column;
    unsigned int synapses_per_segment;
    unsigned int n_segments;
    unsigned int n_cells;
    unsigned int max_segments_per_cell;
    unsigned int max_new_synapses;
    int conn_thr_i16;
    int perm_inc_i16;
    int perm_dec_i16;
    int predicted_seg_dec_i16;
    int initial_perm_i16;
    unsigned int iter_seed;
    unsigned int n_cols;
    unsigned int bits_words;
};

extern "C" __global__
void tm_activate(
    const unsigned char * __restrict__ sp_active_mask,
    const unsigned char * __restrict__ col_predicted,
    const unsigned int  * __restrict__ cell_predictive_bits,
    unsigned int * __restrict__ cell_active_bits,
    unsigned int * __restrict__ cell_winner_bits,
    unsigned int * __restrict__ unpredicted_count,
    unsigned int * __restrict__ burst_cols_flat,
    unsigned int * __restrict__ burst_cols_count,
    TmConfig cfg
) {
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= cfg.n_cols) return;
    if (sp_active_mask[col] == 0) return;

    unsigned int base_cell = col * cfg.cells_per_column;

    if (col_predicted[col]) {
        for (unsigned int k = 0; k < cfg.cells_per_column; k++) {
            unsigned int cell = base_cell + k;
            unsigned int word_idx = cell >> 5;
            unsigned int bit_mask = 1u << (cell & 31u);
            unsigned int pred_word = cell_predictive_bits[word_idx];
            if (pred_word & bit_mask) {
                atomicOr(&cell_active_bits[word_idx], bit_mask);
                atomicOr(&cell_winner_bits[word_idx], bit_mask);
            }
        }
    } else {
        atomicAdd(unpredicted_count, 1u);
        for (unsigned int k = 0; k < cfg.cells_per_column; k++) {
            unsigned int cell = base_cell + k;
            unsigned int word_idx = cell >> 5;
            unsigned int bit_mask = 1u << (cell & 31u);
            atomicOr(&cell_active_bits[word_idx], bit_mask);
        }
        unsigned int winner = base_cell;
        unsigned int word_idx = winner >> 5;
        unsigned int bit_mask = 1u << (winner & 31u);
        atomicOr(&cell_winner_bits[word_idx], bit_mask);
        unsigned int slot = atomicAdd(burst_cols_count, 1u);
        burst_cols_flat[slot] = col;
    }
}
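// Bit-packed mask indexing, worked through for orientation: cell 75 lives in
// word 75 >> 5 = 2 at bit 75 & 31 = 11, so setting it is
// atomicOr(&cell_active_bits[2], 1u << 11). atomicOr is required because
// threads for different columns may touch the same 32-cell word concurrently.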
overlay/htm_rust/src/gpu/kernels/tm_anomaly.cu CHANGED
@@ -1,43 +1,43 @@
// TM anomaly kernel.
//
// Computes:
//   n_active = sum of sp_active_mask
//   anomaly  = unpredicted_count / n_active  (if n_active > 0)
//            = 0                             (else)
//
// Launch: single block, 256 threads.

extern "C" __global__
void tm_anomaly(
    const unsigned char * __restrict__ sp_active_mask,
    const unsigned int  * __restrict__ unpredicted_count,
    float * __restrict__ anomaly_out,  // (1,) or (T,); written at index t_slot
    unsigned int t_slot,
    unsigned int n_cols
) {
    const unsigned int tid = threadIdx.x;
    __shared__ unsigned int n_active_s;

    if (tid == 0) n_active_s = 0u;
    __syncthreads();

    unsigned int local = 0u;
    for (unsigned int i = tid; i < n_cols; i += blockDim.x) {
        if (sp_active_mask[i]) local += 1u;
    }
    // Warp reduce.
    for (int off = 16; off > 0; off >>= 1) {
        local += __shfl_down_sync(0xffffffffu, local, off);
    }
    if ((tid & 31u) == 0) {
        atomicAdd(&n_active_s, local);
    }
    __syncthreads();

    if (tid == 0) {
        unsigned int total = n_active_s;
        unsigned int bad = unpredicted_count[0];
        float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
        anomaly_out[t_slot] = anom;
    }
}
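// Worked example (illustrative numbers): with 41 SP-active columns of which
// 12 went unpredicted, anomaly = 12 / 41 ≈ 0.293. A fully predicted step
// yields 0.0; a fully bursting step yields 1.0.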
overlay/htm_rust/src/gpu/kernels/tm_grow.cu CHANGED
@@ -1,155 +1,155 @@
// TM grow+reinforce kernel.
//
// For each bursting column:
//   If col_best_match[col] is non-zero (i.e. at least one matching segment
//   with num_active_potential >= learning_threshold exists on cells in this col):
//     Target = that matching segment.
//     Reinforce its existing synapses: +inc if presyn in prev_active, -dec otherwise.
//     Grow up to min(max_new, segment capacity - current_syn_count) additional
//     synapses to prev_winners.
//   Else:
//     Allocate a fresh segment slot on winner cell (cell 0 of col).
//     Grow up to max_new synapses to prev_winners (no reinforce needed — new seg).
//
// This mirrors the CPU TM burst logic.

struct TmConfig {
    unsigned int activation_threshold;
    unsigned int learning_threshold;
    unsigned int cells_per_column;
    unsigned int synapses_per_segment;
    unsigned int n_segments;
    unsigned int n_cells;
    unsigned int max_segments_per_cell;
    unsigned int max_new_synapses;
    int conn_thr_i16;
    int perm_inc_i16;
    int perm_dec_i16;
    int predicted_seg_dec_i16;
    int initial_perm_i16;
    unsigned int iter_seed;
    unsigned int n_cols;
    unsigned int bits_words;
};

extern "C" __global__
void tm_grow(
    unsigned int * __restrict__ seg_cell_id,
    unsigned int * __restrict__ seg_syn_count,
    unsigned int * __restrict__ syn_presyn,
    short        * __restrict__ syn_perm,
    unsigned int * __restrict__ cell_seg_count,
    const unsigned int * __restrict__ burst_cols_flat,
    const unsigned int * __restrict__ burst_cols_count,
    const unsigned int * __restrict__ prev_winner_bits,
    const unsigned int * __restrict__ prev_active_bits,
    const unsigned int * __restrict__ col_best_match,
    TmConfig cfg
) {
    const unsigned int b = blockIdx.x;
    const unsigned int n_burst_cols = burst_cols_count[0];
    if (b >= n_burst_cols) return;
    const unsigned int tid = threadIdx.x;

    const unsigned int col = burst_cols_flat[b];

    __shared__ unsigned int shared_seg_id;
    __shared__ unsigned int shared_existing_syn_count;
    __shared__ unsigned int shared_grown;
    __shared__ unsigned int shared_is_new;
    __shared__ unsigned int shared_start_offset;

    if (tid == 0) {
        unsigned int match_key = col_best_match[col];
        if (match_key != 0u) {
            // Reuse matching segment.
            unsigned int seg_id = match_key & 0x1FFFFFu;
            shared_seg_id = seg_id;
            shared_existing_syn_count = seg_syn_count[seg_id];
            shared_is_new = 0u;
        } else {
            // Allocate new segment on winner cell (cell 0 of col).
            unsigned int winner_cell = col * cfg.cells_per_column;
            unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
            if (slot >= cfg.max_segments_per_cell) {
                slot = slot % cfg.max_segments_per_cell;
            }
            unsigned int seg_id = winner_cell * cfg.max_segments_per_cell + slot;
            seg_cell_id[seg_id] = winner_cell;
            seg_syn_count[seg_id] = 0;
            shared_seg_id = seg_id;
            shared_existing_syn_count = 0u;
            shared_is_new = 1u;
        }
        shared_grown = 0u;
        shared_start_offset = (b * 2654435761u + cfg.iter_seed) % cfg.bits_words;
    }
    __syncthreads();

    const unsigned int seg_id = shared_seg_id;
    const unsigned int seg_base = seg_id * cfg.synapses_per_segment;
    const unsigned int existing_syn = shared_existing_syn_count;
    const unsigned int is_new = shared_is_new;
    const unsigned int start = shared_start_offset;

    // PHASE 1: If reusing, reinforce existing synapses.
    if (!is_new) {
        for (unsigned int s = tid; s < existing_syn; s += 32u) {
            unsigned int presyn = syn_presyn[seg_base + s];
            unsigned int word = prev_active_bits[presyn >> 5];
            unsigned int bit = (word >> (presyn & 31u)) & 1u;
            int p = (int)syn_perm[seg_base + s];
            if (bit) {
                int np = p + cfg.perm_inc_i16;
                if (np > 32767) np = 32767;
                syn_perm[seg_base + s] = (short)np;
            } else {
                int np = p - cfg.perm_dec_i16;
                if (np < 0) np = 0;
                syn_perm[seg_base + s] = (short)np;
            }
        }
        __syncthreads();
    }

    // PHASE 2: Grow up to `max_new_synapses` (or room) synapses to prev_winners
    // that aren't already presynaptic to this segment.
    const unsigned int room = (cfg.synapses_per_segment > existing_syn)
        ? (cfg.synapses_per_segment - existing_syn) : 0u;
    const unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;

    for (unsigned int w_off = 0; w_off < cfg.bits_words; w_off += 32u) {
        if (shared_grown >= max_grow) break;
        unsigned int widx = (start + w_off + tid) % cfg.bits_words;
        unsigned int word = prev_winner_bits[widx];
        while (word != 0u) {
            if (shared_grown >= max_grow) break;
            unsigned int bit_pos = __ffs(word) - 1u;
            word &= ~(1u << bit_pos);
            unsigned int cell = widx * 32u + bit_pos;
            if (cell >= cfg.n_cells) continue;

            // Skip if already presynaptic (O(existing_syn) scan; usually small).
            bool exists = false;
            for (unsigned int s = 0; s < existing_syn; s++) {
                if (syn_presyn[seg_base + s] == cell) { exists = true; break; }
            }
            if (exists) continue;

            unsigned int slot = atomicAdd(&shared_grown, 1u);
            if (slot >= max_grow) break;
            unsigned int write_idx = existing_syn + slot;
            if (write_idx >= cfg.synapses_per_segment) break;
            syn_presyn[seg_base + write_idx] = cell;
            syn_perm[seg_base + write_idx] = (short)cfg.initial_perm_i16;
        }
    }
    __syncthreads();

    if (tid == 0) {
        unsigned int grown = shared_grown;
        if (grown > max_grow) grown = max_grow;
        unsigned int new_count = existing_syn + grown;
        if (new_count > cfg.synapses_per_segment) new_count = cfg.synapses_per_segment;
        seg_syn_count[seg_id] = new_count;
    }
}
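// Note on the start offset: (b * 2654435761u + iter_seed) % bits_words uses a
// common multiplicative-hash constant (close to 2^32 / golden ratio) to
// scatter where each bursting column starts scanning prev_winner_bits, so
// concurrent blocks don't all grow synapses onto the same low-indexed winner
// cells.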
overlay/htm_rust/src/gpu/kernels/tm_learn.cu CHANGED
@@ -1,75 +1,75 @@
// TM learn (reinforce correctly predicted segments) — cell-grouped launch.
//
// Grid: n_cells.
// For each cell in a predicted, SP-active column: iterate its segments.
// For each segment with num_active_connected >= activation_threshold,
// reinforce its synapses against prev_active_bits.

struct TmConfig {
    unsigned int activation_threshold;
    unsigned int learning_threshold;
    unsigned int cells_per_column;
    unsigned int synapses_per_segment;
    unsigned int n_segments;
    unsigned int n_cells;
    unsigned int max_segments_per_cell;
    unsigned int max_new_synapses;
    int conn_thr_i16;
    int perm_inc_i16;
    int perm_dec_i16;
    int predicted_seg_dec_i16;
    int initial_perm_i16;
    unsigned int iter_seed;
    unsigned int n_cols;
    unsigned int bits_words;
};

extern "C" __global__
void tm_learn_reinforce(
    const unsigned int * __restrict__ seg_cell_id,
    const unsigned int * __restrict__ seg_syn_count,
    const unsigned int * __restrict__ syn_presyn,
    short * __restrict__ syn_perm,
    const unsigned int * __restrict__ seg_num_active_connected,
    const unsigned int * __restrict__ prev_active_bits,
    const unsigned char * __restrict__ sp_active_mask,
    const unsigned char * __restrict__ col_predicted,
    const unsigned int * __restrict__ cell_seg_count,
    TmConfig cfg
) {
    const unsigned int cell = blockIdx.x;
    if (cell >= cfg.n_cells) return;
    const unsigned int col = cell / cfg.cells_per_column;
    if (sp_active_mask[col] == 0) return;
    if (col_predicted[col] == 0) return;

    const unsigned int n_segs_here = min(cell_seg_count[cell], cfg.max_segments_per_cell);
    if (n_segs_here == 0) return;

    const unsigned int tid = threadIdx.x;
    const unsigned int seg_base_id = cell * cfg.max_segments_per_cell;

    for (unsigned int local_seg = 0; local_seg < n_segs_here; local_seg++) {
        const unsigned int seg = seg_base_id + local_seg;
        if (seg_num_active_connected[seg] < cfg.activation_threshold) continue;
        const unsigned int n_syn = seg_syn_count[seg];
        if (n_syn == 0) continue;
        const unsigned int syn_base = seg * cfg.synapses_per_segment;

        for (unsigned int s = tid; s < n_syn; s += 32u) {
            unsigned int presyn = syn_presyn[syn_base + s];
            unsigned int word = prev_active_bits[presyn >> 5];
            unsigned int bit = (word >> (presyn & 31u)) & 1u;
            int p = (int)syn_perm[syn_base + s];
            if (bit) {
                int np = p + cfg.perm_inc_i16;
                if (np > 32767) np = 32767;
                syn_perm[syn_base + s] = (short)np;
            } else {
                int np = p - cfg.perm_dec_i16;
                if (np < 0) np = 0;
                syn_perm[syn_base + s] = (short)np;
            }
        }
    }
}
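// Permanences on the GPU are i16 in [0, 32767] (see gpu/mod.rs). A minimal
// conversion sketch to/from the CPU's f32 representation, assuming that scale:
static __host__ __device__ inline float perm_to_f32(short p) {
    return (float)p / 32767.0f;            // one i16 step ≈ 3e-5 in f32 units
}
static __host__ __device__ inline short perm_from_f32(float f) {
    return (short)(f * 32767.0f + 0.5f);   // round-to-nearest, f in [0, 1]
}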
overlay/htm_rust/src/gpu/kernels/tm_predict.cu CHANGED
@@ -1,102 +1,102 @@
// TM predict kernel — cell-grouped launch.
//
// Grid: n_cells blocks (one per cell).
// Block: 32 threads (one warp).
//
// Each block iterates the segments owned by its cell (count in cell_seg_count[cell]).
// For each live segment, counts active connected/potential synapses against
// prev_active_bits. Updates per-segment counters, cell_predictive bit, and
// col_predicted flag.

struct TmConfig {
    unsigned int activation_threshold;
    unsigned int learning_threshold;
    unsigned int cells_per_column;
    unsigned int synapses_per_segment;
    unsigned int n_segments;
    unsigned int n_cells;
    unsigned int max_segments_per_cell;
    unsigned int max_new_synapses;
    int conn_thr_i16;
    int perm_inc_i16;
    int perm_dec_i16;
    int predicted_seg_dec_i16;
    int initial_perm_i16;
    unsigned int iter_seed;
    unsigned int n_cols;
    unsigned int bits_words;
};

extern "C" __global__
void tm_predict(
    const unsigned int * __restrict__ seg_cell_id,
    const unsigned int * __restrict__ seg_syn_count,
    const unsigned int * __restrict__ syn_presyn,
    const short        * __restrict__ syn_perm,
    const unsigned int * __restrict__ cell_active_bits,
    unsigned int * __restrict__ cell_predictive_bits,
    unsigned char * __restrict__ col_predicted,
    unsigned int * __restrict__ seg_num_active_connected,
    unsigned int * __restrict__ seg_num_active_potential,
    unsigned int * __restrict__ col_best_match,
    const unsigned int * __restrict__ cell_seg_count,
    TmConfig cfg
) {
    const unsigned int cell = blockIdx.x;
    if (cell >= cfg.n_cells) return;

    const unsigned int n_segs_here = min(cell_seg_count[cell], cfg.max_segments_per_cell);
    if (n_segs_here == 0) return;

    const unsigned int tid = threadIdx.x;
    const unsigned int col = cell / cfg.cells_per_column;
    const unsigned int seg_base_id = cell * cfg.max_segments_per_cell;

    for (unsigned int local_seg = 0; local_seg < n_segs_here; local_seg++) {
        const unsigned int seg = seg_base_id + local_seg;
        const unsigned int n_syn = seg_syn_count[seg];
        if (n_syn == 0) {
            if (tid == 0) {
                seg_num_active_connected[seg] = 0;
                seg_num_active_potential[seg] = 0;
            }
            continue;
        }
        const unsigned int syn_base = seg * cfg.synapses_per_segment;

        unsigned int local_conn = 0;
        unsigned int local_pot = 0;
        for (unsigned int s = tid; s < n_syn; s += 32u) {
            unsigned int presyn = syn_presyn[syn_base + s];
            unsigned int word = cell_active_bits[presyn >> 5];
            unsigned int bit = (word >> (presyn & 31u)) & 1u;
            if (bit) {
                local_pot += 1u;
                int p = (int)syn_perm[syn_base + s];
                if (p >= cfg.conn_thr_i16) {
                    local_conn += 1u;
                }
            }
        }
        for (int off = 16; off > 0; off >>= 1) {
            local_conn += __shfl_down_sync(0xffffffffu, local_conn, off);
            local_pot  += __shfl_down_sync(0xffffffffu, local_pot, off);
        }

        if (tid == 0) {
            seg_num_active_connected[seg] = local_conn;
            seg_num_active_potential[seg] = local_pot;
            if (local_conn >= cfg.activation_threshold) {
                unsigned int word_idx = cell >> 5;
                unsigned int bit_mask = 1u << (cell & 31u);
                atomicOr(&cell_predictive_bits[word_idx], bit_mask);
                col_predicted[col] = 1;
            }
            if (local_pot >= cfg.learning_threshold) {
                unsigned int pot_c = local_pot > 2047u ? 2047u : local_pot;
                unsigned int key = (pot_c << 21) | (seg & 0x1FFFFFu);
                atomicMax(&col_best_match[col], key);
            }
        }
    }
}
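// Decoding the packed col_best_match key (the inverse of the encoding above):
// the top 11 bits carry the clamped potential count, the low 21 bits the
// segment id, so atomicMax keeps the segment with the highest potential
// overlap, ties broken toward the higher segment id.
static __host__ __device__ inline void decode_best_match(
    unsigned int key, unsigned int *pot, unsigned int *seg)
{
    *pot = key >> 21;        // min(num_active_potential, 2047)
    *seg = key & 0x1FFFFFu;  // segment index
}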
overlay/htm_rust/src/gpu/kernels/tm_punish.cu CHANGED
@@ -1,64 +1,64 @@
// TM punish — cell-grouped launch.

struct TmConfig {
    unsigned int activation_threshold;
    unsigned int learning_threshold;
    unsigned int cells_per_column;
    unsigned int synapses_per_segment;
    unsigned int n_segments;
    unsigned int n_cells;
    unsigned int max_segments_per_cell;
    unsigned int max_new_synapses;
    int conn_thr_i16;
    int perm_inc_i16;
    int perm_dec_i16;
    int predicted_seg_dec_i16;
    int initial_perm_i16;
    unsigned int iter_seed;
    unsigned int n_cols;
    unsigned int bits_words;
};

extern "C" __global__
void tm_punish(
    const unsigned int * __restrict__ seg_cell_id,
    const unsigned int * __restrict__ seg_syn_count,
    const unsigned int * __restrict__ syn_presyn,
    short * __restrict__ syn_perm,
    const unsigned int * __restrict__ seg_num_active_potential,
    const unsigned int * __restrict__ prev_active_bits,
    const unsigned char * __restrict__ sp_active_mask,
    const unsigned int * __restrict__ cell_seg_count,
    TmConfig cfg
) {
    const unsigned int cell = blockIdx.x;
    if (cell >= cfg.n_cells) return;
    const unsigned int col = cell / cfg.cells_per_column;
    if (sp_active_mask[col] != 0) return;  // skip: col became active

    const unsigned int n_segs_here = min(cell_seg_count[cell], cfg.max_segments_per_cell);
    if (n_segs_here == 0) return;

    const unsigned int tid = threadIdx.x;
    const unsigned int seg_base_id = cell * cfg.max_segments_per_cell;

    for (unsigned int local_seg = 0; local_seg < n_segs_here; local_seg++) {
        const unsigned int seg = seg_base_id + local_seg;
        if (seg_num_active_potential[seg] < cfg.learning_threshold) continue;
        const unsigned int n_syn = seg_syn_count[seg];
        if (n_syn == 0) continue;
        const unsigned int syn_base = seg * cfg.synapses_per_segment;

        for (unsigned int s = tid; s < n_syn; s += 32u) {
            unsigned int presyn = syn_presyn[syn_base + s];
            unsigned int word = prev_active_bits[presyn >> 5];
            unsigned int bit = (word >> (presyn & 31u)) & 1u;
            if (bit) {
                int p = (int)syn_perm[syn_base + s];
                int np = p - cfg.predicted_seg_dec_i16;
                if (np < 0) np = 0;
                syn_perm[syn_base + s] = (short)np;
            }
        }
    }
}
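// Note: punishment only touches segments whose num_active_potential reached
// learning_threshold, i.e. segments that actually "voted" for this
// now-inactive column; only their synapses to previously active cells lose
// predicted_seg_dec of permanence.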
overlay/htm_rust/src/gpu/kernels/tm_reset.cu CHANGED
@@ -1,36 +1,36 @@
// TM reset-per-step kernel.

extern "C" __global__
void tm_reset_step(
    unsigned int * __restrict__ cell_active_bits,
    unsigned int * __restrict__ cell_winner_bits,
    unsigned int * __restrict__ cell_predictive_bits,
    unsigned int * __restrict__ prev_active_bits,
    unsigned int * __restrict__ prev_winner_bits,
    unsigned char * __restrict__ col_predicted,
    unsigned int * __restrict__ unpredicted_count,
    unsigned int * __restrict__ burst_cols_count,
    unsigned int * __restrict__ col_best_match,
    unsigned int bits_words,
    unsigned int n_cols
) {
    unsigned int tid_global = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid_global < bits_words) {
        prev_active_bits[tid_global] = cell_active_bits[tid_global];
        prev_winner_bits[tid_global] = cell_winner_bits[tid_global];
        cell_active_bits[tid_global] = 0u;
        cell_winner_bits[tid_global] = 0u;
        cell_predictive_bits[tid_global] = 0u;
    }

    if (tid_global < n_cols) {
        col_predicted[tid_global] = 0;
        col_best_match[tid_global] = 0u;
    }

    if (tid_global == 0) {
        unpredicted_count[0] = 0u;
        burst_cols_count[0] = 0u;
    }
}
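// Illustrative launch (a sketch; grid sized so one pass covers both the
// bit-word copy/clear and the per-column flag clear):
//
//     unsigned int n = bits_words > n_cols ? bits_words : n_cols;
//     tm_reset_step<<<(n + 255) / 256, 256>>>(/* device buffers */, bits_words, n_cols);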
overlay/htm_rust/src/gpu/mod.rs CHANGED
@@ -1,549 +1,549 @@
//! GPU backend for HTM.
//!
//! Full-GPU pipeline (SP + TM). Per-step state lives entirely on device; the
//! batch API (`step_many_gpu`) uploads T steps of input once, runs T iterations
//! of the full HTM pipeline on GPU, and copies (T, n_cols) u8 + (T,) f32 back
//! to the host in one shot.
//!
//! TM parity with the CPU reference is approximate:
//!   - Segment growth: winner = cell 0 of bursting column (CPU picks
//!     least-used-cell with RNG tiebreak). This is a pragmatic simplification
//!     for GPU atomicity; learning dynamics are preserved.
//!   - Permanences stored as i16 (scaled 0..32767). Rounding differs from
//!     f32 by <= 1 ULP of the scale factor (≈ 3e-5) — inside any meaningful
//!     HTM learning quantum.

#![cfg(feature = "gpu")]

pub mod sp_gpu;
pub mod tm_gpu;
pub mod fused;

#[cfg(test)]
mod tests;

use std::mem::ManuallyDrop;

use pyo3::prelude::*;
use pyo3::types::{PyDict, PyTuple};
use numpy::{PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray2, PyUntypedArrayMethods};

use crate::region::HTMRegionCore;
use crate::sp::SpatialPoolerConfig;
use sp_gpu::SpatialPoolerGpu;
use tm_gpu::TemporalMemoryGpu;
use fused::FusedState;

/// Extract (device_ptr, shape, typestr) from a `__cuda_array_interface__` dict.
/// Returns Err if the dict is malformed. Used by `step_many_cuda` to wrap
/// torch-owned CUDA allocations zero-copy.
fn cai_parse(cai: &Bound<'_, PyDict>) -> PyResult<(u64, Vec<usize>, String)> {
    // `data` is a (ptr: int, readonly: bool) tuple.
    let data_obj = cai.get_item("data")?
        .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("CAI missing 'data'"))?;
    let data_tup: Bound<'_, PyTuple> = data_obj.downcast_into()
        .map_err(|_| pyo3::exceptions::PyValueError::new_err("CAI 'data' must be a tuple"))?;
    let ptr: u64 = data_tup.get_item(0)?.extract()?;

    // `shape` is a tuple of ints.
    let shape_obj = cai.get_item("shape")?
        .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("CAI missing 'shape'"))?;
    let shape_tup: Bound<'_, PyTuple> = shape_obj.downcast_into()
        .map_err(|_| pyo3::exceptions::PyValueError::new_err("CAI 'shape' must be a tuple"))?;
    let shape: Vec<usize> = (0..shape_tup.len())
        .map(|i| shape_tup.get_item(i).and_then(|v| v.extract::<usize>()))
        .collect::<PyResult<Vec<_>>>()?;

    // `typestr` (e.g. "|u1", "<f4").
    let typestr_obj = cai.get_item("typestr")?
        .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("CAI missing 'typestr'"))?;
    let typestr: String = typestr_obj.extract()?;

    // Reject non-contiguous tensors — we don't handle strides.
    if let Some(strides) = cai.get_item("strides")? {
        if !strides.is_none() {
            return Err(pyo3::exceptions::PyValueError::new_err(
                "CAI 'strides' must be None (tensor must be contiguous)",
            ));
        }
    }

    Ok((ptr, shape, typestr))
}
73
-
74
- /// Python-exposed GPU HTM region. Drop-in replacement for `HTMRegion`.
75
- #[pyclass(module = "htm_rust")]
76
- pub struct HTMRegionGpu {
77
- pub(super) sp_gpu: SpatialPoolerGpu,
78
- pub(super) tm_gpu: TemporalMemoryGpu,
79
- pub(super) fused_state: FusedState,
80
- pub(super) n_columns: usize,
81
- pub(super) input_bits: usize,
82
- pub(super) cells_per_column: usize,
83
- }
84
-
85
- #[pymethods]
86
- impl HTMRegionGpu {
87
- #[new]
88
- #[pyo3(signature = (input_bits, n_columns, cells_per_column, seed=42))]
89
- fn new(
90
- input_bits: usize,
91
- n_columns: usize,
92
- cells_per_column: usize,
93
- seed: u64,
94
- ) -> PyResult<Self> {
95
- if input_bits == 0 || n_columns == 0 || cells_per_column == 0 {
96
- return Err(pyo3::exceptions::PyValueError::new_err(
97
- "input_bits, n_columns, cells_per_column must all be > 0",
98
- ));
99
- }
100
- // CPU reference for deterministic SP init.
101
- let cpu_ref = HTMRegionCore::new(input_bits, n_columns, cells_per_column, seed);
102
- let sp_cfg: &SpatialPoolerConfig = &cpu_ref.sp.cfg;
103
- let sp_gpu = SpatialPoolerGpu::from_cpu(&cpu_ref.sp).map_err(|e| {
104
- pyo3::exceptions::PyRuntimeError::new_err(format!(
105
- "GPU SP init failed: {e:?}. Config: input_bits={}, n_columns={}",
106
- sp_cfg.input_bits, sp_cfg.n_columns,
107
- ))
108
- })?;
109
- let dev = sp_gpu.dev_ref().clone();
110
- let tm_gpu = TemporalMemoryGpu::new(dev.clone(), n_columns, cells_per_column).map_err(|e| {
111
- pyo3::exceptions::PyRuntimeError::new_err(format!(
112
- "GPU TM init failed: {e:?}",
113
- ))
114
- })?;
115
- let initial_threshold = sp_gpu.initial_threshold_estimate();
116
- let fused_state = FusedState::new(dev, n_columns, cells_per_column, initial_threshold)
117
- .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!(
118
- "GPU fused state init failed: {e:?}",
119
- )))?;
120
- Ok(Self {
121
- sp_gpu,
122
- tm_gpu,
123
- fused_state,
124
- n_columns,
125
- input_bits,
126
- cells_per_column,
127
- })
128
- }
129
-
130
- #[getter] fn input_bits(&self) -> usize { self.input_bits }
131
- #[getter] fn n_columns(&self) -> usize { self.n_columns }
132
- #[getter] fn cells_per_column(&self) -> usize { self.cells_per_column }
133
-
134
- /// Process T timesteps in one call on GPU. Per-step state (SP + TM) stays
135
- /// on device; only the final (T, n_cols) mask and (T,) anomaly are copied
136
- /// to the host at the end.
137
- #[pyo3(signature = (inputs, learn=true))]
138
- fn step_many_gpu<'py>(
139
- &mut self,
140
- py: Python<'py>,
141
- inputs: PyReadonlyArray2<'py, bool>,
142
- learn: bool,
143
- ) -> PyResult<(Bound<'py, PyArray2<f32>>, Bound<'py, PyArray1<f32>>)> {
144
- let shape = inputs.shape();
145
- if shape.len() != 2 {
146
- return Err(pyo3::exceptions::PyValueError::new_err(
147
- "inputs must be 2-D (T, input_bits)",
148
- ));
149
- }
150
- let t = shape[0];
151
- let bits = shape[1];
152
- if bits != self.input_bits {
153
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
154
- "inputs last dim {bits} != expected input_bits {}",
155
- self.input_bits,
156
- )));
157
- }
158
- let slice = inputs.as_slice()?;
159
- let n_cols = self.n_columns;
160
- let input_vec: Vec<bool> = slice.to_vec();
161
-
162
- let result = py.allow_threads(|| -> Result<(Vec<u8>, Vec<f32>), String> {
163
- // 1. Upload T*input_bits bytes (32 MB at T=2048, bits=16384).
164
- let sdr_u8_all: Vec<u8> = input_vec.iter().map(|&b| b as u8).collect();
165
- let inputs_dev = self
166
- .sp_gpu
167
- .dev_ref()
168
- .htod_sync_copy(&sdr_u8_all)
169
- .map_err(|e| format!("H2D inputs: {e:?}"))?;
170
-
171
- // 2. Allocate output buffers on device.
172
- let mut cols_dev = self.sp_gpu.dev_ref()
173
- .alloc_zeros::<u8>(t * n_cols)
174
- .map_err(|e| format!("alloc cols: {e:?}"))?;
175
- let mut anom_dev = self.sp_gpu.dev_ref()
176
- .alloc_zeros::<f32>(t)
177
- .map_err(|e| format!("alloc anom: {e:?}"))?;
178
-
179
- // 3. Run T steps of SP + TM on GPU with NO per-step host sync.
180
- self.sp_gpu.step_batch_with_tm(
181
- &inputs_dev,
182
- t,
183
- self.input_bits,
184
- learn,
185
- &mut cols_dev,
186
- &mut anom_dev,
187
- &mut self.tm_gpu,
188
- ).map_err(|e| format!("step_batch_with_tm: {e:?}"))?;
189
-
190
- // 4. ONE D2H for the whole run (T * n_cols bytes + T floats).
191
- let cols_host: Vec<u8> = self.sp_gpu.dev_ref()
192
- .dtoh_sync_copy(&cols_dev)
193
- .map_err(|e| format!("D2H cols: {e:?}"))?;
194
- let anom_host: Vec<f32> = self.sp_gpu.dev_ref()
195
- .dtoh_sync_copy(&anom_dev)
196
- .map_err(|e| format!("D2H anom: {e:?}"))?;
197
-
198
- Ok((cols_host, anom_host))
199
- });
200
-
201
- let (cols_u8, anom) = result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
202
-
203
- let cols_f32: Vec<f32> = cols_u8.iter().map(|&b| b as f32).collect();
204
- let cols_arr = numpy::PyArray1::from_vec_bound(py, cols_f32)
205
- .reshape([t, n_cols])
206
- .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("{e}")))?;
207
- let anom_arr = numpy::PyArray1::from_vec_bound(py, anom);
208
- Ok((cols_arr, anom_arr))
209
- }
210
-
211
- /// Zero-copy CUDA path: accept torch tensors via __cuda_array_interface__,
212
- /// write outputs directly into caller-allocated torch tensors. Skips the
213
- /// host round-trip that `step_many_gpu` pays on every call (sdr.cpu() +
214
- /// two D2H copies at the end). This is the hot path for `train.py`.
215
- ///
216
- /// Contract:
217
- /// sdr_cai.shape == (T, input_bits), dtype u8 (0/1 mask)
218
- /// cols_cai.shape == (T, n_columns), dtype u8 (written)
219
- /// anom_cai.shape == (T,), dtype f32 (written)
220
- /// All three tensors must live on the SAME CUDA device as this region.
221
- ///
222
- /// The torch tensors still own their memory β€” this method only wraps
223
- /// them as borrowed CudaSlice views (via ManuallyDrop) so cudarc's Drop
224
- /// impl can't free pytorch's allocator.
225
- #[pyo3(signature = (sdr_cai, cols_cai, anom_cai, learn=true))]
226
- fn step_many_cuda(
227
- &mut self,
228
- py: Python<'_>,
229
- sdr_cai: &Bound<'_, PyDict>,
230
- cols_cai: &Bound<'_, PyDict>,
231
- anom_cai: &Bound<'_, PyDict>,
232
- learn: bool,
233
- ) -> PyResult<()> {
234
- let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(sdr_cai)?;
235
- let (cols_ptr, cols_shape, cols_type) = cai_parse(cols_cai)?;
236
- let (anom_ptr, anom_shape, anom_type) = cai_parse(anom_cai)?;
237
-
238
- // typestr sanity. numpy u1 is what torch.uint8 exports.
239
- if sdr_type != "|u1" {
240
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
241
- "sdr_cai typestr must be '|u1' (uint8), got {sdr_type}",
242
- )));
243
- }
244
- if cols_type != "|u1" {
245
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
246
- "cols_cai typestr must be '|u1' (uint8), got {cols_type}",
247
- )));
248
- }
249
- if anom_type != "<f4" && anom_type != "=f4" {
250
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
251
- "anom_cai typestr must be '<f4' (float32), got {anom_type}",
252
- )));
253
- }
254
-
255
- // Shape validation.
256
- if sdr_shape.len() != 2 || sdr_shape[1] != self.input_bits {
257
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
258
- "sdr_cai shape {sdr_shape:?} != (T, {})",
259
- self.input_bits,
260
- )));
261
- }
262
- let t = sdr_shape[0];
263
- if cols_shape != [t, self.n_columns] {
264
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
265
- "cols_cai shape {cols_shape:?} != ({t}, {})",
266
- self.n_columns,
267
- )));
268
- }
269
- if anom_shape != [t] {
270
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
271
- "anom_cai shape {anom_shape:?} != ({t},)",
272
- )));
273
- }
274
-
275
- let dev = self.sp_gpu.dev_ref().clone();
276
- let n_cols = self.n_columns;
277
- let input_bits = self.input_bits;
278
-
279
- let result = py.allow_threads(|| -> Result<(), String> {
280
- // SAFETY:
281
- // - ptrs came from torch CUDA tensors validated non-null by the
282
- // __cuda_array_interface__ contract.
283
- // - lens computed from validated shapes.
284
- // - We wrap the returned CudaSlice in ManuallyDrop so cudarc's
285
- // Drop (which calls cuMemFree) never runs against torch memory.
286
- // The underlying allocation is owned+freed by torch.
287
- // - The slices are used only for the duration of this call;
288
- // torch guarantees the backing tensors are live across it
289
- // (Python holds refs on the wrapping tensors).
290
- let inputs_dev = ManuallyDrop::new(unsafe {
291
- dev.upgrade_device_ptr::<u8>(sdr_ptr, t * input_bits)
292
- });
293
- let mut cols_dev = ManuallyDrop::new(unsafe {
294
- dev.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols)
295
- });
296
- let mut anom_dev = ManuallyDrop::new(unsafe {
297
- dev.upgrade_device_ptr::<f32>(anom_ptr, t)
298
- });
299
-
300
- self.sp_gpu.step_batch_with_tm(
301
- &inputs_dev,
302
- t,
303
- input_bits,
304
- learn,
305
- &mut cols_dev,
306
- &mut anom_dev,
307
- &mut self.tm_gpu,
308
- ).map_err(|e| format!("step_batch_with_tm: {e:?}"))?;
309
-
310
- // Synchronize: kernel writes must be visible to the next torch
311
- // op that reads cols/anom. Pytorch's default stream is stream 0,
312
- // and cudarc launches on its own stream β€” a full device sync
313
- // is the simplest correct barrier. (Could narrow to a stream
314
- // wait event in PR 2.)
315
- // No dev.synchronize() here: caller must explicitly sync via the
316
- // `device_sync()` method (or PyTorch auto-syncs when the output
317
- // tensor is next consumed). Removing the per-launch barrier lets
318
- // subsequent GPU work (mamba3 fwd, etc.) overlap in time.
319
- Ok(())
320
- });
321
-
322
- result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
323
- Ok(())
324
- }
325
-
326
- /// Clear TM state on the GPU.
327
- fn reset(&mut self) -> PyResult<()> {
328
- self.tm_gpu.reset().map_err(|e| {
329
- pyo3::exceptions::PyRuntimeError::new_err(format!("GPU TM reset: {e:?}"))
330
- })?;
331
- self.fused_state.reset().map_err(|e| {
332
- pyo3::exceptions::PyRuntimeError::new_err(format!("GPU fused reset: {e:?}"))
333
- })
334
- }
335
-
336
- /// FUSED MEGAKERNEL PATH: single CUDA launch for the entire T-step
337
- /// forward (SP + TM all in one). Accepts torch CUDA tensors via
338
- /// `__cuda_array_interface__` (zero-copy). Writes active-column mask +
339
- /// anomaly directly into caller-allocated torch tensors.
340
- ///
341
- /// Semantics diverge from `step_many_cuda` in one important way: column
342
- /// activation uses per-column threshold inhibition instead of global
343
- /// top-K. The threshold is EMA-adapted per column toward the sparsity
344
- /// target. See `docs/GPU_HTM.md` Β§Fused Kernel.
345
- #[pyo3(signature = (sdr_cai, cols_cai, anom_cai, learn=true))]
346
- fn step_many_fused_cuda(
347
- &mut self,
348
- py: Python<'_>,
349
- sdr_cai: &Bound<'_, PyDict>,
350
- cols_cai: &Bound<'_, PyDict>,
351
- anom_cai: &Bound<'_, PyDict>,
352
- learn: bool,
353
- ) -> PyResult<()> {
354
- let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(sdr_cai)?;
355
- let (cols_ptr, cols_shape, cols_type) = cai_parse(cols_cai)?;
356
- let (anom_ptr, anom_shape, anom_type) = cai_parse(anom_cai)?;
357
-
358
- if sdr_type != "|u1" {
359
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
360
- "sdr_cai typestr must be '|u1' (uint8), got {sdr_type}",
361
- )));
362
- }
363
- if cols_type != "|u1" {
364
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
365
- "cols_cai typestr must be '|u1' (uint8), got {cols_type}",
366
- )));
367
- }
368
- if anom_type != "<f4" && anom_type != "=f4" {
369
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
370
- "anom_cai typestr must be '<f4' (float32), got {anom_type}",
371
- )));
372
- }
373
-
374
- if sdr_shape.len() != 2 || sdr_shape[1] != self.input_bits {
375
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
376
- "sdr_cai shape {sdr_shape:?} != (T, {})",
377
- self.input_bits,
378
- )));
379
- }
380
- let t = sdr_shape[0];
381
- if cols_shape != [t, self.n_columns] {
382
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
383
- "cols_cai shape {cols_shape:?} != ({t}, {})",
384
- self.n_columns,
385
- )));
386
- }
387
- if anom_shape != [t] {
388
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
389
- "anom_cai shape {anom_shape:?} != ({t},)",
390
- )));
391
- }
392
-
393
- let dev = self.sp_gpu.dev_ref().clone();
394
- let n_cols = self.n_columns;
395
- let input_bits = self.input_bits;
396
-
397
- let result = py.allow_threads(|| -> Result<(), String> {
398
- let inputs_dev = ManuallyDrop::new(unsafe {
399
- dev.upgrade_device_ptr::<u8>(sdr_ptr, t * input_bits)
400
- });
401
- let mut cols_dev = ManuallyDrop::new(unsafe {
402
- dev.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols)
403
- });
404
- let mut anom_dev = ManuallyDrop::new(unsafe {
405
- dev.upgrade_device_ptr::<f32>(anom_ptr, t)
406
- });
407
-
408
- fused::launch_fused(
409
- &mut self.sp_gpu,
410
- &mut self.tm_gpu,
411
- &mut self.fused_state,
412
- &inputs_dev,
413
- &mut cols_dev,
414
- &mut anom_dev,
415
- t,
416
- input_bits,
417
- learn,
418
- ).map_err(|e| format!("launch_fused: {e:?}"))?;
419
-
420
- // No dev.synchronize() here: caller must explicitly sync via the
421
- // `device_sync()` method (or PyTorch auto-syncs when the output
422
- // tensor is next consumed). Removing the per-launch barrier lets
423
- // subsequent GPU work (mamba3 fwd, etc.) overlap in time.
424
- Ok(())
425
- });
426
-
427
- result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
428
- Ok(())
429
- }
430
-
431
- /// Explicit device synchronization β€” the caller must invoke this after
432
- /// all batched `step_many_*_cuda` calls complete, before reading the
433
- /// output tensors from a different CUDA stream. Equivalent to the old
434
- /// per-call `dev.synchronize()` that was removed for overlap.
435
- fn device_sync(&self) -> PyResult<()> {
436
- let dev = self.sp_gpu.dev_ref();
437
- dev.synchronize()
438
- .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("sync: {e:?}")))?;
439
- Ok(())
440
- }
441
- }
442
-
443
- /// Batch B regions into ONE cooperative kernel launch. Breaks through the
444
- /// CUDA cooperative-kernel device-level serialization: a single cooperative
445
- /// launch with grid.y=B processes all regions concurrently β€” ~BΓ— speedup
446
- /// over B sequential launches.
447
- ///
448
- /// All regions must have the same config (input_bits, n_columns,
449
- /// cells_per_column). Each region keeps its independent GPU state.
450
- /// Does NOT sync; caller must invoke `device_sync()` on any region
451
- /// afterwards (or rely on a downstream torch op to auto-sync).
452
- #[pyfunction]
453
- #[pyo3(signature = (regions, sdr_cais, cols_cais, anom_cais, learn=true))]
454
- fn step_batch_fused_cuda(
455
- py: Python<'_>,
456
- regions: Vec<Py<HTMRegionGpu>>,
457
- sdr_cais: Vec<Bound<'_, PyDict>>,
458
- cols_cais: Vec<Bound<'_, PyDict>>,
459
- anom_cais: Vec<Bound<'_, PyDict>>,
460
- learn: bool,
461
- ) -> PyResult<()> {
462
- let b = regions.len();
463
- if b == 0 {
464
- return Err(pyo3::exceptions::PyValueError::new_err("regions is empty"));
465
- }
466
- if sdr_cais.len() != b || cols_cais.len() != b || anom_cais.len() != b {
467
- return Err(pyo3::exceptions::PyValueError::new_err(
468
- "sdr_cais / cols_cais / anom_cais length must match regions",
469
- ));
470
- }
471
-
472
- // Parse all CAI dicts; collect device pointers. Validate shapes/dtypes.
473
- let mut sdr_ptrs = Vec::with_capacity(b);
474
- let mut cols_ptrs = Vec::with_capacity(b);
475
- let mut anom_ptrs = Vec::with_capacity(b);
476
- let (input_bits, n_columns, t) = {
477
- let r0 = regions[0].bind(py).borrow();
478
- (r0.input_bits, r0.n_columns, {
479
- let (_p, sh, _ty) = cai_parse(&sdr_cais[0])?;
480
- if sh.len() != 2 {
481
- return Err(pyo3::exceptions::PyValueError::new_err(
482
- format!("sdr_cai must be 2-D (T, input_bits), got {sh:?}"),
483
- ));
484
- }
485
- sh[0]
486
- })
487
- };
488
-
489
- for i in 0..b {
490
- let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(&sdr_cais[i])?;
491
- let (cols_ptr, cols_shape, cols_type) = cai_parse(&cols_cais[i])?;
492
- let (anom_ptr, anom_shape, anom_type) = cai_parse(&anom_cais[i])?;
493
- if sdr_type != "|u1" || cols_type != "|u1" {
494
- return Err(pyo3::exceptions::PyValueError::new_err(
495
- "sdr/cols typestr must be '|u1' (uint8)",
496
- ));
497
- }
498
- if anom_type != "<f4" && anom_type != "=f4" {
499
- return Err(pyo3::exceptions::PyValueError::new_err(
500
- "anom typestr must be '<f4' (float32)",
501
- ));
502
- }
503
- if sdr_shape != [t, input_bits] {
504
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
505
- "sdr[{i}] shape {sdr_shape:?} != ({t}, {input_bits})"
506
- )));
507
- }
508
- if cols_shape != [t, n_columns] {
509
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
510
- "cols[{i}] shape {cols_shape:?} != ({t}, {n_columns})"
511
- )));
512
- }
513
- if anom_shape != [t] {
514
- return Err(pyo3::exceptions::PyValueError::new_err(format!(
515
- "anom[{i}] shape {anom_shape:?} != ({t},)"
516
- )));
517
- }
518
- sdr_ptrs.push(sdr_ptr);
519
- cols_ptrs.push(cols_ptr);
520
- anom_ptrs.push(anom_ptr);
521
- }
522
-
523
- // Exclusively borrow each region. PyRefMut guarantees uniqueness.
524
- let mut region_refs: Vec<pyo3::PyRefMut<HTMRegionGpu>> =
525
- regions.iter().map(|p| p.bind(py).borrow_mut()).collect();
526
- // Collect raw mutable pointers β€” each PyRefMut exclusively borrows its
527
- // region for the lifetime of this call, so pointers stay valid and
528
- // unique. launch_fused_batched_raw only dereferences one region at a
529
- // time, not constructing an aliased slice.
530
- let raw_ptrs: Vec<*mut HTMRegionGpu> = region_refs
531
- .iter_mut()
532
- .map(|r| &mut **r as *mut HTMRegionGpu)
533
- .collect();
534
-
535
- // No allow_threads: raw pointers aren't Send. The launch is GPU-queued
536
- // and sync'd downstream; holding the GIL for the duration is cheap.
537
- fused::launch_fused_batched_raw(
538
- &raw_ptrs, &sdr_ptrs, &cols_ptrs, &anom_ptrs,
539
- t, input_bits, learn,
540
- )
541
- .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("launch_fused_batched: {e:?}")))?;
542
- Ok(())
543
- }
544
-
545
- pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
546
- m.add_class::<HTMRegionGpu>()?;
547
- m.add_function(pyo3::wrap_pyfunction!(step_batch_fused_cuda, m)?)?;
548
- Ok(())
549
- }
 
+ //! GPU backend for HTM.
+ //!
+ //! Full-GPU pipeline (SP + TM). Per-step state lives entirely on device; the
+ //! batch API (`step_many_gpu`) uploads T steps of input once, runs T iterations
+ //! of the full HTM pipeline on GPU, and copies (T, n_cols) u8 + (T,) f32 back
+ //! to the host in one shot.
+ //!
+ //! TM parity with the CPU reference is approximate:
+ //! - Segment growth: winner = cell 0 of bursting column (CPU picks
+ //! least-used-cell with RNG tiebreak). This is a pragmatic simplification
+ //! for GPU atomicity; learning dynamics are preserved.
+ //! - Permanences stored as i16 (scaled 0..32767). Rounding differs from
+ //! f32 by <= 1 ULP of the scale factor (≈ 3e-5) -- inside any meaningful
+ //! HTM learning quantum.
+
+ #![cfg(feature = "gpu")]
+
+ pub mod sp_gpu;
+ pub mod tm_gpu;
+ pub mod fused;
+
+ #[cfg(test)]
+ mod tests;
+
+ use std::mem::ManuallyDrop;
+
+ use pyo3::prelude::*;
+ use pyo3::types::{PyDict, PyTuple};
+ use numpy::{PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray2, PyUntypedArrayMethods};
+
+ use crate::region::HTMRegionCore;
+ use crate::sp::SpatialPoolerConfig;
+ use sp_gpu::SpatialPoolerGpu;
+ use tm_gpu::TemporalMemoryGpu;
+ use fused::FusedState;
+
+ /// Extract (device_ptr, shape, typestr) from a `__cuda_array_interface__` dict.
+ /// Returns Err if the dict is malformed. Used by `step_many_cuda` to wrap
+ /// torch-owned CUDA allocations zero-copy.
+ fn cai_parse(cai: &Bound<'_, PyDict>) -> PyResult<(u64, Vec<usize>, String)> {
+ // `data` is a (ptr: int, readonly: bool) tuple.
+ let data_obj = cai.get_item("data")?
+ .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("CAI missing 'data'"))?;
+ let data_tup: Bound<'_, PyTuple> = data_obj.downcast_into()
+ .map_err(|_| pyo3::exceptions::PyValueError::new_err("CAI 'data' must be a tuple"))?;
+ let ptr: u64 = data_tup.get_item(0)?.extract()?;
+
+ // `shape` is a tuple of ints.
+ let shape_obj = cai.get_item("shape")?
+ .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("CAI missing 'shape'"))?;
+ let shape_tup: Bound<'_, PyTuple> = shape_obj.downcast_into()
+ .map_err(|_| pyo3::exceptions::PyValueError::new_err("CAI 'shape' must be a tuple"))?;
+ let shape: Vec<usize> = (0..shape_tup.len())
+ .map(|i| shape_tup.get_item(i).and_then(|v| v.extract::<usize>()))
+ .collect::<PyResult<Vec<_>>>()?;
+
+ // `typestr` (e.g. "|u1", "<f4").
+ let typestr_obj = cai.get_item("typestr")?
+ .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("CAI missing 'typestr'"))?;
+ let typestr: String = typestr_obj.extract()?;
+
+ // Reject non-contiguous tensors -- we don't handle strides.
+ if let Some(strides) = cai.get_item("strides")? {
+ if !strides.is_none() {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ "CAI 'strides' must be None (tensor must be contiguous)",
+ ));
+ }
+ }
+
+ Ok((ptr, shape, typestr))
+ }
+
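For reference, the dict this parser consumes is what PyTorch exports on CUDA tensors. A minimal sketch (values illustrative; for a contiguous tensor torch reports `strides: None`, which is exactly what `cai_parse` requires):

```python
import torch

sdr = torch.zeros(2048, 16384, dtype=torch.uint8, device="cuda")
cai = sdr.__cuda_array_interface__
# Roughly: {'data': (<device ptr>, False), 'shape': (2048, 16384),
#           'typestr': '|u1', 'strides': None, 'version': 2}
assert cai["typestr"] == "|u1" and cai["strides"] is None
```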
+ /// Python-exposed GPU HTM region. Drop-in replacement for `HTMRegion`.
+ #[pyclass(module = "htm_rust")]
+ pub struct HTMRegionGpu {
+ pub(super) sp_gpu: SpatialPoolerGpu,
+ pub(super) tm_gpu: TemporalMemoryGpu,
+ pub(super) fused_state: FusedState,
+ pub(super) n_columns: usize,
+ pub(super) input_bits: usize,
+ pub(super) cells_per_column: usize,
+ }
+
+ #[pymethods]
+ impl HTMRegionGpu {
+ #[new]
+ #[pyo3(signature = (input_bits, n_columns, cells_per_column, seed=42))]
+ fn new(
+ input_bits: usize,
+ n_columns: usize,
+ cells_per_column: usize,
+ seed: u64,
+ ) -> PyResult<Self> {
+ if input_bits == 0 || n_columns == 0 || cells_per_column == 0 {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ "input_bits, n_columns, cells_per_column must all be > 0",
+ ));
+ }
+ // CPU reference for deterministic SP init.
+ let cpu_ref = HTMRegionCore::new(input_bits, n_columns, cells_per_column, seed);
+ let sp_cfg: &SpatialPoolerConfig = &cpu_ref.sp.cfg;
+ let sp_gpu = SpatialPoolerGpu::from_cpu(&cpu_ref.sp).map_err(|e| {
+ pyo3::exceptions::PyRuntimeError::new_err(format!(
+ "GPU SP init failed: {e:?}. Config: input_bits={}, n_columns={}",
+ sp_cfg.input_bits, sp_cfg.n_columns,
+ ))
+ })?;
+ let dev = sp_gpu.dev_ref().clone();
+ let tm_gpu = TemporalMemoryGpu::new(dev.clone(), n_columns, cells_per_column).map_err(|e| {
+ pyo3::exceptions::PyRuntimeError::new_err(format!(
+ "GPU TM init failed: {e:?}",
+ ))
+ })?;
+ let initial_threshold = sp_gpu.initial_threshold_estimate();
+ let fused_state = FusedState::new(dev, n_columns, cells_per_column, initial_threshold)
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!(
+ "GPU fused state init failed: {e:?}",
+ )))?;
+ Ok(Self {
+ sp_gpu,
+ tm_gpu,
+ fused_state,
+ n_columns,
+ input_bits,
+ cells_per_column,
+ })
+ }
+
+ #[getter] fn input_bits(&self) -> usize { self.input_bits }
+ #[getter] fn n_columns(&self) -> usize { self.n_columns }
+ #[getter] fn cells_per_column(&self) -> usize { self.cells_per_column }
+
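A hedged construction sketch (module name taken from the `#[pyclass(module = "htm_rust")]` attribute; sizes mirror the HYDRA training config, while `cells_per_column=8` is illustrative):

```python
import htm_rust  # assumes the extension is built with the `gpu` feature enabled

region = htm_rust.HTMRegionGpu(
    input_bits=16384, n_columns=2048, cells_per_column=8, seed=42
)
assert (region.input_bits, region.n_columns) == (16384, 2048)
```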
+ /// Process T timesteps in one call on GPU. Per-step state (SP + TM) stays
+ /// on device; only the final (T, n_cols) mask and (T,) anomaly are copied
+ /// to the host at the end.
+ #[pyo3(signature = (inputs, learn=true))]
+ fn step_many_gpu<'py>(
+ &mut self,
+ py: Python<'py>,
+ inputs: PyReadonlyArray2<'py, bool>,
+ learn: bool,
+ ) -> PyResult<(Bound<'py, PyArray2<f32>>, Bound<'py, PyArray1<f32>>)> {
+ let shape = inputs.shape();
+ if shape.len() != 2 {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ "inputs must be 2-D (T, input_bits)",
+ ));
+ }
+ let t = shape[0];
+ let bits = shape[1];
+ if bits != self.input_bits {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "inputs last dim {bits} != expected input_bits {}",
+ self.input_bits,
+ )));
+ }
+ let slice = inputs.as_slice()?;
+ let n_cols = self.n_columns;
+ let input_vec: Vec<bool> = slice.to_vec();
+
+ let result = py.allow_threads(|| -> Result<(Vec<u8>, Vec<f32>), String> {
+ // 1. Upload T*input_bits bytes (32 MB at T=2048, bits=16384).
+ let sdr_u8_all: Vec<u8> = input_vec.iter().map(|&b| b as u8).collect();
+ let inputs_dev = self
+ .sp_gpu
+ .dev_ref()
+ .htod_sync_copy(&sdr_u8_all)
+ .map_err(|e| format!("H2D inputs: {e:?}"))?;
+
+ // 2. Allocate output buffers on device.
+ let mut cols_dev = self.sp_gpu.dev_ref()
+ .alloc_zeros::<u8>(t * n_cols)
+ .map_err(|e| format!("alloc cols: {e:?}"))?;
+ let mut anom_dev = self.sp_gpu.dev_ref()
+ .alloc_zeros::<f32>(t)
+ .map_err(|e| format!("alloc anom: {e:?}"))?;
+
+ // 3. Run T steps of SP + TM on GPU with NO per-step host sync.
+ self.sp_gpu.step_batch_with_tm(
+ &inputs_dev,
+ t,
+ self.input_bits,
+ learn,
+ &mut cols_dev,
+ &mut anom_dev,
+ &mut self.tm_gpu,
+ ).map_err(|e| format!("step_batch_with_tm: {e:?}"))?;
+
+ // 4. ONE D2H for the whole run (T * n_cols bytes + T floats).
+ let cols_host: Vec<u8> = self.sp_gpu.dev_ref()
+ .dtoh_sync_copy(&cols_dev)
+ .map_err(|e| format!("D2H cols: {e:?}"))?;
+ let anom_host: Vec<f32> = self.sp_gpu.dev_ref()
+ .dtoh_sync_copy(&anom_dev)
+ .map_err(|e| format!("D2H anom: {e:?}"))?;
+
+ Ok((cols_host, anom_host))
+ });
+
+ let (cols_u8, anom) = result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
+
+ let cols_f32: Vec<f32> = cols_u8.iter().map(|&b| b as f32).collect();
+ let cols_arr = numpy::PyArray1::from_vec_bound(py, cols_f32)
+ .reshape([t, n_cols])
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("{e}")))?;
+ let anom_arr = numpy::PyArray1::from_vec_bound(py, anom);
+ Ok((cols_arr, anom_arr))
+ }
+
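A usage sketch for this host-copy path (`region` as constructed above; the ~2% SDR density is illustrative):

```python
import numpy as np

T = 128
inputs = np.random.rand(T, region.input_bits) < 0.02  # (T, bits) bool SDRs
cols, anomaly = region.step_many_gpu(inputs, learn=True)
assert cols.shape == (T, region.n_columns)
assert anomaly.shape == (T,)
```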
+ /// Zero-copy CUDA path: accept torch tensors via __cuda_array_interface__,
+ /// write outputs directly into caller-allocated torch tensors. Skips the
+ /// host round-trip that `step_many_gpu` pays on every call (sdr.cpu() +
+ /// two D2H copies at the end). This is the hot path for `train.py`.
+ ///
+ /// Contract:
+ /// sdr_cai.shape == (T, input_bits), dtype u8 (0/1 mask)
+ /// cols_cai.shape == (T, n_columns), dtype u8 (written)
+ /// anom_cai.shape == (T,), dtype f32 (written)
+ /// All three tensors must live on the SAME CUDA device as this region.
+ ///
+ /// The torch tensors still own their memory -- this method only wraps
+ /// them as borrowed CudaSlice views (via ManuallyDrop) so cudarc's Drop
+ /// impl never frees memory that PyTorch's allocator owns.
+ #[pyo3(signature = (sdr_cai, cols_cai, anom_cai, learn=true))]
+ fn step_many_cuda(
+ &mut self,
+ py: Python<'_>,
+ sdr_cai: &Bound<'_, PyDict>,
+ cols_cai: &Bound<'_, PyDict>,
+ anom_cai: &Bound<'_, PyDict>,
+ learn: bool,
+ ) -> PyResult<()> {
+ let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(sdr_cai)?;
+ let (cols_ptr, cols_shape, cols_type) = cai_parse(cols_cai)?;
+ let (anom_ptr, anom_shape, anom_type) = cai_parse(anom_cai)?;
+
+ // typestr sanity. numpy u1 is what torch.uint8 exports.
+ if sdr_type != "|u1" {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "sdr_cai typestr must be '|u1' (uint8), got {sdr_type}",
+ )));
+ }
+ if cols_type != "|u1" {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "cols_cai typestr must be '|u1' (uint8), got {cols_type}",
+ )));
+ }
+ if anom_type != "<f4" && anom_type != "=f4" {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "anom_cai typestr must be '<f4' (float32), got {anom_type}",
+ )));
+ }
+
+ // Shape validation.
+ if sdr_shape.len() != 2 || sdr_shape[1] != self.input_bits {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "sdr_cai shape {sdr_shape:?} != (T, {})",
+ self.input_bits,
+ )));
+ }
+ let t = sdr_shape[0];
+ if cols_shape != [t, self.n_columns] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "cols_cai shape {cols_shape:?} != ({t}, {})",
+ self.n_columns,
+ )));
+ }
+ if anom_shape != [t] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "anom_cai shape {anom_shape:?} != ({t},)",
+ )));
+ }
+
+ let dev = self.sp_gpu.dev_ref().clone();
+ let n_cols = self.n_columns;
+ let input_bits = self.input_bits;
+
+ let result = py.allow_threads(|| -> Result<(), String> {
+ // SAFETY:
+ // - ptrs came from torch CUDA tensors validated non-null by the
+ // __cuda_array_interface__ contract.
+ // - lens computed from validated shapes.
+ // - We wrap the returned CudaSlice in ManuallyDrop so cudarc's
+ // Drop (which calls cuMemFree) never runs against torch memory.
+ // The underlying allocation is owned+freed by torch.
+ // - The slices are used only for the duration of this call;
+ // torch guarantees the backing tensors are live across it
+ // (Python holds refs on the wrapping tensors).
+ let inputs_dev = ManuallyDrop::new(unsafe {
+ dev.upgrade_device_ptr::<u8>(sdr_ptr, t * input_bits)
+ });
+ let mut cols_dev = ManuallyDrop::new(unsafe {
+ dev.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols)
+ });
+ let mut anom_dev = ManuallyDrop::new(unsafe {
+ dev.upgrade_device_ptr::<f32>(anom_ptr, t)
+ });
+
+ self.sp_gpu.step_batch_with_tm(
+ &inputs_dev,
+ t,
+ input_bits,
+ learn,
+ &mut cols_dev,
+ &mut anom_dev,
+ &mut self.tm_gpu,
+ ).map_err(|e| format!("step_batch_with_tm: {e:?}"))?;
+
+ // No dev.synchronize() here: caller must explicitly sync via the
+ // `device_sync()` method (or PyTorch auto-syncs when the output
+ // tensor is next consumed). Removing the per-launch barrier lets
+ // subsequent GPU work (mamba3 fwd, etc.) overlap in time.
+ Ok(())
+ });
+
+ result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
+ Ok(())
+ }
+
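A hedged sketch of the zero-copy contract with torch-allocated buffers (all three tensors must sit on the region's device):

```python
import torch

T = 2048
sdr = (torch.rand(T, region.input_bits, device="cuda") < 0.02).to(torch.uint8)
cols = torch.empty(T, region.n_columns, dtype=torch.uint8, device="cuda")
anom = torch.empty(T, dtype=torch.float32, device="cuda")
region.step_many_cuda(
    sdr.__cuda_array_interface__,
    cols.__cuda_array_interface__,
    anom.__cuda_array_interface__,
    learn=True,
)
region.device_sync()  # barrier before cols/anom are read on another stream
```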
+ /// Clear TM and fused-kernel state on the GPU.
+ fn reset(&mut self) -> PyResult<()> {
+ self.tm_gpu.reset().map_err(|e| {
+ pyo3::exceptions::PyRuntimeError::new_err(format!("GPU TM reset: {e:?}"))
+ })?;
+ self.fused_state.reset().map_err(|e| {
+ pyo3::exceptions::PyRuntimeError::new_err(format!("GPU fused reset: {e:?}"))
+ })
+ }
+
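Hedged usage: clear temporal state between unrelated sequences (`corpus` is a hypothetical iterable of `(T, input_bits)` bool arrays):

```python
for doc_sdrs in corpus:
    cols, anom = region.step_many_gpu(doc_sdrs, learn=True)
    region.reset()  # don't carry TM context across document boundaries
```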
+ /// FUSED MEGAKERNEL PATH: single CUDA launch for the entire T-step
+ /// forward (SP + TM all in one). Accepts torch CUDA tensors via
+ /// `__cuda_array_interface__` (zero-copy). Writes active-column mask +
+ /// anomaly directly into caller-allocated torch tensors.
+ ///
+ /// Semantics diverge from `step_many_cuda` in one important way: column
+ /// activation uses per-column threshold inhibition instead of global
+ /// top-K. The threshold is EMA-adapted per column toward the sparsity
+ /// target. See `docs/GPU_HTM.md` §Fused Kernel.
+ #[pyo3(signature = (sdr_cai, cols_cai, anom_cai, learn=true))]
+ fn step_many_fused_cuda(
+ &mut self,
+ py: Python<'_>,
+ sdr_cai: &Bound<'_, PyDict>,
+ cols_cai: &Bound<'_, PyDict>,
+ anom_cai: &Bound<'_, PyDict>,
+ learn: bool,
+ ) -> PyResult<()> {
+ let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(sdr_cai)?;
+ let (cols_ptr, cols_shape, cols_type) = cai_parse(cols_cai)?;
+ let (anom_ptr, anom_shape, anom_type) = cai_parse(anom_cai)?;
+
+ if sdr_type != "|u1" {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "sdr_cai typestr must be '|u1' (uint8), got {sdr_type}",
+ )));
+ }
+ if cols_type != "|u1" {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "cols_cai typestr must be '|u1' (uint8), got {cols_type}",
+ )));
+ }
+ if anom_type != "<f4" && anom_type != "=f4" {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "anom_cai typestr must be '<f4' (float32), got {anom_type}",
+ )));
+ }
+
+ if sdr_shape.len() != 2 || sdr_shape[1] != self.input_bits {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "sdr_cai shape {sdr_shape:?} != (T, {})",
+ self.input_bits,
+ )));
+ }
+ let t = sdr_shape[0];
+ if cols_shape != [t, self.n_columns] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "cols_cai shape {cols_shape:?} != ({t}, {})",
+ self.n_columns,
+ )));
+ }
+ if anom_shape != [t] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "anom_cai shape {anom_shape:?} != ({t},)",
+ )));
+ }
+
+ let dev = self.sp_gpu.dev_ref().clone();
+ let n_cols = self.n_columns;
+ let input_bits = self.input_bits;
+
+ let result = py.allow_threads(|| -> Result<(), String> {
+ let inputs_dev = ManuallyDrop::new(unsafe {
+ dev.upgrade_device_ptr::<u8>(sdr_ptr, t * input_bits)
+ });
+ let mut cols_dev = ManuallyDrop::new(unsafe {
+ dev.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols)
+ });
+ let mut anom_dev = ManuallyDrop::new(unsafe {
+ dev.upgrade_device_ptr::<f32>(anom_ptr, t)
+ });
+
+ fused::launch_fused(
+ &mut self.sp_gpu,
+ &mut self.tm_gpu,
+ &mut self.fused_state,
+ &inputs_dev,
+ &mut cols_dev,
+ &mut anom_dev,
+ t,
+ input_bits,
+ learn,
+ ).map_err(|e| format!("launch_fused: {e:?}"))?;
+
+ // No dev.synchronize() here: caller must explicitly sync via the
+ // `device_sync()` method (or PyTorch auto-syncs when the output
+ // tensor is next consumed). Removing the per-launch barrier lets
+ // subsequent GPU work (mamba3 fwd, etc.) overlap in time.
+ Ok(())
+ });
+
+ result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
+ Ok(())
+ }
+
+ /// Explicit device synchronization -- the caller must invoke this after
+ /// all batched `step_many_*_cuda` calls complete, before reading the
+ /// output tensors from a different CUDA stream. Equivalent to the old
+ /// per-call `dev.synchronize()` that was removed for overlap.
+ fn device_sync(&self) -> PyResult<()> {
+ let dev = self.sp_gpu.dev_ref();
+ dev.synchronize()
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("sync: {e:?}")))?;
+ Ok(())
+ }
+ }
+
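The intended call pattern, sketched (names like `other_gpu_work` are hypothetical stand-ins for whatever GPU work overlaps the HTM launch):

```python
region.step_many_fused_cuda(
    sdr.__cuda_array_interface__,
    cols.__cuda_array_interface__,
    anom.__cuda_array_interface__,
    learn=True,
)
other_gpu_work()      # hypothetical: e.g. the mamba3 forward, overlapped in time
region.device_sync()  # one explicit barrier before cols/anom are consumed
```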
+ /// Batch B regions into ONE cooperative kernel launch. Breaks through the
+ /// CUDA cooperative-kernel device-level serialization: a single cooperative
+ /// launch with grid.y=B processes all regions concurrently -- ~B× speedup
+ /// over B sequential launches.
+ ///
+ /// All regions must have the same config (input_bits, n_columns,
+ /// cells_per_column). Each region keeps its independent GPU state.
+ /// Does NOT sync; caller must invoke `device_sync()` on any region
+ /// afterwards (or rely on a downstream torch op to auto-sync).
+ #[pyfunction]
+ #[pyo3(signature = (regions, sdr_cais, cols_cais, anom_cais, learn=true))]
+ fn step_batch_fused_cuda(
+ py: Python<'_>,
+ regions: Vec<Py<HTMRegionGpu>>,
+ sdr_cais: Vec<Bound<'_, PyDict>>,
+ cols_cais: Vec<Bound<'_, PyDict>>,
+ anom_cais: Vec<Bound<'_, PyDict>>,
+ learn: bool,
+ ) -> PyResult<()> {
+ let b = regions.len();
+ if b == 0 {
+ return Err(pyo3::exceptions::PyValueError::new_err("regions is empty"));
+ }
+ if sdr_cais.len() != b || cols_cais.len() != b || anom_cais.len() != b {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ "sdr_cais / cols_cais / anom_cais length must match regions",
+ ));
+ }
+
+ // Parse all CAI dicts; collect device pointers. Validate shapes/dtypes.
+ let mut sdr_ptrs = Vec::with_capacity(b);
+ let mut cols_ptrs = Vec::with_capacity(b);
+ let mut anom_ptrs = Vec::with_capacity(b);
+ let (input_bits, n_columns, t) = {
+ let r0 = regions[0].bind(py).borrow();
+ (r0.input_bits, r0.n_columns, {
+ let (_p, sh, _ty) = cai_parse(&sdr_cais[0])?;
+ if sh.len() != 2 {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ format!("sdr_cai must be 2-D (T, input_bits), got {sh:?}"),
+ ));
+ }
+ sh[0]
+ })
+ };
+
+ for i in 0..b {
+ let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(&sdr_cais[i])?;
+ let (cols_ptr, cols_shape, cols_type) = cai_parse(&cols_cais[i])?;
+ let (anom_ptr, anom_shape, anom_type) = cai_parse(&anom_cais[i])?;
+ if sdr_type != "|u1" || cols_type != "|u1" {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ "sdr/cols typestr must be '|u1' (uint8)",
+ ));
+ }
+ if anom_type != "<f4" && anom_type != "=f4" {
+ return Err(pyo3::exceptions::PyValueError::new_err(
+ "anom typestr must be '<f4' (float32)",
+ ));
+ }
+ if sdr_shape != [t, input_bits] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "sdr[{i}] shape {sdr_shape:?} != ({t}, {input_bits})"
+ )));
+ }
+ if cols_shape != [t, n_columns] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "cols[{i}] shape {cols_shape:?} != ({t}, {n_columns})"
+ )));
+ }
+ if anom_shape != [t] {
+ return Err(pyo3::exceptions::PyValueError::new_err(format!(
+ "anom[{i}] shape {anom_shape:?} != ({t},)"
+ )));
+ }
+ sdr_ptrs.push(sdr_ptr);
+ cols_ptrs.push(cols_ptr);
+ anom_ptrs.push(anom_ptr);
+ }
+
+ // Exclusively borrow each region. PyRefMut guarantees uniqueness.
+ let mut region_refs: Vec<pyo3::PyRefMut<HTMRegionGpu>> =
+ regions.iter().map(|p| p.bind(py).borrow_mut()).collect();
+ // Collect raw mutable pointers -- each PyRefMut exclusively borrows its
+ // region for the lifetime of this call, so pointers stay valid and
+ // unique. launch_fused_batched_raw only dereferences one region at a
+ // time and never constructs an aliased slice.
+ let raw_ptrs: Vec<*mut HTMRegionGpu> = region_refs
+ .iter_mut()
+ .map(|r| &mut **r as *mut HTMRegionGpu)
+ .collect();
+
+ // No allow_threads: raw pointers aren't Send. The launch is GPU-queued
+ // and sync'd downstream; holding the GIL for the duration is cheap.
+ fused::launch_fused_batched_raw(
+ &raw_ptrs, &sdr_ptrs, &cols_ptrs, &anom_ptrs,
+ t, input_bits, learn,
+ )
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("launch_fused_batched: {e:?}")))?;
+ Ok(())
+ }
+
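A hedged sketch of the batched entry point (B=8 identical regions; sizes and cell count illustrative):

```python
import htm_rust
import torch

T, BITS, COLS = 2048, 16384, 2048
regions = [htm_rust.HTMRegionGpu(BITS, COLS, 8, seed=s) for s in range(8)]
sdrs  = [torch.zeros(T, BITS, dtype=torch.uint8, device="cuda") for _ in regions]
colss = [torch.empty(T, COLS, dtype=torch.uint8, device="cuda") for _ in regions]
anoms = [torch.empty(T, dtype=torch.float32, device="cuda") for _ in regions]
htm_rust.step_batch_fused_cuda(
    regions,
    [x.__cuda_array_interface__ for x in sdrs],
    [x.__cuda_array_interface__ for x in colss],
    [x.__cuda_array_interface__ for x in anoms],
    learn=True,
)
regions[0].device_sync()  # any region's sync covers the shared device
```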
+ pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
+ m.add_class::<HTMRegionGpu>()?;
+ m.add_function(pyo3::wrap_pyfunction!(step_batch_fused_cuda, m)?)?;
+ Ok(())
+ }
overlay/htm_rust/src/gpu/sp_gpu.rs CHANGED
@@ -1,796 +1,796 @@
- //! GPU implementation of the Spatial Pooler.
- //!
- //! One `SpatialPoolerGpu` owns a set of persistent device buffers + five PTX
- //! kernels. `compute(input, learn)` performs one SP step and returns the
- //! sorted active-column indices (host `Vec<u32>`) -- this is what the CPU
- //! TemporalMemory consumes.
- //!
- //! Persistent state on device (per region):
- //! syn_bit : u32 [n_columns × S] (constant after init)
- //! syn_perm : f32 [n_columns × S] (updated by sp_learn)
- //! boost : f32 [n_columns]
- //! active_duty : f32 [n_columns]
- //! overlap_duty: f32 [n_columns]
- //!
- //! Per-step transient state:
- //! inp_dev : u8 [input_bits] (H2D copy each step)
- //! raw : u32 [n_columns]
- //! boosted : f32 [n_columns]
- //! active_mask : u8 [n_columns] (topk output, D2H at the end)
-
- use std::sync::Arc;
-
- use cudarc::driver::{CudaDevice, CudaSlice, DeviceSlice, DriverError, LaunchAsync, LaunchConfig};
- use cudarc::nvrtc::Ptx;
-
- use crate::sp::SpatialPooler;
-
- // Embed PTX at compile time. HTM_GPU_PTX_DIR is set by build.rs.
- const PTX_SP_OVERLAP: &str =
- include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_overlap.ptx"));
- const PTX_SP_TOPK: &str =
- include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_topk.ptx"));
- const PTX_SP_LEARN: &str =
- include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_learn.ptx"));
- const PTX_SP_DUTY: &str =
- include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_duty.ptx"));
- const PTX_SP_BOOST_FUSED: &str =
- include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_boost_fused.ptx"));
-
- pub struct SpatialPoolerGpu {
- dev: Arc<CudaDevice>,
-
- // Config mirror (we don't touch CPU SpatialPooler after init).
- input_bits: usize,
- n_columns: usize,
- synapses_per_col: usize,
- conn_thr: f32,
- inc: f32,
- dec: f32,
- sparsity: f32,
- duty_period: f32,
- boost_strength: f32,
-
- // Persistent device state.
- syn_bit: CudaSlice<u32>,
- syn_perm: CudaSlice<f32>,
- boost: CudaSlice<f32>,
- active_duty: CudaSlice<f32>,
- overlap_duty: CudaSlice<f32>,
-
- // Transient scratch (reused each step).
- inp_dev: CudaSlice<u8>,
- raw: CudaSlice<u32>,
- boosted: CudaSlice<f32>,
- active_mask: CudaSlice<u8>,
-
- // Reusable host buffer for D2H of active_mask.
- host_mask: Vec<u8>,
-
- /// Strict bit-parity with CPU reference. Enabled for tests.
- /// Forces host-side boost/exp computation and the overlap-duty bump check
- /// every step. Default false for max throughput.
- strict_parity: bool,
- }
-
- impl SpatialPoolerGpu {
- /// Copy CPU SpatialPooler state onto the device. This preserves the
- /// exact seeded proximal synapse layout + initial permanences, so the
- /// GPU SP is a bit-identical parallel implementation of the CPU SP.
- pub fn from_cpu(cpu: &SpatialPooler) -> Result<Self, DriverError> {
- let dev = CudaDevice::new(0)?;
- let cfg = &cpu.cfg;
- let n = cfg.n_columns;
- let s = cfg.potential_synapses;
-
- // Flatten proximal dendrites into column-major arrays.
- let mut syn_bit_h: Vec<u32> = Vec::with_capacity(n * s);
- let mut syn_perm_h: Vec<f32> = Vec::with_capacity(n * s);
- for col in &cpu.columns {
- debug_assert_eq!(col.inputs.len(), s);
- debug_assert_eq!(col.perms.len(), s);
- syn_bit_h.extend_from_slice(&col.inputs);
- syn_perm_h.extend_from_slice(&col.perms);
- }
-
- let syn_bit = dev.htod_sync_copy(&syn_bit_h)?;
- let syn_perm = dev.htod_sync_copy(&syn_perm_h)?;
- let boost = dev.htod_sync_copy(&cpu.boost)?;
- let active_duty = dev.htod_sync_copy(&cpu.active_duty_cycle)?;
- let overlap_duty = dev.htod_sync_copy(&cpu.overlap_duty_cycle)?;
-
- let inp_dev: CudaSlice<u8> = dev.alloc_zeros(cfg.input_bits)?;
- let raw: CudaSlice<u32> = dev.alloc_zeros(n)?;
- let boosted: CudaSlice<f32> = dev.alloc_zeros(n)?;
- let active_mask: CudaSlice<u8> = dev.alloc_zeros(n)?;
-
- // Load PTX modules. Each .ptx is a module containing one `extern "C"`
- // function. CudaDevice::load_ptx stores modules under the given name
- // globally on the device, so we use a deterministic naming scheme to
- // keep multiple SP instances from colliding (cudarc keys on the
- // (module, func) pair).
- let modules = [
- ("htm_sp_overlap", PTX_SP_OVERLAP, "sp_overlap"),
- ("htm_sp_topk", PTX_SP_TOPK, "sp_topk_select"),
- ("htm_sp_learn", PTX_SP_LEARN, "sp_learn"),
- ("htm_sp_duty", PTX_SP_DUTY, "sp_duty_update"),
- ("htm_sp_boost_fused", PTX_SP_BOOST_FUSED, "sp_boost_from_duty"),
- ];
- for (modname, ptx, fnname) in modules {
- // load_ptx is NOT idempotent -- calling twice errors. For multi-region
- // support we check-then-load.
- if dev.get_func(modname, fnname).is_none() {
- dev.load_ptx(Ptx::from_src(ptx), modname, &[fnname])?;
- }
- }
-
- Ok(Self {
- dev,
- input_bits: cfg.input_bits,
- n_columns: n,
- synapses_per_col: s,
- conn_thr: cfg.connected_threshold,
- inc: cfg.syn_perm_active_inc,
- dec: cfg.syn_perm_inactive_dec,
- sparsity: cfg.sparsity,
- duty_period: cfg.duty_cycle_period,
- boost_strength: cfg.boost_strength,
- syn_bit,
- syn_perm,
- boost,
- active_duty,
- overlap_duty,
- inp_dev,
- raw,
- boosted,
- active_mask,
- host_mask: vec![0u8; n],
- strict_parity: false,
- })
- }
-
- /// Enable strict bit-parity mode. Parity tests use this.
- pub fn set_strict_parity(&mut self, strict: bool) {
- self.strict_parity = strict;
- }
-
- /// Access to the underlying CudaDevice for host-side orchestration.
- pub fn dev_ref(&self) -> &Arc<CudaDevice> {
- &self.dev
- }
-
- // --- Fused-path accessors (immutable state reads + pointer-grabs). ---
- pub fn n_columns_accessor(&self) -> usize { self.n_columns }
- #[allow(dead_code)]
- pub fn input_bits_accessor(&self) -> usize { self.input_bits }
- pub fn synapses_per_col_accessor(&self) -> usize { self.synapses_per_col }
- pub fn conn_thr_accessor(&self) -> f32 { self.conn_thr }
- pub fn inc_accessor(&self) -> f32 { self.inc }
- pub fn dec_accessor(&self) -> f32 { self.dec }
- pub fn sparsity_accessor(&self) -> f32 { self.sparsity }
- pub fn duty_period_accessor(&self) -> f32 { self.duty_period }
- #[allow(dead_code)]
- pub fn boost_strength_accessor(&self) -> f32 { self.boost_strength }
-
- pub fn syn_bit_accessor(&self) -> &CudaSlice<u32> { &self.syn_bit }
- pub fn syn_perm_accessor(&self) -> &CudaSlice<f32> { &self.syn_perm }
- pub fn boost_accessor(&self) -> &CudaSlice<f32> { &self.boost }
- pub fn active_duty_accessor(&self) -> &CudaSlice<f32> { &self.active_duty }
-
- /// Estimate an initial inhibition threshold so that the activation rate
- /// starts near the sparsity target. Used to seed `inhibition_threshold`.
- /// Placeholder (returns a conservative constant); the real warmup pass
- /// over raw overlaps happens on the Rust orchestrator side.
- pub fn initial_threshold_estimate(&self) -> f32 {
- // With conn_thr=0.5, init_perm around 0.5±0.1, S=40, sparse SDR at 2%:
- // expected overlap ~ 40 * 0.02 = 0.8 connected hits -> boosted ~ 0.8.
- // Top-K selects the top 2%, so the threshold is roughly the
- // 98th percentile of boosted. Conservative start: 2.0.
- // The per-column adaptation will quickly steer each column's thr.
- 2.0f32
- }
-
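A quick restatement of the estimate in the comment above (illustrative numbers only):

```python
S, input_density = 40, 0.02            # potential synapses, SDR density
expected_overlap = S * input_density   # = 0.8 connected hits per column
seed_threshold = 2.0                   # conservative; adapted per column later
```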
- /// Batched multi-step SP on the GPU. Processes T timesteps from a
- /// pre-uploaded device input buffer. Emits a `(T, n_cols)` u8 active-column
- /// mask to `cols_out` and appends each step's active column indices to
- /// `active_indices_host`, terminating every step with a u32::MAX sentinel.
- ///
- /// For each step, this runs the same 5-kernel pipeline as `compute`, but
- /// skips the per-step boost D2H→exp→H2D round-trip: the default mode runs
- /// the boost update as a fused on-device kernel (strict-parity mode keeps
- /// the host round-trip for bit-exactness).
- ///
- /// `step_batch_with_tm` below is the TM-fused variant of this path that
- /// `HTMRegionGpu.step_many_gpu` uses.
- #[allow(clippy::too_many_arguments)]
- pub fn step_batch(
- &mut self,
- inputs_flat_dev: &CudaSlice<u8>,
- t: usize,
- input_bits: usize,
- learn: bool,
- cols_out: &mut [u8],
- active_indices_host: &mut Vec<u32>,
- ) -> Result<(), DriverError> {
- let n = self.n_columns;
- let k = ((self.sparsity * n as f32).round() as usize).max(1);
- debug_assert_eq!(cols_out.len(), t * n);
-
- let overlap_fn = self.dev.get_func("htm_sp_overlap", "sp_overlap").unwrap();
- let topk_fn = self.dev.get_func("htm_sp_topk", "sp_topk_select").unwrap();
- let learn_fn = self.dev.get_func("htm_sp_learn", "sp_learn").unwrap();
- let duty_fn = self.dev.get_func("htm_sp_duty", "sp_duty_update").unwrap();
-
- let overlap_cfg = LaunchConfig {
- grid_dim: (n as u32, 1, 1),
- block_dim: (128, 1, 1),
- shared_mem_bytes: 0,
- };
- let topk_cfg = LaunchConfig {
- grid_dim: (1, 1, 1),
- block_dim: (256, 1, 1),
- shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
- };
- let learn_cfg = overlap_cfg;
- let duty_cfg = LaunchConfig {
- grid_dim: ((n as u32 + 255) / 256, 1, 1),
- block_dim: (256, 1, 1),
- shared_mem_bytes: 0,
- };
- let alpha = 1.0f32 / self.duty_period.max(1.0);
-
- // Reusable host buffer for the per-step active_mask D2H.
- self.host_mask.resize(n, 0);
-
- active_indices_host.clear();
-
- for ti in 0..t {
- // Point overlap kernel at the ti-th slice of the pre-uploaded input.
- // cudarc CudaSlice doesn't have a "view" per se, so we must copy the
- // slice into the reusable inp_dev buffer. This is a D2D copy -- much
- // faster than H2D.
- // (Alternative: rewrite kernel to accept an offset; deferred.)
- let in_off = ti * input_bits;
- // Use dtod_copy via raw slice indexing: cudarc exposes slice() for this.
- let sub = inputs_flat_dev.slice(in_off..in_off + input_bits);
- self.dev.dtod_copy(&sub, &mut self.inp_dev)?;
-
- // 1. sp_overlap
- unsafe {
- overlap_fn.clone().launch(
- overlap_cfg,
- (
- &self.inp_dev,
- &self.syn_bit,
- &self.syn_perm,
- &self.boost,
- self.conn_thr,
- self.synapses_per_col as u32,
- n as u32,
- &mut self.raw,
- &mut self.boosted,
- ),
- )?;
- }
-
- // 2. Clear active_mask, then sp_topk
- self.dev.memset_zeros(&mut self.active_mask)?;
- unsafe {
- topk_fn.clone().launch(
- topk_cfg,
- (&self.boosted, n as u32, k as u32, &mut self.active_mask),
- )?;
- }
-
- // 3. sp_learn
- if learn {
- unsafe {
- learn_fn.clone().launch(
- learn_cfg,
- (
- &self.active_mask,
- &self.inp_dev,
- &self.syn_bit,
- &mut self.syn_perm,
- self.inc,
- self.dec,
- self.synapses_per_col as u32,
- n as u32,
- ),
- )?;
- }
- }
-
- // 4. duty update (device)
- unsafe {
- duty_fn.clone().launch(
- duty_cfg,
- (
- &self.active_mask,
- &self.raw,
- &mut self.active_duty,
- &mut self.overlap_duty,
- &mut self.boost,
- alpha,
- 1.0f32,
- 0.0f32,
- 0.0f32,
- 0u32,
- n as u32,
- ),
- )?;
- }
-
- // 5. Boost update. Two modes:
- // * strict_parity (tests): host-side exp for bit-exact match.
- // * default (production): GPU expf is close enough and ~10x faster
- // since we skip the D2H/H2D round-trip.
- if learn && self.boost_strength > 0.0 {
- if self.strict_parity {
- let mut duty_host = vec![0f32; n];
- self.dev
- .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
- let sum: f32 = duty_host.iter().sum();
- let mean = sum / (n as f32);
- let mut boost_host = vec![0f32; n];
- for i in 0..n {
- boost_host[i] =
- (-self.boost_strength * (duty_host[i] - mean)).exp();
- }
- self.dev.htod_sync_copy_into(&boost_host, &mut self.boost)?;
-
- // Permanence bump (rare). Only evaluated in strict mode.
- let mut ov_host = vec![0f32; n];
- self.dev
- .dtoh_sync_copy_into(&self.overlap_duty, &mut ov_host)?;
- let max_ov = ov_host.iter().cloned().fold(0f32, f32::max);
- if max_ov > 0.0 {
- let thr = 0.001f32 * max_ov;
- let bump = self.inc * 0.1f32;
- let bump_cols: Vec<u32> = ov_host
- .iter()
- .enumerate()
- .filter_map(|(i, &o)| {
- if o < thr { Some(i as u32) } else { None }
- })
- .collect();
- if !bump_cols.is_empty() {
- let s = self.synapses_per_col;
- let mut perm_host = vec![0f32; n * s];
- self.dev
- .dtoh_sync_copy_into(&self.syn_perm, &mut perm_host)?;
- for &c in &bump_cols {
- let base = (c as usize) * s;
- for p in &mut perm_host[base..base + s] {
- *p = (*p + bump).min(1.0);
- }
- }
- self.dev.htod_sync_copy_into(&perm_host, &mut self.syn_perm)?;
- }
- }
- } else {
- // Fast path: fused mean + boost = expf(-strength*(ad-mean))
- // in a single GPU block. Zero D2H, zero H2D -- fully async.
- let boost_fn = self
- .dev
- .get_func("htm_sp_boost_fused", "sp_boost_from_duty")
- .expect("sp_boost_fused not loaded");
- let boost_cfg = LaunchConfig {
- grid_dim: (1, 1, 1),
- block_dim: (1024, 1, 1),
- shared_mem_bytes: 32 * std::mem::size_of::<f32>() as u32,
- };
- unsafe {
- boost_fn.launch(
- boost_cfg,
- (
- &self.active_duty,
- &mut self.boost,
- self.boost_strength,
- n as u32,
- ),
- )?;
- }
- }
- }
-
- // D2H the active_mask for this step. This is the single
- // unavoidable sync point per step -- CPU TM needs the active
- // indices for its next state update. At 2048 bytes / step this
- // is tiny in bandwidth but costs a full synchronize (~5-10 μs).
- self.dev
- .dtoh_sync_copy_into(&self.active_mask, &mut self.host_mask)?;
- let co = ti * n;
- cols_out[co..co + n].copy_from_slice(&self.host_mask);
- // Extract active indices.
- for (i, &b) in self.host_mask.iter().enumerate() {
- if b != 0 {
- active_indices_host.push(i as u32);
- }
- }
- // Insert separator (u32::MAX) between steps to demarcate step boundaries.
- active_indices_host.push(u32::MAX);
- }
-
- Ok(())
- }
-
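A hedged decoding sketch for the flat index stream this method emits (`u32::MAX` = 0xFFFFFFFF terminates each step's list):

```python
SENTINEL = 0xFFFFFFFF

def split_steps(flat_indices):
    """Group step_batch's flat active-index stream into per-step lists."""
    steps, cur = [], []
    for idx in flat_indices:
        if idx == SENTINEL:
            steps.append(cur)
            cur = []
        else:
            cur.append(idx)
    return steps
```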
- /// Fully-on-GPU batched SP + TM. Zero per-step host sync.
- ///
- /// Inputs:
- /// inputs_flat_dev : (T * input_bits) u8 already uploaded
- /// cols_dev : (T * n_cols) u8 output -- active-column mask per step
- /// anom_dev : (T,) f32 output -- anomaly score per step
- /// tm : persistent GPU TemporalMemory for this region
- #[allow(clippy::too_many_arguments)]
- pub fn step_batch_with_tm(
- &mut self,
- inputs_flat_dev: &CudaSlice<u8>,
- t: usize,
- input_bits: usize,
- learn: bool,
- cols_dev: &mut CudaSlice<u8>,
- anom_dev: &mut CudaSlice<f32>,
- tm: &mut crate::gpu::tm_gpu::TemporalMemoryGpu,
- ) -> Result<(), DriverError> {
- let n = self.n_columns;
- let k = ((self.sparsity * n as f32).round() as usize).max(1);
- debug_assert_eq!(cols_dev.len(), t * n);
- debug_assert_eq!(anom_dev.len(), t);
-
- let overlap_fn = self.dev.get_func("htm_sp_overlap", "sp_overlap").unwrap();
- let topk_fn = self.dev.get_func("htm_sp_topk", "sp_topk_select").unwrap();
- let learn_fn = self.dev.get_func("htm_sp_learn", "sp_learn").unwrap();
- let duty_fn = self.dev.get_func("htm_sp_duty", "sp_duty_update").unwrap();
-
- let overlap_cfg = LaunchConfig {
- grid_dim: (n as u32, 1, 1),
- block_dim: (128, 1, 1),
- shared_mem_bytes: 0,
- };
- let topk_cfg = LaunchConfig {
- grid_dim: (1, 1, 1),
- block_dim: (256, 1, 1),
- shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
- };
- let learn_cfg = overlap_cfg;
- let duty_cfg = LaunchConfig {
- grid_dim: ((n as u32 + 255) / 256, 1, 1),
- block_dim: (256, 1, 1),
- shared_mem_bytes: 0,
- };
- let alpha = 1.0f32 / self.duty_period.max(1.0);
-
- for ti in 0..t {
- let in_off = ti * input_bits;
- let sub = inputs_flat_dev.slice(in_off..in_off + input_bits);
- self.dev.dtod_copy(&sub, &mut self.inp_dev)?;
-
- // 1. sp_overlap
- unsafe {
- overlap_fn.clone().launch(
- overlap_cfg,
- (
- &self.inp_dev,
- &self.syn_bit,
- &self.syn_perm,
- &self.boost,
- self.conn_thr,
- self.synapses_per_col as u32,
- n as u32,
- &mut self.raw,
- &mut self.boosted,
- ),
- )?;
- }
-
- // 2. clear + sp_topk
- self.dev.memset_zeros(&mut self.active_mask)?;
- unsafe {
- topk_fn.clone().launch(
- topk_cfg,
- (&self.boosted, n as u32, k as u32, &mut self.active_mask),
- )?;
- }
-
- // 3. sp_learn
- if learn {
- unsafe {
- learn_fn.clone().launch(
- learn_cfg,
- (
- &self.active_mask,
- &self.inp_dev,
- &self.syn_bit,
- &mut self.syn_perm,
- self.inc,
- self.dec,
- self.synapses_per_col as u32,
- n as u32,
- ),
- )?;
- }
- }
-
- // 4. duty update (stage 1: no-boost write)
- unsafe {
- duty_fn.clone().launch(
- duty_cfg,
- (
- &self.active_mask,
- &self.raw,
- &mut self.active_duty,
- &mut self.overlap_duty,
- &mut self.boost,
- alpha,
- 1.0f32,
- 0.0f32,
- 0.0f32,
- 0u32,
- n as u32,
- ),
- )?;
- }
-
- // 5. Boost update: fused GPU kernel (no D2H).
- if learn && self.boost_strength > 0.0 {
- let boost_fn = self.dev
- .get_func("htm_sp_boost_fused", "sp_boost_from_duty")
- .expect("sp_boost_fused not loaded");
- let boost_cfg = LaunchConfig {
- grid_dim: (1, 1, 1),
- block_dim: (1024, 1, 1),
- shared_mem_bytes: 32 * std::mem::size_of::<f32>() as u32,
- };
- unsafe {
- boost_fn.launch(
- boost_cfg,
- (
- &self.active_duty,
- &mut self.boost,
- self.boost_strength,
- n as u32,
- ),
- )?;
- }
- }
-
- // 6. Copy active_mask slice into cols_dev[ti*n .. (ti+1)*n].
- let mut dst_slice = cols_dev.slice_mut(ti * n..(ti + 1) * n);
- self.dev.dtod_copy(&self.active_mask, &mut dst_slice)?;
-
- // 7. GPU TM step: predict + activate + anomaly + learn, all on device.
- tm.step(&self.active_mask, anom_dev, ti as u32, learn)?;
563
- }
564
-
565
- Ok(())
566
- }
567
-
568
- /// One SP step on the GPU. Returns sorted active-column indices.
569
- pub fn compute(&mut self, input: &[u8], learn: bool) -> Result<Vec<u32>, DriverError> {
570
- debug_assert_eq!(input.len(), self.input_bits);
571
- let n = self.n_columns;
572
- let k = ((self.sparsity * n as f32).round() as usize).max(1);
573
-
574
- // 1. H2D input SDR.
575
- self.dev.htod_sync_copy_into(input, &mut self.inp_dev)?;
576
-
577
- // 2. Launch sp_overlap: grid=n_columns, block=128.
578
- let overlap_fn = self
579
- .dev
580
- .get_func("htm_sp_overlap", "sp_overlap")
581
- .expect("sp_overlap not loaded");
582
- let overlap_cfg = LaunchConfig {
583
- grid_dim: (n as u32, 1, 1),
584
- block_dim: (128, 1, 1),
585
- shared_mem_bytes: 0,
586
- };
587
- unsafe {
588
- overlap_fn.launch(
589
- overlap_cfg,
590
- (
591
- &self.inp_dev,
592
- &self.syn_bit,
593
- &self.syn_perm,
594
- &self.boost,
595
- self.conn_thr,
596
- self.synapses_per_col as u32,
597
- n as u32,
598
- &mut self.raw,
599
- &mut self.boosted,
600
- ),
601
- )?;
602
- }
603
-
604
- // 3. Launch sp_topk: single block, shared mem = n_columns * f32.
605
- let topk_fn = self
606
- .dev
607
- .get_func("htm_sp_topk", "sp_topk_select")
608
- .expect("sp_topk not loaded");
609
- let topk_cfg = LaunchConfig {
610
- grid_dim: (1, 1, 1),
611
- block_dim: (256, 1, 1),
612
- shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
613
- };
614
- // Clear active_mask first. memset_zeros avoids an H2D of a host
615
- // zeroes vector every step.
616
- self.dev.memset_zeros(&mut self.active_mask)?;
617
- unsafe {
618
- topk_fn.launch(
619
- topk_cfg,
620
- (
621
- &self.boosted,
622
- n as u32,
623
- k as u32,
624
- &mut self.active_mask,
625
- ),
626
- )?;
627
- }
628
-
629
- // 4. Optional: sp_learn on active columns.
630
- if learn {
631
- let learn_fn = self
632
- .dev
633
- .get_func("htm_sp_learn", "sp_learn")
634
- .expect("sp_learn not loaded");
635
- let learn_cfg = LaunchConfig {
636
- grid_dim: (n as u32, 1, 1),
637
- block_dim: (128, 1, 1),
638
- shared_mem_bytes: 0,
639
- };
640
- unsafe {
641
- learn_fn.launch(
642
- learn_cfg,
643
- (
644
- &self.active_mask,
645
- &self.inp_dev,
646
- &self.syn_bit,
647
- &mut self.syn_perm,
648
- self.inc,
649
- self.dec,
650
- self.synapses_per_col as u32,
651
- n as u32,
652
- ),
653
- )?;
654
- }
655
- }
656
-
657
- // 5. Duty cycle + boost update. Always runs (matches CPU).
658
- // We need mean_duty on the host β€” compute BEFORE the update (matches
659
- // CPU sp.rs line 200-205 where mean is computed then written).
660
- // Actually CPU computes mean of the PRE-update duty cycles too? Re-read:
661
- // sp.rs lines 186-196 update duty cycles (pre-mean).
662
- // Line 202: mean = sum(active_duty_cycle) / n ← after update.
663
- // Line 204: boost[i] = exp(-strength*(active_duty[i] - mean)).
664
- // So mean is on POST-update values.
665
- // Easiest: 1) run duty update with boost_strength=0 (skip boost calc),
666
- // 2) D2H active_duty, compute mean, 3) run a boost-only kernel
667
- // OR inline the exp() in a second launch with mean passed.
668
- //
669
- // For simplicity and correctness we fuse: run the duty kernel with
670
- // mean=0 and boost_strength=0 (disables boost write), then D2H to
671
- // compute mean, then re-launch with the true mean. Two launches, one
672
- // tiny D2H (n Γ— f32). At n=2048 this is 8KB per step β€” negligible.
673
- let alpha = 1.0f32 / self.duty_period.max(1.0);
674
- let duty_fn = self
675
- .dev
676
- .get_func("htm_sp_duty", "sp_duty_update")
677
- .expect("sp_duty not loaded");
678
- let duty_cfg = LaunchConfig {
679
- grid_dim: ((n as u32 + 255) / 256, 1, 1),
680
- block_dim: (256, 1, 1),
681
- shared_mem_bytes: 0,
682
- };
683
- // Stage 1: update duty cycles (boost_strength=0 -> no write).
684
- unsafe {
685
- duty_fn.launch(
686
- duty_cfg,
687
- (
688
- &self.active_mask,
689
- &self.raw,
690
- &mut self.active_duty,
691
- &mut self.overlap_duty,
692
- &mut self.boost,
693
- alpha,
694
- 1.0f32, // stim_thr
695
- 0.0f32, // boost_strength = 0 -> skip write
696
- 0.0f32, // mean_duty (unused)
697
- 0u32, // learn_flag = 0
698
- n as u32,
699
- ),
700
- )?;
701
- }
702
-
703
- if learn && self.boost_strength > 0.0 && self.strict_parity {
704
- // Boost update must bit-match CPU `f32::exp`, so we compute it on
705
- // the host and copy back. Cost per step: 8KB D2H + 8KB H2D at n=2048.
706
- // Critical for learning parity β€” CUDA expf (even without fast-math)
707
- // uses different rounding for some inputs than host libm.
708
- let mut duty_host = vec![0f32; n];
709
- self.dev
710
- .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
711
- let sum: f32 = duty_host.iter().sum();
712
- let mean = sum / (n as f32);
713
- let mut boost_host = vec![0f32; n];
714
- for i in 0..n {
715
- boost_host[i] = (-self.boost_strength * (duty_host[i] - mean)).exp();
716
- }
717
- self.dev.htod_sync_copy_into(&boost_host, &mut self.boost)?;
718
-
719
- // CPU sp.rs 210-226: permanence bump for chronically under-stimulated
720
- // columns. If overlap_duty_cycle[i] < 0.001 * max(overlap_duty_cycle),
721
- // add inc*0.1 to every synapse of column i (clamped to 1.0).
722
- // This runs only once per step and only for the rare cases, but we
723
- // need it for bit-exact parity with CPU learn.
724
- let mut ov_host = vec![0f32; n];
725
- self.dev
726
- .dtoh_sync_copy_into(&self.overlap_duty, &mut ov_host)?;
727
- let max_ov = ov_host.iter().cloned().fold(0f32, f32::max);
728
- if max_ov > 0.0 {
729
- let thr = 0.001f32 * max_ov;
730
- let bump = self.inc * 0.1f32;
731
- // Find columns needing a bump. Usually empty. Rare β†’ D2H/H2D
732
- // of syn_perm is cheap (n*S*4 = 320KB at n=2048,S=40).
733
- let bump_cols: Vec<u32> = ov_host
734
- .iter()
735
- .enumerate()
736
- .filter_map(|(i, &o)| if o < thr { Some(i as u32) } else { None })
737
- .collect();
738
- if !bump_cols.is_empty() {
739
- // Download, bump, upload. (Keeps implementation simple and
740
- // bit-exact. Could kernelize later.)
741
- let s = self.synapses_per_col;
742
- let mut perm_host = vec![0f32; n * s];
743
- self.dev.dtoh_sync_copy_into(&self.syn_perm, &mut perm_host)?;
744
- for &c in &bump_cols {
745
- let base = (c as usize) * s;
746
- for p in &mut perm_host[base..base + s] {
747
- *p = (*p + bump).min(1.0);
748
- }
749
- }
750
- self.dev.htod_sync_copy_into(&perm_host, &mut self.syn_perm)?;
751
- }
752
- }
753
- } else if learn && self.boost_strength > 0.0 {
754
- // Fast path: GPU-side boost using the already-loaded duty kernel.
755
- let mut duty_host = vec![0f32; n];
756
- self.dev
757
- .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
758
- let sum: f32 = duty_host.iter().sum();
759
- let mean = sum / (n as f32);
760
- let boost_fn = self
761
- .dev
762
- .get_func("htm_sp_duty", "sp_duty_update")
763
- .expect("sp_duty not loaded");
764
- unsafe {
765
- boost_fn.launch(
766
- duty_cfg,
767
- (
768
- &self.active_mask,
769
- &self.raw,
770
- &mut self.active_duty,
771
- &mut self.overlap_duty,
772
- &mut self.boost,
773
- 0.0f32,
774
- 1.0f32,
775
- self.boost_strength,
776
- mean,
777
- 1u32,
778
- n as u32,
779
- ),
780
- )?;
781
- }
782
- }
783
-
784
- // 6. D2H active_mask and convert to sorted index list.
785
- self.dev
786
- .dtoh_sync_copy_into(&self.active_mask, &mut self.host_mask)?;
787
- let mut active: Vec<u32> = Vec::with_capacity(k);
788
- for (i, &b) in self.host_mask.iter().enumerate() {
789
- if b != 0 {
790
- active.push(i as u32);
791
- }
792
- }
793
- debug_assert_eq!(active.len(), k, "SP must emit exactly k winners");
794
- Ok(active)
795
- }
796
- }
 
1
+ //! GPU implementation of the Spatial Pooler.
2
+ //!
3
+ //! One `SpatialPoolerGpu` owns a set of persistent device buffers + 4 PTX
4
+ //! kernels. `compute(input, learn)` performs one SP step and returns the
5
+ //! sorted active-column indices (host `Vec<u32>`); this is what the CPU
+ //! TemporalMemory consumes.
+ //!
+ //! Persistent state on device (per region):
+ //! syn_bit : u32 [n_columns × S] (constant after init)
+ //! syn_perm : f32 [n_columns × S] (updated by sp_learn)
11
+ //! boost : f32 [n_columns]
12
+ //! active_duty : f32 [n_columns]
13
+ //! overlap_duty: f32 [n_columns]
14
+ //!
15
+ //! Per-step transient state:
16
+ //! inp_dev : u8 [input_bits] (H2D copy each step)
17
+ //! raw : u32 [n_columns]
18
+ //! boosted : f32 [n_columns]
19
+ //! active_mask : u8 [n_columns] (topk output, D2H at the end)
20
+
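+ // Editorial sketch (not in the original source): a back-of-envelope device
+ // footprint for the buffer inventory above. Field sizes follow the doc
+ // comment; the helper name is illustrative and unused.
+ #[allow(dead_code)]
+ fn sp_region_device_bytes(n_columns: usize, s: usize, input_bits: usize) -> usize {
+ let per_synapse = 4 + 4; // syn_bit u32 + syn_perm f32
+ let per_column = 3 * 4 + 4 + 4 + 1; // boost/active_duty/overlap_duty f32, raw u32, boosted f32, active_mask u8
+ n_columns * s * per_synapse + n_columns * per_column + input_bits // inp_dev u8
+ }
+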
21
+ use std::sync::Arc;
22
+
23
+ use cudarc::driver::{CudaDevice, CudaSlice, DeviceSlice, DriverError, LaunchAsync, LaunchConfig};
24
+ use cudarc::nvrtc::Ptx;
25
+
26
+ use crate::sp::SpatialPooler;
27
+
28
+ // Embed PTX at compile time. HTM_GPU_PTX_DIR is exported by build.rs.
29
+ const PTX_SP_OVERLAP: &str =
30
+ include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_overlap.ptx"));
31
+ const PTX_SP_TOPK: &str =
32
+ include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_topk.ptx"));
33
+ const PTX_SP_LEARN: &str =
34
+ include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_learn.ptx"));
35
+ const PTX_SP_DUTY: &str =
36
+ include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_duty.ptx"));
37
+ const PTX_SP_BOOST_FUSED: &str =
38
+ include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_boost_fused.ptx"));
39
+
40
+ pub struct SpatialPoolerGpu {
41
+ dev: Arc<CudaDevice>,
42
+
43
+ // Config mirror (we don't touch CPU SpatialPooler after init).
44
+ input_bits: usize,
45
+ n_columns: usize,
46
+ synapses_per_col: usize,
47
+ conn_thr: f32,
48
+ inc: f32,
49
+ dec: f32,
50
+ sparsity: f32,
51
+ duty_period: f32,
52
+ boost_strength: f32,
53
+
54
+ // Persistent device state.
55
+ syn_bit: CudaSlice<u32>,
56
+ syn_perm: CudaSlice<f32>,
57
+ boost: CudaSlice<f32>,
58
+ active_duty: CudaSlice<f32>,
59
+ overlap_duty: CudaSlice<f32>,
60
+
61
+ // Transient scratch (reused each step).
62
+ inp_dev: CudaSlice<u8>,
63
+ raw: CudaSlice<u32>,
64
+ boosted: CudaSlice<f32>,
65
+ active_mask: CudaSlice<u8>,
66
+
67
+ // Reusable host buffer for D2H of active_mask.
68
+ host_mask: Vec<u8>,
69
+
70
+ /// Strict bit-parity with CPU reference. Enabled for tests.
71
+ /// Forces host-side boost/exp computation and the overlap-duty bump check
72
+ /// every step. Default false for max throughput.
73
+ strict_parity: bool,
74
+ }
75
+
76
+ impl SpatialPoolerGpu {
77
+ /// Copy CPU SpatialPooler state onto the device. This preserves the
78
+ /// exact seeded proximal synapse layout + initial permanences, so the
79
+ /// GPU SP is a bit-identical parallel implementation of the CPU SP.
80
+ pub fn from_cpu(cpu: &SpatialPooler) -> Result<Self, DriverError> {
81
+ let dev = CudaDevice::new(0)?;
82
+ let cfg = &cpu.cfg;
83
+ let n = cfg.n_columns;
84
+ let s = cfg.potential_synapses;
85
+
86
+ // Flatten proximal dendrites into column-major arrays.
87
+ let mut syn_bit_h: Vec<u32> = Vec::with_capacity(n * s);
88
+ let mut syn_perm_h: Vec<f32> = Vec::with_capacity(n * s);
89
+ for col in &cpu.columns {
90
+ debug_assert_eq!(col.inputs.len(), s);
91
+ debug_assert_eq!(col.perms.len(), s);
92
+ syn_bit_h.extend_from_slice(&col.inputs);
93
+ syn_perm_h.extend_from_slice(&col.perms);
94
+ }
95
+
96
+ let syn_bit = dev.htod_sync_copy(&syn_bit_h)?;
97
+ let syn_perm = dev.htod_sync_copy(&syn_perm_h)?;
98
+ let boost = dev.htod_sync_copy(&cpu.boost)?;
99
+ let active_duty = dev.htod_sync_copy(&cpu.active_duty_cycle)?;
100
+ let overlap_duty = dev.htod_sync_copy(&cpu.overlap_duty_cycle)?;
101
+
102
+ let inp_dev: CudaSlice<u8> = dev.alloc_zeros(cfg.input_bits)?;
103
+ let raw: CudaSlice<u32> = dev.alloc_zeros(n)?;
104
+ let boosted: CudaSlice<f32> = dev.alloc_zeros(n)?;
105
+ let active_mask: CudaSlice<u8> = dev.alloc_zeros(n)?;
106
+
107
+ // Load PTX modules. Each .ptx is a module containing one `extern "C"`
+ // function, addressed by the (module, func) pair. CudaDevice::load_ptx
+ // registers the module name globally on the device, so a deterministic
+ // naming scheme keeps multiple SP instances from colliding.
112
+ let modules = [
113
+ ("htm_sp_overlap", PTX_SP_OVERLAP, "sp_overlap"),
114
+ ("htm_sp_topk", PTX_SP_TOPK, "sp_topk_select"),
115
+ ("htm_sp_learn", PTX_SP_LEARN, "sp_learn"),
116
+ ("htm_sp_duty", PTX_SP_DUTY, "sp_duty_update"),
117
+ ("htm_sp_boost_fused", PTX_SP_BOOST_FUSED, "sp_boost_from_duty"),
118
+ ];
119
+ for (modname, ptx, fnname) in modules {
120
+ // load_ptx is NOT idempotent; calling twice errors. For multi-region
121
+ // support we check-then-load.
122
+ if dev.get_func(modname, fnname).is_none() {
123
+ dev.load_ptx(Ptx::from_src(ptx), modname, &[fnname])?;
124
+ }
125
+ }
126
+
127
+ Ok(Self {
128
+ dev,
129
+ input_bits: cfg.input_bits,
130
+ n_columns: n,
131
+ synapses_per_col: s,
132
+ conn_thr: cfg.connected_threshold,
133
+ inc: cfg.syn_perm_active_inc,
134
+ dec: cfg.syn_perm_inactive_dec,
135
+ sparsity: cfg.sparsity,
136
+ duty_period: cfg.duty_cycle_period,
137
+ boost_strength: cfg.boost_strength,
138
+ syn_bit,
139
+ syn_perm,
140
+ boost,
141
+ active_duty,
142
+ overlap_duty,
143
+ inp_dev,
144
+ raw,
145
+ boosted,
146
+ active_mask,
147
+ host_mask: vec![0u8; n],
148
+ strict_parity: false,
149
+ })
150
+ }
151
+
152
+ /// Enable strict bit-parity mode. Parity tests use this.
153
+ pub fn set_strict_parity(&mut self, strict: bool) {
154
+ self.strict_parity = strict;
155
+ }
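+
+ // Editorial usage sketch (hypothetical harness; assumes the CPU
+ // `SpatialPooler` exposes a matching `compute` returning Vec<u32>):
+ //     let mut gpu = SpatialPoolerGpu::from_cpu(&cpu_sp)?;
+ //     gpu.set_strict_parity(true);
+ //     for sdr in &inputs {
+ //         assert_eq!(gpu.compute(sdr, true)?, cpu_sp.compute(sdr, true));
+ //     }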
156
+
157
+ /// Access to the underlying CudaDevice for host-side orchestration.
158
+ pub fn dev_ref(&self) -> &Arc<CudaDevice> {
159
+ &self.dev
160
+ }
161
+
162
+ // --- Fused-path accessors (immutable state reads + pointer-grabs). ---
163
+ pub fn n_columns_accessor(&self) -> usize { self.n_columns }
164
+ #[allow(dead_code)]
165
+ pub fn input_bits_accessor(&self) -> usize { self.input_bits }
166
+ pub fn synapses_per_col_accessor(&self) -> usize { self.synapses_per_col }
167
+ pub fn conn_thr_accessor(&self) -> f32 { self.conn_thr }
168
+ pub fn inc_accessor(&self) -> f32 { self.inc }
169
+ pub fn dec_accessor(&self) -> f32 { self.dec }
170
+ pub fn sparsity_accessor(&self) -> f32 { self.sparsity }
171
+ pub fn duty_period_accessor(&self) -> f32 { self.duty_period }
172
+ #[allow(dead_code)]
173
+ pub fn boost_strength_accessor(&self) -> f32 { self.boost_strength }
174
+
175
+ pub fn syn_bit_accessor(&self) -> &CudaSlice<u32> { &self.syn_bit }
176
+ pub fn syn_perm_accessor(&self) -> &CudaSlice<f32> { &self.syn_perm }
177
+ pub fn boost_accessor(&self) -> &CudaSlice<f32> { &self.boost }
178
+ pub fn active_duty_accessor(&self) -> &CudaSlice<f32> { &self.active_duty }
179
+
180
+ /// Estimate an initial overlap threshold near the sparsity-target
+ /// percentile of raw overlaps (the 98th percentile at 2% sparsity),
+ /// after a short warmup pass. Used to seed `inhibition_threshold` such
+ /// that the activation rate starts near the sparsity target.
+ /// Placeholder (returns a conservative constant); the real warmup pass
+ /// happens on the Rust orchestrator side.
+ pub fn initial_threshold_estimate(&self) -> f32 {
+ // With conn_thr=0.5, init_perm around 0.5±0.1, S=40, sparse SDR at 2%:
+ // expected overlap ~ 40 * 0.02 = 0.8 connected hits → boosted ~ 0.8.
+ // Top-K selects the top 2%, so the cut-off sits roughly at the
+ // 98th percentile of boosted overlaps. Conservative start: 2.0.
+ // Per-column adaptation will quickly steer each column's threshold.
+ 2.0f32
192
+ }
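+
+ // Editorial worked example of the estimate above: with S = 40 synapses
+ // per column and a 2%-dense input SDR,
+ //     expected_hits = 40.0 * 0.02 = 0.8
+ // so a start threshold of 2.0 deliberately over-shoots and lets the
+ // per-column adaptation pull it down.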
193
+
194
+ /// Batched multi-step SP on the GPU. Processes T timesteps from a
+ /// pre-uploaded device input buffer. Writes a `(T, n_cols)` u8
+ /// active-column mask to `cols_out` and appends each step's active
+ /// column indices to `active_indices_host`, terminating every step's
+ /// run with a `u32::MAX` separator.
+ ///
+ /// Each step runs the same kernel pipeline as `compute`, but replaces
+ /// the per-step boost/duty D2H→exp→H2D round-trip with the fused GPU
+ /// boost kernel (strict_parity mode still takes the host path).
+ ///
+ /// This is the fast path used by `HTMRegionGpu.step_many_gpu`.
204
+ #[allow(clippy::too_many_arguments)]
205
+ pub fn step_batch(
206
+ &mut self,
207
+ inputs_flat_dev: &CudaSlice<u8>,
208
+ t: usize,
209
+ input_bits: usize,
210
+ learn: bool,
211
+ cols_out: &mut [u8],
212
+ active_indices_host: &mut Vec<u32>,
213
+ ) -> Result<(), DriverError> {
214
+ let n = self.n_columns;
215
+ let k = ((self.sparsity * n as f32).round() as usize).max(1);
216
+ debug_assert_eq!(cols_out.len(), t * n);
217
+
218
+ let overlap_fn = self.dev.get_func("htm_sp_overlap", "sp_overlap").unwrap();
219
+ let topk_fn = self.dev.get_func("htm_sp_topk", "sp_topk_select").unwrap();
220
+ let learn_fn = self.dev.get_func("htm_sp_learn", "sp_learn").unwrap();
221
+ let duty_fn = self.dev.get_func("htm_sp_duty", "sp_duty_update").unwrap();
222
+
223
+ let overlap_cfg = LaunchConfig {
224
+ grid_dim: (n as u32, 1, 1),
225
+ block_dim: (128, 1, 1),
226
+ shared_mem_bytes: 0,
227
+ };
228
+ let topk_cfg = LaunchConfig {
229
+ grid_dim: (1, 1, 1),
230
+ block_dim: (256, 1, 1),
231
+ shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
232
+ };
233
+ let learn_cfg = overlap_cfg;
234
+ let duty_cfg = LaunchConfig {
235
+ grid_dim: ((n as u32 + 255) / 256, 1, 1),
236
+ block_dim: (256, 1, 1),
237
+ shared_mem_bytes: 0,
238
+ };
239
+ let alpha = 1.0f32 / self.duty_period.max(1.0);
240
+
241
+ // Reusable host buffer for the per-step active_mask D2H.
242
+ self.host_mask.resize(n, 0);
243
+
244
+ active_indices_host.clear();
245
+
246
+ for ti in 0..t {
247
+ // Point overlap kernel at the ti-th slice of the pre-uploaded input.
248
+ // cudarc CudaSlice doesn't have a "view" per se, so we must copy the
249
+ // slice into the reusable inp_dev buffer. This is a D2D copy, much
250
+ // faster than H2D.
251
+ // (Alternative: rewrite kernel to accept an offset; deferred.)
252
+ let in_off = ti * input_bits;
253
+ // Use dtod_copy via raw slice indexing: cudarc exposes slice() for this.
254
+ let sub = inputs_flat_dev.slice(in_off..in_off + input_bits);
255
+ self.dev.dtod_copy(&sub, &mut self.inp_dev)?;
256
+
257
+ // 1. sp_overlap
258
+ unsafe {
259
+ overlap_fn.clone().launch(
260
+ overlap_cfg,
261
+ (
262
+ &self.inp_dev,
263
+ &self.syn_bit,
264
+ &self.syn_perm,
265
+ &self.boost,
266
+ self.conn_thr,
267
+ self.synapses_per_col as u32,
268
+ n as u32,
269
+ &mut self.raw,
270
+ &mut self.boosted,
271
+ ),
272
+ )?;
273
+ }
274
+
275
+ // 2. Clear active_mask, then sp_topk
276
+ self.dev.memset_zeros(&mut self.active_mask)?;
277
+ unsafe {
278
+ topk_fn.clone().launch(
279
+ topk_cfg,
280
+ (&self.boosted, n as u32, k as u32, &mut self.active_mask),
281
+ )?;
282
+ }
283
+
284
+ // 3. sp_learn
285
+ if learn {
286
+ unsafe {
287
+ learn_fn.clone().launch(
288
+ learn_cfg,
289
+ (
290
+ &self.active_mask,
291
+ &self.inp_dev,
292
+ &self.syn_bit,
293
+ &mut self.syn_perm,
294
+ self.inc,
295
+ self.dec,
296
+ self.synapses_per_col as u32,
297
+ n as u32,
298
+ ),
299
+ )?;
300
+ }
301
+ }
302
+
303
+ // 4. duty update (device)
304
+ unsafe {
305
+ duty_fn.clone().launch(
306
+ duty_cfg,
307
+ (
308
+ &self.active_mask,
309
+ &self.raw,
310
+ &mut self.active_duty,
311
+ &mut self.overlap_duty,
312
+ &mut self.boost,
313
+ alpha,
314
+ 1.0f32,
315
+ 0.0f32,
316
+ 0.0f32,
317
+ 0u32,
318
+ n as u32,
319
+ ),
320
+ )?;
321
+ }
322
+
323
+ // 5. Boost update. Two modes:
324
+ // * strict_parity (tests): host-side exp for bit-exact match.
325
+ // * default (production): GPU expf is close enough and ~10x faster
326
+ // since we skip the D2H/H2D round-trip.
327
+ if learn && self.boost_strength > 0.0 {
328
+ if self.strict_parity {
329
+ let mut duty_host = vec![0f32; n];
330
+ self.dev
331
+ .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
332
+ let sum: f32 = duty_host.iter().sum();
333
+ let mean = sum / (n as f32);
334
+ let mut boost_host = vec![0f32; n];
335
+ for i in 0..n {
336
+ boost_host[i] =
337
+ (-self.boost_strength * (duty_host[i] - mean)).exp();
338
+ }
339
+ self.dev.htod_sync_copy_into(&boost_host, &mut self.boost)?;
340
+
341
+ // Permanence bump (rare). Only evaluated in strict mode.
342
+ let mut ov_host = vec![0f32; n];
343
+ self.dev
344
+ .dtoh_sync_copy_into(&self.overlap_duty, &mut ov_host)?;
345
+ let max_ov = ov_host.iter().cloned().fold(0f32, f32::max);
346
+ if max_ov > 0.0 {
347
+ let thr = 0.001f32 * max_ov;
348
+ let bump = self.inc * 0.1f32;
349
+ let bump_cols: Vec<u32> = ov_host
350
+ .iter()
351
+ .enumerate()
352
+ .filter_map(|(i, &o)| {
353
+ if o < thr { Some(i as u32) } else { None }
354
+ })
355
+ .collect();
356
+ if !bump_cols.is_empty() {
357
+ let s = self.synapses_per_col;
358
+ let mut perm_host = vec![0f32; n * s];
359
+ self.dev
360
+ .dtoh_sync_copy_into(&self.syn_perm, &mut perm_host)?;
361
+ for &c in &bump_cols {
362
+ let base = (c as usize) * s;
363
+ for p in &mut perm_host[base..base + s] {
364
+ *p = (*p + bump).min(1.0);
365
+ }
366
+ }
367
+ self.dev.htod_sync_copy_into(&perm_host, &mut self.syn_perm)?;
368
+ }
369
+ }
370
+ } else {
371
+ // Fast path: fused mean + boost = expf(-strength*(ad-mean))
372
+ // in a single GPU block. Zero D2H, zero H2D; fully async.
373
+ let boost_fn = self
374
+ .dev
375
+ .get_func("htm_sp_boost_fused", "sp_boost_from_duty")
376
+ .expect("sp_boost_fused not loaded");
377
+ let boost_cfg = LaunchConfig {
378
+ grid_dim: (1, 1, 1),
379
+ block_dim: (1024, 1, 1),
380
+ shared_mem_bytes: 32 * std::mem::size_of::<f32>() as u32,
381
+ };
382
+ unsafe {
383
+ boost_fn.launch(
384
+ boost_cfg,
385
+ (
386
+ &self.active_duty,
387
+ &mut self.boost,
388
+ self.boost_strength,
389
+ n as u32,
390
+ ),
391
+ )?;
392
+ }
393
+ }
394
+ }
395
+
396
+ // D2H the active_mask for this step. This is the single
+ // unavoidable sync point per step: the CPU TM needs the active
+ // indices for its next state update. At 2048 bytes / step this
+ // is tiny in bandwidth but costs a full synchronize (~5-10 µs).
400
+ self.dev
401
+ .dtoh_sync_copy_into(&self.active_mask, &mut self.host_mask)?;
402
+ let co = ti * n;
403
+ cols_out[co..co + n].copy_from_slice(&self.host_mask);
404
+ // Extract active indices.
405
+ for (i, &b) in self.host_mask.iter().enumerate() {
406
+ if b != 0 {
407
+ active_indices_host.push(i as u32);
408
+ }
409
+ }
410
+ // Insert separator (u32::MAX) between steps to demarcate step boundaries.
411
+ active_indices_host.push(u32::MAX);
412
+ }
413
+
414
+ Ok(())
415
+ }
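+
+ // Editorial usage sketch for `step_batch` (names illustrative; assumes
+ // `inputs_dev` was uploaded once with htod_sync_copy):
+ //     let mut cols = vec![0u8; t * sp.n_columns_accessor()];
+ //     let mut idx: Vec<u32> = Vec::new();
+ //     sp.step_batch(&inputs_dev, t, input_bits, true, &mut cols, &mut idx)?;
+ //     // Recover per-step index runs from the u32::MAX separators:
+ //     let per_step: Vec<&[u32]> = idx.split(|&v| v == u32::MAX)
+ //         .take(t) // split also yields a trailing empty slice
+ //         .collect();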
416
+
417
+ /// Fully-on-GPU batched SP + TM. Zero per-step host sync.
418
+ ///
419
+ /// Inputs:
420
+ /// inputs_flat_dev : (T * input_bits) u8 already uploaded
421
+ /// cols_dev : (T * n_cols) u8 output β€” active-column mask per step
422
+ /// anom_dev : (T,) f32 output β€” anomaly score per step
423
+ /// tm : persistent GPU TemporalMemory for this region
424
+ #[allow(clippy::too_many_arguments)]
425
+ pub fn step_batch_with_tm(
426
+ &mut self,
427
+ inputs_flat_dev: &CudaSlice<u8>,
428
+ t: usize,
429
+ input_bits: usize,
430
+ learn: bool,
431
+ cols_dev: &mut CudaSlice<u8>,
432
+ anom_dev: &mut CudaSlice<f32>,
433
+ tm: &mut crate::gpu::tm_gpu::TemporalMemoryGpu,
434
+ ) -> Result<(), DriverError> {
435
+ let n = self.n_columns;
436
+ let k = ((self.sparsity * n as f32).round() as usize).max(1);
437
+ debug_assert_eq!(cols_dev.len(), t * n);
438
+ debug_assert_eq!(anom_dev.len(), t);
439
+
440
+ let overlap_fn = self.dev.get_func("htm_sp_overlap", "sp_overlap").unwrap();
441
+ let topk_fn = self.dev.get_func("htm_sp_topk", "sp_topk_select").unwrap();
442
+ let learn_fn = self.dev.get_func("htm_sp_learn", "sp_learn").unwrap();
443
+ let duty_fn = self.dev.get_func("htm_sp_duty", "sp_duty_update").unwrap();
444
+
445
+ let overlap_cfg = LaunchConfig {
446
+ grid_dim: (n as u32, 1, 1),
447
+ block_dim: (128, 1, 1),
448
+ shared_mem_bytes: 0,
449
+ };
450
+ let topk_cfg = LaunchConfig {
451
+ grid_dim: (1, 1, 1),
452
+ block_dim: (256, 1, 1),
453
+ shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
454
+ };
455
+ let learn_cfg = overlap_cfg;
456
+ let duty_cfg = LaunchConfig {
457
+ grid_dim: ((n as u32 + 255) / 256, 1, 1),
458
+ block_dim: (256, 1, 1),
459
+ shared_mem_bytes: 0,
460
+ };
461
+ let alpha = 1.0f32 / self.duty_period.max(1.0);
462
+
463
+ for ti in 0..t {
464
+ let in_off = ti * input_bits;
465
+ let sub = inputs_flat_dev.slice(in_off..in_off + input_bits);
466
+ self.dev.dtod_copy(&sub, &mut self.inp_dev)?;
467
+
468
+ // 1. sp_overlap
469
+ unsafe {
470
+ overlap_fn.clone().launch(
471
+ overlap_cfg,
472
+ (
473
+ &self.inp_dev,
474
+ &self.syn_bit,
475
+ &self.syn_perm,
476
+ &self.boost,
477
+ self.conn_thr,
478
+ self.synapses_per_col as u32,
479
+ n as u32,
480
+ &mut self.raw,
481
+ &mut self.boosted,
482
+ ),
483
+ )?;
484
+ }
485
+
486
+ // 2. clear + sp_topk
487
+ self.dev.memset_zeros(&mut self.active_mask)?;
488
+ unsafe {
489
+ topk_fn.clone().launch(
490
+ topk_cfg,
491
+ (&self.boosted, n as u32, k as u32, &mut self.active_mask),
492
+ )?;
493
+ }
494
+
495
+ // 3. sp_learn
496
+ if learn {
497
+ unsafe {
498
+ learn_fn.clone().launch(
499
+ learn_cfg,
500
+ (
501
+ &self.active_mask,
502
+ &self.inp_dev,
503
+ &self.syn_bit,
504
+ &mut self.syn_perm,
505
+ self.inc,
506
+ self.dec,
507
+ self.synapses_per_col as u32,
508
+ n as u32,
509
+ ),
510
+ )?;
511
+ }
512
+ }
513
+
514
+ // 4. duty update (stage 1: no-boost write)
515
+ unsafe {
516
+ duty_fn.clone().launch(
517
+ duty_cfg,
518
+ (
519
+ &self.active_mask,
520
+ &self.raw,
521
+ &mut self.active_duty,
522
+ &mut self.overlap_duty,
523
+ &mut self.boost,
524
+ alpha,
525
+ 1.0f32,
526
+ 0.0f32,
527
+ 0.0f32,
528
+ 0u32,
529
+ n as u32,
530
+ ),
531
+ )?;
532
+ }
533
+
534
+ // 5. Boost update: fused GPU kernel (no D2H).
535
+ if learn && self.boost_strength > 0.0 {
536
+ let boost_fn = self.dev
537
+ .get_func("htm_sp_boost_fused", "sp_boost_from_duty")
538
+ .expect("sp_boost_fused not loaded");
539
+ let boost_cfg = LaunchConfig {
540
+ grid_dim: (1, 1, 1),
541
+ block_dim: (1024, 1, 1),
542
+ shared_mem_bytes: 32 * std::mem::size_of::<f32>() as u32,
543
+ };
544
+ unsafe {
545
+ boost_fn.launch(
546
+ boost_cfg,
547
+ (
548
+ &self.active_duty,
549
+ &mut self.boost,
550
+ self.boost_strength,
551
+ n as u32,
552
+ ),
553
+ )?;
554
+ }
555
+ }
556
+
557
+ // 6. Copy active_mask slice into cols_dev[ti*n .. (ti+1)*n].
558
+ let mut dst_slice = cols_dev.slice_mut(ti * n..(ti + 1) * n);
559
+ self.dev.dtod_copy(&self.active_mask, &mut dst_slice)?;
560
+
561
+ // 7. GPU TM step: predict + activate + anomaly + learn, all on device.
562
+ tm.step(&self.active_mask, anom_dev, ti as u32, learn)?;
563
+ }
564
+
565
+ Ok(())
566
+ }
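+
+ // Editorial usage sketch for the fully-device path (names illustrative):
+ //     let dev = sp.dev_ref().clone();
+ //     let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols)?;
+ //     let mut anom_dev = dev.alloc_zeros::<f32>(t)?;
+ //     sp.step_batch_with_tm(&inputs_dev, t, input_bits, true,
+ //         &mut cols_dev, &mut anom_dev, &mut tm)?;
+ //     let anomalies = dev.dtoh_sync_copy(&anom_dev)?; // one D2H per batch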
567
+
568
+ /// One SP step on the GPU. Returns sorted active-column indices.
569
+ pub fn compute(&mut self, input: &[u8], learn: bool) -> Result<Vec<u32>, DriverError> {
570
+ debug_assert_eq!(input.len(), self.input_bits);
571
+ let n = self.n_columns;
572
+ let k = ((self.sparsity * n as f32).round() as usize).max(1);
573
+
574
+ // 1. H2D input SDR.
575
+ self.dev.htod_sync_copy_into(input, &mut self.inp_dev)?;
576
+
577
+ // 2. Launch sp_overlap: grid=n_columns, block=128.
578
+ let overlap_fn = self
579
+ .dev
580
+ .get_func("htm_sp_overlap", "sp_overlap")
581
+ .expect("sp_overlap not loaded");
582
+ let overlap_cfg = LaunchConfig {
583
+ grid_dim: (n as u32, 1, 1),
584
+ block_dim: (128, 1, 1),
585
+ shared_mem_bytes: 0,
586
+ };
587
+ unsafe {
588
+ overlap_fn.launch(
589
+ overlap_cfg,
590
+ (
591
+ &self.inp_dev,
592
+ &self.syn_bit,
593
+ &self.syn_perm,
594
+ &self.boost,
595
+ self.conn_thr,
596
+ self.synapses_per_col as u32,
597
+ n as u32,
598
+ &mut self.raw,
599
+ &mut self.boosted,
600
+ ),
601
+ )?;
602
+ }
603
+
604
+ // 3. Launch sp_topk: single block, shared mem = n_columns * f32.
605
+ let topk_fn = self
606
+ .dev
607
+ .get_func("htm_sp_topk", "sp_topk_select")
608
+ .expect("sp_topk not loaded");
609
+ let topk_cfg = LaunchConfig {
610
+ grid_dim: (1, 1, 1),
611
+ block_dim: (256, 1, 1),
612
+ shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
613
+ };
614
+ // Clear active_mask first. memset_zeros avoids an H2D of a host
615
+ // zeroes vector every step.
616
+ self.dev.memset_zeros(&mut self.active_mask)?;
617
+ unsafe {
618
+ topk_fn.launch(
619
+ topk_cfg,
620
+ (
621
+ &self.boosted,
622
+ n as u32,
623
+ k as u32,
624
+ &mut self.active_mask,
625
+ ),
626
+ )?;
627
+ }
628
+
629
+ // 4. Optional: sp_learn on active columns.
630
+ if learn {
631
+ let learn_fn = self
632
+ .dev
633
+ .get_func("htm_sp_learn", "sp_learn")
634
+ .expect("sp_learn not loaded");
635
+ let learn_cfg = LaunchConfig {
636
+ grid_dim: (n as u32, 1, 1),
637
+ block_dim: (128, 1, 1),
638
+ shared_mem_bytes: 0,
639
+ };
640
+ unsafe {
641
+ learn_fn.launch(
642
+ learn_cfg,
643
+ (
644
+ &self.active_mask,
645
+ &self.inp_dev,
646
+ &self.syn_bit,
647
+ &mut self.syn_perm,
648
+ self.inc,
649
+ self.dec,
650
+ self.synapses_per_col as u32,
651
+ n as u32,
652
+ ),
653
+ )?;
654
+ }
655
+ }
656
+
657
+ // 5. Duty cycle + boost update. Always runs (matches CPU).
+ // Ordering matters. The CPU reference (sp.rs lines 186-196) updates the
+ // duty cycles first, then computes
+ //   mean = sum(active_duty_cycle) / n                  (line 202, POST-update)
+ //   boost[i] = exp(-strength*(active_duty[i] - mean))  (line 204)
+ // so the mean must be taken over the post-update duty cycles.
+ //
+ // We therefore split the work: run the duty kernel with mean=0 and
+ // boost_strength=0 (duty update only, boost write disabled), D2H
+ // active_duty to compute the mean on the host, then re-launch with the
+ // true mean. Two launches, one tiny D2H (n × f32). At n=2048 this is
+ // 8KB per step; negligible.
673
+ let alpha = 1.0f32 / self.duty_period.max(1.0);
674
+ let duty_fn = self
675
+ .dev
676
+ .get_func("htm_sp_duty", "sp_duty_update")
677
+ .expect("sp_duty not loaded");
678
+ let duty_cfg = LaunchConfig {
679
+ grid_dim: ((n as u32 + 255) / 256, 1, 1),
680
+ block_dim: (256, 1, 1),
681
+ shared_mem_bytes: 0,
682
+ };
683
+ // Stage 1: update duty cycles (boost_strength=0 -> no write).
684
+ unsafe {
685
+ duty_fn.launch(
686
+ duty_cfg,
687
+ (
688
+ &self.active_mask,
689
+ &self.raw,
690
+ &mut self.active_duty,
691
+ &mut self.overlap_duty,
692
+ &mut self.boost,
693
+ alpha,
694
+ 1.0f32, // stim_thr
695
+ 0.0f32, // boost_strength = 0 -> skip write
696
+ 0.0f32, // mean_duty (unused)
697
+ 0u32, // learn_flag = 0
698
+ n as u32,
699
+ ),
700
+ )?;
701
+ }
702
+
703
+ if learn && self.boost_strength > 0.0 && self.strict_parity {
704
+ // Boost update must bit-match CPU `f32::exp`, so we compute it on
705
+ // the host and copy back. Cost per step: 8KB D2H + 8KB H2D at n=2048.
706
+ // Critical for learning parity: CUDA expf (even without fast-math)
707
+ // uses different rounding for some inputs than host libm.
708
+ let mut duty_host = vec![0f32; n];
709
+ self.dev
710
+ .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
711
+ let sum: f32 = duty_host.iter().sum();
712
+ let mean = sum / (n as f32);
713
+ let mut boost_host = vec![0f32; n];
714
+ for i in 0..n {
715
+ boost_host[i] = (-self.boost_strength * (duty_host[i] - mean)).exp();
716
+ }
717
+ self.dev.htod_sync_copy_into(&boost_host, &mut self.boost)?;
718
+
719
+ // CPU sp.rs 210-226: permanence bump for chronically under-stimulated
720
+ // columns. If overlap_duty_cycle[i] < 0.001 * max(overlap_duty_cycle),
721
+ // add inc*0.1 to every synapse of column i (clamped to 1.0).
722
+ // This runs only once per step and only for the rare cases, but we
723
+ // need it for bit-exact parity with CPU learn.
724
+ let mut ov_host = vec![0f32; n];
725
+ self.dev
726
+ .dtoh_sync_copy_into(&self.overlap_duty, &mut ov_host)?;
727
+ let max_ov = ov_host.iter().cloned().fold(0f32, f32::max);
728
+ if max_ov > 0.0 {
729
+ let thr = 0.001f32 * max_ov;
730
+ let bump = self.inc * 0.1f32;
731
+ // Find columns needing a bump. Usually empty. Rare → D2H/H2D
732
+ // of syn_perm is cheap (n*S*4 = 320KB at n=2048,S=40).
733
+ let bump_cols: Vec<u32> = ov_host
734
+ .iter()
735
+ .enumerate()
736
+ .filter_map(|(i, &o)| if o < thr { Some(i as u32) } else { None })
737
+ .collect();
738
+ if !bump_cols.is_empty() {
739
+ // Download, bump, upload. (Keeps implementation simple and
740
+ // bit-exact. Could kernelize later.)
741
+ let s = self.synapses_per_col;
742
+ let mut perm_host = vec![0f32; n * s];
743
+ self.dev.dtoh_sync_copy_into(&self.syn_perm, &mut perm_host)?;
744
+ for &c in &bump_cols {
745
+ let base = (c as usize) * s;
746
+ for p in &mut perm_host[base..base + s] {
747
+ *p = (*p + bump).min(1.0);
748
+ }
749
+ }
750
+ self.dev.htod_sync_copy_into(&perm_host, &mut self.syn_perm)?;
751
+ }
752
+ }
753
+ } else if learn && self.boost_strength > 0.0 {
754
+ // Fast path: GPU-side boost using the already-loaded duty kernel.
755
+ let mut duty_host = vec![0f32; n];
756
+ self.dev
757
+ .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
758
+ let sum: f32 = duty_host.iter().sum();
759
+ let mean = sum / (n as f32);
760
+ let boost_fn = self
761
+ .dev
762
+ .get_func("htm_sp_duty", "sp_duty_update")
763
+ .expect("sp_duty not loaded");
764
+ unsafe {
765
+ boost_fn.launch(
766
+ duty_cfg,
767
+ (
768
+ &self.active_mask,
769
+ &self.raw,
770
+ &mut self.active_duty,
771
+ &mut self.overlap_duty,
772
+ &mut self.boost,
773
+ 0.0f32,
774
+ 1.0f32,
775
+ self.boost_strength,
776
+ mean,
777
+ 1u32,
778
+ n as u32,
779
+ ),
780
+ )?;
781
+ }
782
+ }
783
+
784
+ // 6. D2H active_mask and convert to sorted index list.
785
+ self.dev
786
+ .dtoh_sync_copy_into(&self.active_mask, &mut self.host_mask)?;
787
+ let mut active: Vec<u32> = Vec::with_capacity(k);
788
+ for (i, &b) in self.host_mask.iter().enumerate() {
789
+ if b != 0 {
790
+ active.push(i as u32);
791
+ }
792
+ }
793
+ debug_assert_eq!(active.len(), k, "SP must emit exactly k winners");
794
+ Ok(active)
795
+ }
796
+ }
overlay/htm_rust/src/gpu/tm_gpu.rs CHANGED
@@ -1,460 +1,460 @@
 
1
+ //! GPU Temporal Memory.
2
+ //!
3
+ //! Flat device storage. Pre-allocated segment slab:
4
+ //! n_cells = n_columns * cells_per_column
5
+ //! n_segments_max = n_cells * MAX_SEGMENTS_PER_CELL
6
+ //! n_synapses_max = n_segments_max * MAX_SYN_PER_SEGMENT
7
+ //!
8
+ //! Sizing example (CPU parity targets relaxed on GPU to keep memory
+ //! tractable). The figures below use an upper-bound configuration; the
+ //! shipped constants are smaller (see MAX_SEGMENTS_PER_CELL = 4 and
+ //! MAX_SYN_PER_SEGMENT = 20 further down):
+ //! MAX_SEGMENTS_PER_CELL = 16
+ //! MAX_SYN_PER_SEGMENT = 32
+ //!
+ //! At n_cells = 65536:
+ //! n_segments_max = 1_048_576 (~1M)
+ //! n_synapses_max = 33_554_432 (~33M)
+ //! Storage:
+ //! syn_presyn : u32 × 33M = 128 MB
+ //! syn_perm : i16 × 33M = 64 MB
+ //! seg_cell : u32 × 1M = 4 MB
+ //! seg_syn_n : u32 × 1M = 4 MB
+ //! misc bitsets etc ~ <1 MB
+ //! -------------------------------
+ //! Total per region ~200 MB
+ //!
+ //! Permanences are stored as i16 scaled by 32767 (→ [0, 32767] represents
+ //! [0.0, 1.0]). inc/dec are provided pre-scaled.
26
+
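+ // Editorial worked example of the i16 encoding above (see perm_f32_to_i16
+ // below): 0.21 → (0.21 * 32767.0).round() = 6881, and the connected
+ // threshold 0.50 → 16384, just above mid-scale.
+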
27
+ use std::sync::Arc;
28
+
29
+ use cudarc::driver::{CudaDevice, CudaSlice, DriverError, DeviceRepr, LaunchAsync, LaunchConfig};
30
+ use cudarc::nvrtc::Ptx;
31
+
32
+ /// Packed config struct passed by value to TM kernels to stay under
33
+ /// cudarc's 12-tuple launch limit. Layout must match the C-side
34
+ /// `TmConfig` struct declared in each kernel.
35
+ #[repr(C)]
36
+ #[derive(Clone, Copy)]
37
+ pub struct TmConfig {
38
+ pub activation_threshold: u32,
39
+ pub learning_threshold: u32,
40
+ pub cells_per_column: u32,
41
+ pub synapses_per_segment: u32,
42
+ pub n_segments: u32,
43
+ pub n_cells: u32,
44
+ pub max_segments_per_cell: u32,
45
+ pub max_new_synapses: u32,
46
+ pub conn_thr_i16: i32, // i16 widened to i32 for alignment
47
+ pub perm_inc_i16: i32,
48
+ pub perm_dec_i16: i32,
49
+ pub predicted_seg_dec_i16: i32,
50
+ pub initial_perm_i16: i32,
51
+ pub iter_seed: u32,
52
+ pub n_cols: u32,
53
+ pub bits_words: u32,
54
+ }
55
+
56
+ unsafe impl DeviceRepr for TmConfig {}
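+
+ // Editorial sketch: a compile-time layout guard one could add for the
+ // contract with the C-side struct (16 four-byte fields under repr(C)):
+ //     const _: () = assert!(std::mem::size_of::<TmConfig>() == 64);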
57
+
58
+ // Embedded PTX.
59
+ const PTX_TM_PREDICT: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_predict.ptx"));
60
+ const PTX_TM_ACTIVATE: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_activate.ptx"));
61
+ const PTX_TM_LEARN: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_learn.ptx"));
62
+ const PTX_TM_PUNISH: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_punish.ptx"));
63
+ const PTX_TM_GROW: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_grow.ptx"));
64
+ const PTX_TM_ANOMALY: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_anomaly.ptx"));
65
+ const PTX_TM_RESET: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_reset.ptx"));
66
+
67
+ /// Capacity trade-offs for 6 GB VRAM (RTX 3060) shared with the model:
68
+ /// n_cells = 2048 × 32 = 65_536
+ /// n_segments_max = n_cells × MAX_SEGMENTS_PER_CELL
+ /// n_synapses_max = n_segments_max × MAX_SYN_PER_SEGMENT
71
+ ///
72
+ /// At 4/20 these are 262_144 segments and ~5.2M synapses (~50 MB per region).
73
+ /// The training loop runs with `reset_each_forward=True`, so segment counts
74
+ /// per window stay well below 32K (typical: ~n_cols new segs per step until
75
+ /// the first matching segment is reused; in a 2048-step window that plateaus
76
+ /// around ~5K total live segments). The 262K ceiling is generous headroom.
77
+ pub const MAX_SEGMENTS_PER_CELL: usize = 4;
78
+ pub const MAX_SYN_PER_SEGMENT: usize = 20;
79
+
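These trade-offs are straightforward to recompute; a plain-Python sketch reproducing the arithmetic in the comments above (doc numbers, not measured allocations):

    def tm_budget_mb(n_cells: int, segs_per_cell: int, syn_per_seg: int) -> float:
        """Per-region device bytes for the four big slabs, in MB."""
        n_segments = n_cells * segs_per_cell
        n_synapses = n_segments * syn_per_seg
        nbytes = (4 * n_synapses    # syn_presyn  : u32
                  + 2 * n_synapses  # syn_perm    : i16
                  + 4 * n_segments  # seg_cell_id : u32
                  + 4 * n_segments) # seg_syn_cnt : u32
        return nbytes / 2**20

    print(tm_budget_mb(65_536, 16, 32))  # 200.0 MB (module-doc sizing example)
    print(tm_budget_mb(65_536, 4, 20))   # 32.0 MB for the four slabs at 4/20
    # The "~50 MB per region" note above also covers the remaining buffers
    # and headroom beyond these four slabs.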
80
+ const PERM_SCALE: f32 = 32767.0;
81
+
82
+ fn perm_f32_to_i16(x: f32) -> i16 {
83
+ let clamped = x.clamp(0.0, 1.0);
84
+ (clamped * PERM_SCALE).round() as i16
85
+ }
86
+
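The same quantization worked through in Python, using the two constants that appear below (0.50 connected threshold, 0.21 initial permanence); a sketch mirroring the Rust:

    PERM_SCALE = 32767.0

    def perm_f32_to_i16(x: float) -> int:
        # Clamp to [0, 1], scale, round half away from zero (what Rust's
        # .round() does; for non-negative inputs, +0.5-and-truncate matches).
        clamped = min(max(x, 0.0), 1.0)
        return int(clamped * PERM_SCALE + 0.5)

    assert perm_f32_to_i16(0.50) == 16384  # connected threshold
    assert perm_f32_to_i16(0.21) == 6881   # initial permanence
    # Quantization step is 1/32767 ~ 3.05e-5, negligible next to inc/dec = 0.10.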
87
+ pub struct TemporalMemoryGpu {
88
+ dev: Arc<CudaDevice>,
89
+
90
+ // Config mirror
91
+ pub n_columns: usize,
92
+ pub cells_per_column: usize,
93
+ pub activation_threshold: u32,
94
+ pub learning_threshold: u32,
95
+ pub initial_perm_i16: i16,
96
+ pub conn_thr_i16: i16,
97
+ pub perm_inc_i16: i16,
98
+ pub perm_dec_i16: i16,
99
+ pub predicted_seg_dec_i16: i16,
100
+ pub max_new_synapse_count: u32,
101
+
102
+ // Sizes
103
+ pub n_cells: usize,
104
+ pub n_segments_max: usize,
105
+ pub bits_words: usize, // n_cells / 32
106
+
107
+ // Persistent device buffers
108
+ seg_cell_id: CudaSlice<u32>,
109
+ seg_syn_count: CudaSlice<u32>,
110
+ syn_presyn: CudaSlice<u32>,
111
+ syn_perm: CudaSlice<i16>,
112
+ cell_seg_count: CudaSlice<u32>,
113
+
114
+ cell_active_bits: CudaSlice<u32>,
115
+ cell_winner_bits: CudaSlice<u32>,
116
+ cell_predictive_bits: CudaSlice<u32>,
117
+ prev_active_bits: CudaSlice<u32>,
118
+ prev_winner_bits: CudaSlice<u32>,
119
+
120
+ col_predicted: CudaSlice<u8>,
121
+ seg_num_active_conn: CudaSlice<u32>,
122
+ seg_num_active_pot: CudaSlice<u32>,
123
+ unpredicted_count: CudaSlice<u32>,
124
+ burst_cols_flat: CudaSlice<u32>,
125
+ burst_cols_count: CudaSlice<u32>,
126
+ col_best_match: CudaSlice<u32>,
127
+
128
+ iter_counter: u32,
129
+ }
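The `*_bits` fields are packed cell bitsets, `bits_words = n_cells / 32` words each. A sketch of the indexing this layout implies, assuming the conventional bit `i` of word `w` = cell `32*w + i` (the kernels' actual bit order is defined in the .cu files, not shown here):

    def cell_to_word_bit(cell: int) -> tuple[int, int]:
        return cell >> 5, cell & 31  # (u32 word index, bit within word)

    def set_cell(bits: list[int], cell: int) -> None:
        w, b = cell_to_word_bit(cell)
        bits[w] |= 1 << b

    def is_set(bits: list[int], cell: int) -> bool:
        w, b = cell_to_word_bit(cell)
        return (bits[w] >> b) & 1 == 1

    bits = [0] * (65_536 // 32)  # one region's cell_active_bits
    set_cell(bits, 1234)
    assert is_set(bits, 1234) and not is_set(bits, 1235)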
130
+
131
+ impl TemporalMemoryGpu {
132
+ pub fn new(
133
+ dev: Arc<CudaDevice>,
134
+ n_columns: usize,
135
+ cells_per_column: usize,
136
+ ) -> Result<Self, DriverError> {
137
+ let n_cells = n_columns * cells_per_column;
138
+ assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
139
+ let n_segments_max = n_cells * MAX_SEGMENTS_PER_CELL;
140
+ let bits_words = n_cells / 32;
141
+
142
+ // Numenta defaults.
143
+ let activation_threshold = 15u32;
144
+ let learning_threshold = 13u32;
145
+ let initial_perm_i16 = perm_f32_to_i16(0.21);
146
+ let conn_thr_i16 = perm_f32_to_i16(0.50);
147
+ let perm_inc_i16 = perm_f32_to_i16(0.10);
148
+ let perm_dec_i16 = perm_f32_to_i16(0.10);
149
+ let predicted_seg_dec_i16 = perm_f32_to_i16(0.10);
150
+ let max_new_synapse_count = 20u32;
151
+
152
+ // Allocate buffers.
153
+ let seg_cell_id_host: Vec<u32> = vec![u32::MAX; n_segments_max];
154
+ let seg_cell_id = dev.htod_sync_copy(&seg_cell_id_host)?;
155
+ let seg_syn_count = dev.alloc_zeros::<u32>(n_segments_max)?;
156
+ let syn_presyn = dev.alloc_zeros::<u32>(n_segments_max * MAX_SYN_PER_SEGMENT)?;
157
+ let syn_perm = dev.alloc_zeros::<i16>(n_segments_max * MAX_SYN_PER_SEGMENT)?;
158
+ let cell_seg_count = dev.alloc_zeros::<u32>(n_cells)?;
159
+
160
+ let cell_active_bits = dev.alloc_zeros::<u32>(bits_words)?;
161
+ let cell_winner_bits = dev.alloc_zeros::<u32>(bits_words)?;
162
+ let cell_predictive_bits = dev.alloc_zeros::<u32>(bits_words)?;
163
+ let prev_active_bits = dev.alloc_zeros::<u32>(bits_words)?;
164
+ let prev_winner_bits = dev.alloc_zeros::<u32>(bits_words)?;
165
+
166
+ let col_predicted = dev.alloc_zeros::<u8>(n_columns)?;
167
+ let seg_num_active_conn = dev.alloc_zeros::<u32>(n_segments_max)?;
168
+ let seg_num_active_pot = dev.alloc_zeros::<u32>(n_segments_max)?;
169
+ let unpredicted_count = dev.alloc_zeros::<u32>(1)?;
170
+ // Bursting columns for one step bounded by n_columns.
171
+ let burst_cols_flat = dev.alloc_zeros::<u32>(n_columns)?;
172
+ let burst_cols_count = dev.alloc_zeros::<u32>(1)?;
173
+ let col_best_match = dev.alloc_zeros::<u32>(n_columns)?;
174
+
175
+ // Load PTX modules.
176
+ let modules = [
177
+ ("htm_tm_predict", PTX_TM_PREDICT, "tm_predict"),
178
+ ("htm_tm_activate", PTX_TM_ACTIVATE, "tm_activate"),
179
+ ("htm_tm_learn", PTX_TM_LEARN, "tm_learn_reinforce"),
180
+ ("htm_tm_punish", PTX_TM_PUNISH, "tm_punish"),
181
+ ("htm_tm_grow", PTX_TM_GROW, "tm_grow"),
182
+ ("htm_tm_anomaly", PTX_TM_ANOMALY, "tm_anomaly"),
183
+ ("htm_tm_reset", PTX_TM_RESET, "tm_reset_step"),
184
+ ];
185
+ for (modname, ptx, fnname) in modules {
186
+ if dev.get_func(modname, fnname).is_none() {
187
+ dev.load_ptx(Ptx::from_src(ptx), modname, &[fnname])?;
188
+ }
189
+ }
190
+
191
+ Ok(Self {
192
+ dev,
193
+ n_columns,
194
+ cells_per_column,
195
+ activation_threshold,
196
+ learning_threshold,
197
+ initial_perm_i16,
198
+ conn_thr_i16,
199
+ perm_inc_i16,
200
+ perm_dec_i16,
201
+ predicted_seg_dec_i16,
202
+ max_new_synapse_count,
203
+ n_cells,
204
+ n_segments_max,
205
+ bits_words,
206
+ seg_cell_id,
207
+ seg_syn_count,
208
+ syn_presyn,
209
+ syn_perm,
210
+ cell_seg_count,
211
+ cell_active_bits,
212
+ cell_winner_bits,
213
+ cell_predictive_bits,
214
+ prev_active_bits,
215
+ prev_winner_bits,
216
+ col_predicted,
217
+ seg_num_active_conn,
218
+ seg_num_active_pot,
219
+ unpredicted_count,
220
+ burst_cols_flat,
221
+ burst_cols_count,
222
+ col_best_match,
223
+ iter_counter: 0,
224
+ })
225
+ }
226
+
227
+ // --- Fused-path accessors ---
228
+ pub fn seg_cell_id_accessor(&self) -> &CudaSlice<u32> { &self.seg_cell_id }
229
+ pub fn seg_syn_count_accessor(&self) -> &CudaSlice<u32> { &self.seg_syn_count }
230
+ pub fn syn_presyn_accessor(&self) -> &CudaSlice<u32> { &self.syn_presyn }
231
+ pub fn syn_perm_accessor(&self) -> &CudaSlice<i16> { &self.syn_perm }
232
+ pub fn cell_seg_count_accessor(&self) -> &CudaSlice<u32> { &self.cell_seg_count }
233
+
234
+ /// Hard reset — clear everything (predictive + active + segments).
235
+ pub fn reset(&mut self) -> Result<(), DriverError> {
236
+ // Restore "unused" sentinel in seg_cell_id.
237
+ let unused_host: Vec<u32> = vec![u32::MAX; self.n_segments_max];
238
+ self.dev.htod_sync_copy_into(&unused_host, &mut self.seg_cell_id)?;
239
+ self.dev.memset_zeros(&mut self.seg_syn_count)?;
240
+ self.dev.memset_zeros(&mut self.cell_seg_count)?;
241
+ self.dev.memset_zeros(&mut self.cell_active_bits)?;
242
+ self.dev.memset_zeros(&mut self.cell_winner_bits)?;
243
+ self.dev.memset_zeros(&mut self.cell_predictive_bits)?;
244
+ self.dev.memset_zeros(&mut self.prev_active_bits)?;
245
+ self.dev.memset_zeros(&mut self.prev_winner_bits)?;
246
+ self.dev.memset_zeros(&mut self.col_best_match)?;
247
+ self.iter_counter = 0;
248
+ Ok(())
249
+ }
250
+
251
+ fn build_cfg(&self) -> TmConfig {
252
+ TmConfig {
253
+ activation_threshold: self.activation_threshold,
254
+ learning_threshold: self.learning_threshold,
255
+ cells_per_column: self.cells_per_column as u32,
256
+ synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
257
+ n_segments: self.n_segments_max as u32,
258
+ n_cells: self.n_cells as u32,
259
+ max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
260
+ max_new_synapses: self.max_new_synapse_count,
261
+ conn_thr_i16: self.conn_thr_i16 as i32,
262
+ perm_inc_i16: self.perm_inc_i16 as i32,
263
+ perm_dec_i16: self.perm_dec_i16 as i32,
264
+ predicted_seg_dec_i16: self.predicted_seg_dec_i16 as i32,
265
+ initial_perm_i16: self.initial_perm_i16 as i32,
266
+ iter_seed: self.iter_counter,
267
+ n_cols: self.n_columns as u32,
268
+ bits_words: self.bits_words as u32,
269
+ }
270
+ }
271
+
272
+ /// Run one TM step on the GPU. Takes the SP active-column mask (u8, already
273
+ /// on device) and writes `anomaly_out[t_slot]`.
274
+ pub fn step(
275
+ &mut self,
276
+ sp_active_mask: &CudaSlice<u8>,
277
+ anomaly_out: &mut CudaSlice<f32>,
278
+ t_slot: u32,
279
+ learn: bool,
280
+ ) -> Result<(), DriverError> {
281
+ let n_cells = self.n_cells;
282
+ let n_cols = self.n_columns;
283
+
284
+ let predict_fn = self.dev.get_func("htm_tm_predict", "tm_predict").unwrap();
285
+ let activate_fn = self.dev.get_func("htm_tm_activate", "tm_activate").unwrap();
286
+ let learn_fn = self.dev.get_func("htm_tm_learn", "tm_learn_reinforce").unwrap();
287
+ let punish_fn = self.dev.get_func("htm_tm_punish", "tm_punish").unwrap();
288
+ let grow_fn = self.dev.get_func("htm_tm_grow", "tm_grow").unwrap();
289
+ let anom_fn = self.dev.get_func("htm_tm_anomaly", "tm_anomaly").unwrap();
290
+ let reset_fn = self.dev.get_func("htm_tm_reset", "tm_reset_step").unwrap();
291
+
292
+ self.iter_counter = self.iter_counter.wrapping_add(1);
293
+ let cfg_val = self.build_cfg();
294
+
295
+ // 0. Per-step reset.
296
+ let reset_words = self.bits_words.max(n_cols);
297
+ let reset_cfg = LaunchConfig {
298
+ grid_dim: (((reset_words + 255) / 256) as u32, 1, 1),
299
+ block_dim: (256, 1, 1),
300
+ shared_mem_bytes: 0,
301
+ };
302
+ unsafe {
303
+ reset_fn.clone().launch(
304
+ reset_cfg,
305
+ (
306
+ &mut self.cell_active_bits,
307
+ &mut self.cell_winner_bits,
308
+ &mut self.cell_predictive_bits,
309
+ &mut self.prev_active_bits,
310
+ &mut self.prev_winner_bits,
311
+ &mut self.col_predicted,
312
+ &mut self.unpredicted_count,
313
+ &mut self.burst_cols_count,
314
+ &mut self.col_best_match,
315
+ self.bits_words as u32,
316
+ n_cols as u32,
317
+ ),
318
+ )?;
319
+ }
320
+
321
+ // 1. Predict (grid = n_cells; each block iterates its cell's segments).
322
+ let predict_cfg = LaunchConfig {
323
+ grid_dim: (n_cells as u32, 1, 1),
324
+ block_dim: (32, 1, 1),
325
+ shared_mem_bytes: 0,
326
+ };
327
+ unsafe {
328
+ predict_fn.clone().launch(
329
+ predict_cfg,
330
+ (
331
+ &self.seg_cell_id,
332
+ &self.seg_syn_count,
333
+ &self.syn_presyn,
334
+ &self.syn_perm,
335
+ &self.prev_active_bits,
336
+ &mut self.cell_predictive_bits,
337
+ &mut self.col_predicted,
338
+ &mut self.seg_num_active_conn,
339
+ &mut self.seg_num_active_pot,
340
+ &mut self.col_best_match,
341
+ &self.cell_seg_count,
342
+ cfg_val,
343
+ ),
344
+ )?;
345
+ }
346
+
347
+ // 2. Activate.
348
+ let activate_cfg = LaunchConfig {
349
+ grid_dim: (((n_cols + 255) / 256) as u32, 1, 1),
350
+ block_dim: (256, 1, 1),
351
+ shared_mem_bytes: 0,
352
+ };
353
+ unsafe {
354
+ activate_fn.clone().launch(
355
+ activate_cfg,
356
+ (
357
+ sp_active_mask,
358
+ &self.col_predicted,
359
+ &self.cell_predictive_bits,
360
+ &mut self.cell_active_bits,
361
+ &mut self.cell_winner_bits,
362
+ &mut self.unpredicted_count,
363
+ &mut self.burst_cols_flat,
364
+ &mut self.burst_cols_count,
365
+ cfg_val,
366
+ ),
367
+ )?;
368
+ }
369
+
370
+ // 3. Anomaly.
371
+ let anom_cfg = LaunchConfig {
372
+ grid_dim: (1, 1, 1),
373
+ block_dim: (256, 1, 1),
374
+ shared_mem_bytes: 0,
375
+ };
376
+ unsafe {
377
+ anom_fn.clone().launch(
378
+ anom_cfg,
379
+ (
380
+ sp_active_mask,
381
+ &self.unpredicted_count,
382
+ anomaly_out,
383
+ t_slot,
384
+ n_cols as u32,
385
+ ),
386
+ )?;
387
+ }
388
+
389
+ if learn {
390
+ // 4. Reinforce (grid = n_cells).
391
+ let learn_cfg = LaunchConfig {
392
+ grid_dim: (n_cells as u32, 1, 1),
393
+ block_dim: (32, 1, 1),
394
+ shared_mem_bytes: 0,
395
+ };
396
+ unsafe {
397
+ learn_fn.clone().launch(
398
+ learn_cfg,
399
+ (
400
+ &self.seg_cell_id,
401
+ &self.seg_syn_count,
402
+ &self.syn_presyn,
403
+ &mut self.syn_perm,
404
+ &self.seg_num_active_conn,
405
+ &self.prev_active_bits,
406
+ sp_active_mask,
407
+ &self.col_predicted,
408
+ &self.cell_seg_count,
409
+ cfg_val,
410
+ ),
411
+ )?;
412
+ }
413
+
414
+ // 5. Punish.
415
+ unsafe {
416
+ punish_fn.clone().launch(
417
+ learn_cfg,
418
+ (
419
+ &self.seg_cell_id,
420
+ &self.seg_syn_count,
421
+ &self.syn_presyn,
422
+ &mut self.syn_perm,
423
+ &self.seg_num_active_pot,
424
+ &self.prev_active_bits,
425
+ sp_active_mask,
426
+ &self.cell_seg_count,
427
+ cfg_val,
428
+ ),
429
+ )?;
430
+ }
431
+
432
+ // 6. Grow.
433
+ let grow_cfg = LaunchConfig {
434
+ grid_dim: (n_cols as u32, 1, 1),
435
+ block_dim: (32, 1, 1),
436
+ shared_mem_bytes: 0,
437
+ };
438
+ unsafe {
439
+ grow_fn.clone().launch(
440
+ grow_cfg,
441
+ (
442
+ &mut self.seg_cell_id,
443
+ &mut self.seg_syn_count,
444
+ &mut self.syn_presyn,
445
+ &mut self.syn_perm,
446
+ &mut self.cell_seg_count,
447
+ &self.burst_cols_flat,
448
+ &self.burst_cols_count,
449
+ &self.prev_winner_bits,
450
+ &self.prev_active_bits,
451
+ &self.col_best_match,
452
+ cfg_val,
453
+ ),
454
+ )?;
455
+ }
456
+ }
457
+
458
+ Ok(())
459
+ }
460
+ }
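The anomaly kernel's source lives in tm_anomaly.cu elsewhere in this commit; from its launch arguments (active-column mask, unpredicted count, output slot) it plausibly computes the standard HTM anomaly score. A reference sketch under that assumption, not a quote of the kernel:

    import numpy as np

    def anomaly_score(sp_active_mask: np.ndarray, unpredicted_count: int) -> float:
        # Standard HTM anomaly: fraction of active columns that no cell predicted.
        n_active = int(np.count_nonzero(sp_active_mask))
        return unpredicted_count / n_active if n_active else 0.0

    mask = np.zeros(2048, dtype=np.uint8)
    mask[:40] = 1                       # 40 SP-active columns this step
    print(anomaly_score(mask, 10))      # 0.25: a quarter of them burst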
overlay/htm_rust/uv.lock CHANGED
@@ -1,8 +1,8 @@
1
- version = 1
2
- revision = 3
3
- requires-python = ">=3.11"
4
-
5
- [[package]]
6
- name = "htm-rust"
7
- version = "0.1.0"
8
- source = { editable = "." }
 
1
+ version = 1
2
+ revision = 3
3
+ requires-python = ">=3.11"
4
+
5
+ [[package]]
6
+ name = "htm-rust"
7
+ version = "0.1.0"
8
+ source = { editable = "." }
overlay/hydra/config.py CHANGED
@@ -110,8 +110,8 @@ class PostSemClawConfig:
110
  gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
111
 
112
  # Label smoothing + Z-loss
113
- label_smoothing: float = field(default_factory=lambda: float(os.environ.get("HYDRA_LABEL_SMOOTHING", "0.0")))
114
- z_loss_weight: float = field(default_factory=lambda: float(os.environ.get("HYDRA_Z_LOSS_WEIGHT", "1e-4")))
115
 
116
 
117
  # ---------------------------------------------------------------------------
 
110
  gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
111
 
112
  # Label smoothing + Z-loss
113
+ label_smoothing: float = 0.0 # disabled: any smoothing hurts in 5-min budget
114
+ z_loss_weight: float = 1e-4
115
 
116
 
117
  # ---------------------------------------------------------------------------
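For reference, the usual PaLM-style form of the auxiliary z-loss that a `z_loss_weight` like this scales; a sketch of the standard regularizer, not a quote of this repo's train loop:

    import torch

    def z_loss(logits: torch.Tensor, weight: float = 1e-4) -> torch.Tensor:
        # Penalize the squared log-partition so logit magnitudes stay bounded.
        z = torch.logsumexp(logits.float(), dim=-1)  # (B, T)
        return weight * (z ** 2).mean()

    logits = torch.randn(2, 8, 50_257)
    total = torch.nn.functional.cross_entropy(
        logits.reshape(-1, logits.shape[-1]),
        torch.randint(0, 50_257, (16,)),
    ) + z_loss(logits)
    print(total)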
overlay/hydra/engram.py CHANGED
@@ -1,23 +1,93 @@
1
- """GPU Engram β€” Top-k Sparse Hopfield retrieval with optional Cantor/SDR nerve constraint."""
2
 
3
- from __future__ import annotations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
- import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  import torch
8
  import torch.nn as nn
9
 
 
 
 
10
 
11
- _ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
 
12
 
 
 
 
13
 
14
- class GPUEngram(nn.Module):
15
- """GPU Engram: Top-k Sparse Hopfield retrieval.
 
 
16
 
17
- Default `routing_mode=flat` preserves the existing full-memory top-k path.
18
- `cantor_sdr` constrains candidates to the current Cantor leaf shard and SDR
19
- active offsets. `auto` only uses that local path when it is cheaper than the
20
- full score matrix (`K * d_model < n_columns`).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  """
22
 
23
  def __init__(
@@ -31,15 +101,20 @@ class GPUEngram(nn.Module):
31
  self.n_columns = n_columns
32
  self.max_ngram = max_ngram
33
  self.hebbian_boost = hebbian_boost
 
34
  self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
35
  self.gate = nn.Linear(d_model, 1, bias=True)
36
- nn.init.constant_(self.gate.bias, 0.0)
37
- self.topk_k = min(_ENGRAM_TOPK, n_columns)
38
  self.primes = [2654435761, 2246822519, 3266489917]
39
  self.hebbian_lr = 0.01
40
- self.routing_mode = os.environ.get("HYDRA_ENGRAM_ROUTING", "auto").lower()
 
 
 
41
 
42
  def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
 
43
  B, T = token_ids.shape
44
  h = token_ids * self.primes[0]
45
  if T > 1:
@@ -52,103 +127,44 @@ class GPUEngram(nn.Module):
52
  h = h ^ (shifted2 * self.primes[2])
53
  return h % self.n_columns
54
 
55
- def _validate_active_indices(self, sdr_active_indices: torch.Tensor, x: torch.Tensor) -> None:
56
- if not torch.is_floating_point(sdr_active_indices) and sdr_active_indices.dtype != torch.bool:
57
- pass
58
- else:
59
- raise ValueError("Engram Cantor/SDR routing expects compact active indices, not a dense SDR mask")
60
- if sdr_active_indices.dim() not in (2, 3):
61
- raise ValueError("compact active indices must have shape (B,T,K) or (B*T,K)")
62
- # Dense SDR masks arrive with K ~= n_bits; compact buffers are small
63
- # (retina target_active or RealityBridge l0_k). Refuse obviously dense
64
- # masks so forced cantor_sdr cannot silently route 0/1 values as offsets.
65
- if sdr_active_indices.shape[-1] > 1024 or sdr_active_indices.shape[-1] > self.n_columns:
66
- raise ValueError("Engram Cantor/SDR routing expects compact active indices, not a dense SDR mask")
67
-
68
- def _cantor_sdr_candidates(
69
- self,
70
- sdr_active_indices: torch.Tensor,
71
- cantor_leaf_ids: torch.Tensor,
72
- n_leaves: int,
73
- ) -> torch.Tensor:
74
- """Map SDR active offsets into each Cantor leaf's Engram column shard."""
75
- self._validate_active_indices(sdr_active_indices, cantor_leaf_ids)
76
- if sdr_active_indices.dim() == 2:
77
- B, T = cantor_leaf_ids.shape
78
- sdr_active_indices = sdr_active_indices.view(B, T, -1)
79
- sdr = sdr_active_indices.to(device=cantor_leaf_ids.device, dtype=torch.long)
80
- leaves = cantor_leaf_ids.to(dtype=torch.long).clamp(min=0, max=max(0, n_leaves - 1))
81
- cols_per_leaf = max(1, self.n_columns // max(1, n_leaves))
82
- offsets = sdr.remainder(cols_per_leaf)
83
- base = leaves.unsqueeze(-1) * cols_per_leaf
84
- return (base + offsets).clamp(max=self.n_columns - 1)
85
-
86
- def _flat_retrieve(self, x: torch.Tensor) -> torch.Tensor:
87
- scores = x @ self.memory.T
88
- topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1)
89
- topk_w = torch.softmax(topk_vals, dim=-1)
90
- selected_mem = self.memory[topk_idx]
91
- return torch.einsum('btk,btkd->btd', topk_w, selected_mem)
92
 
93
- def _cantor_sdr_retrieve(
94
- self,
95
- x: torch.Tensor,
96
- sdr_active_indices: torch.Tensor,
97
- cantor_leaf_ids: torch.Tensor,
98
- cantor_n_leaves: int,
99
- ) -> torch.Tensor:
100
- candidates = self._cantor_sdr_candidates(
101
- sdr_active_indices,
102
- cantor_leaf_ids,
103
- n_leaves=cantor_n_leaves,
104
- )
105
- cand_mem = self.memory[candidates]
106
- scores = torch.einsum('btd,btkd->btk', x, cand_mem)
107
- k = min(self.topk_k, scores.shape[-1])
108
- topk_vals, local_idx = scores.topk(k, dim=-1)
109
- topk_w = torch.softmax(topk_vals, dim=-1)
110
- global_idx = candidates.gather(-1, local_idx)
111
- selected_mem = self.memory[global_idx]
112
- return torch.einsum('btk,btkd->btd', topk_w, selected_mem)
113
 
114
- def forward(
115
- self,
116
- x: torch.Tensor,
117
- token_ids: torch.Tensor,
118
- sdr_active_indices: torch.Tensor | None = None,
119
- cantor_leaf_ids: torch.Tensor | None = None,
120
- cantor_n_leaves: int | None = None,
121
- ):
122
- B, T, D = x.shape
123
- mode = self.routing_mode
124
- use_cantor = (
125
- mode in {"cantor_sdr", "auto"}
126
- and sdr_active_indices is not None
127
- and cantor_leaf_ids is not None
128
- and cantor_n_leaves is not None
129
- )
130
- if mode == "auto" and use_cantor:
131
- k_active = sdr_active_indices.shape[-1]
132
- # Compare actual retrieval candidates against the full-memory scan.
133
- # The previous `(k_active * D) < n_columns` check mixed candidate
134
- # count with feature dimension, so d256/k64 fell back to flat
135
- # retrieval even though Cantor/SDR scores only 64 candidates vs
136
- # 8k-16k memory columns. That kept required subsystems active but
137
- # spent tens of billions of extra MACs per forward.
138
- use_cantor = k_active < self.n_columns
139
-
140
- if use_cantor and mode in {"cantor_sdr", "auto"}:
141
- retrieved = self._cantor_sdr_retrieve(x, sdr_active_indices, cantor_leaf_ids, cantor_n_leaves)
142
- else:
143
- retrieved = self._flat_retrieve(x)
144
-
145
- alpha = torch.sigmoid(self.gate(x))
146
 
 
147
  if self.training and self.hebbian_boost:
148
  with torch.no_grad():
 
149
  indices = self._hash(token_ids)
150
- flat_idx = indices.reshape(-1)
151
- flat_x = x.detach().reshape(-1, D)
152
  mem_dtype = self.memory.data.dtype
153
  updates = (
154
  self.hebbian_lr * flat_x
@@ -156,5 +172,6 @@ class GPUEngram(nn.Module):
156
  ).to(mem_dtype)
157
  self.memory.data.index_add_(0, flat_idx, updates)
158
 
 
159
  hit_rate = (alpha.detach() > 0.1).float().mean()
160
  return x + alpha * retrieved, hit_rate
 
1
+ """GPU Engram β€” Sparse Modern Hopfield retrieval path.
2
 
3
+ ## What changed (scatter-gather → Hopfield matmul)
4
+
5
+ The original forward used `self.memory[indices]` (scatter-gather), which misses
6
+ L2 cache at n_columns > 4096 and creates a hard tps ceiling.
7
+
8
+ The replacement uses:
9
+ scores = x @ self.memory.T # (B, T, n_columns) — coalesced matmul
10
+ weights = entmax15(scores, dim=-1) # sparse attention; 95%+ exact zeros
11
+ retrieved = weights @ self.memory # (B, T, d_model) — coalesced matmul
12
+
13
+ Both matmuls are tile-friendly (cuBLAS GEMM), so L2 reuse is high regardless of
14
+ n_columns. Gradient flows through both matmuls so `self.memory` learns via
15
+ autograd in addition to (or instead of) the Hebbian EMA writes.
16
+
17
+ ## Sparsity mechanism
18
+
19
+ alpha-entmax with alpha=1.5 (entmax15) is a sparse attention operator that maps
20
+ logit vectors to distributions where many entries are *exactly* zero (not merely
21
+ small). It generalises softmax (alpha=1) and argmax (alphaβ†’βˆž). At n_columns=1024
22
+ with d_model=64 a random batch typically hits β‰₯95% zero entries β€” the key
23
+ property that keeps bandwidth proportional to *attended* columns, not all columns.
24
+
25
+ Fallback: if `entmax` is not pip-installed, top-k softmax (k=32) is used instead.
26
+ This is chosen at module-import time — NO runtime branching per forward call.
27
+
28
+ ## token_ids argument
29
 
30
+ token_ids is accepted for API compatibility with the rest of the hydra stack
31
+ (train.py, lightning_module.py call `engram(x, token_ids)`). It is NOT used in
32
+ the retrieval path — the Hopfield path computes dense similarity over the whole
33
+ memory bank, which subsumes any hash-based column selection. Documented here to
34
+ prevent confusion.
35
+
36
+ ## Hebbian writes (hebbian_boost=False by default)
37
+
38
+ With Hopfield retrieval, gradient signals reach self.memory through autograd, so
39
+ Hebbian EMA writes are no longer critical. They are preserved as an *optional*
40
+ boost (hebbian_boost=True) for experiments that want both signals. Default is off.
41
+
42
+ ## Checkpoint compatibility
43
+
44
+ `self.memory` shape (n_columns, d_model) is unchanged, so existing .pt / .ckpt
45
+ files load without modification.
46
+ """
47
+
48
+ from __future__ import annotations
49
 
50
  import torch
51
  import torch.nn as nn
52
 
53
+ # ---------------------------------------------------------------------------
54
+ # Sparse-attention backend — chosen ONCE at import time, no runtime branching.
55
+ # ---------------------------------------------------------------------------
56
 
57
+ try:
58
+ from entmax import entmax15 as _entmax15 # type: ignore[import]
59
 
60
+ def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:
61
+ """alpha-entmax (alpha=1.5): truly sparse distribution over last dim."""
62
+ return _entmax15(scores, dim=-1).to(dtype=scores.dtype)
63
 
64
+ _BACKEND = "entmax15"
65
+
66
+ except ImportError: # pragma: no cover — entmax always installed in CI
67
+ _K = 32 # top-k for fallback
68
 
69
+ def _sparse_attention(scores: torch.Tensor) -> torch.Tensor: # type: ignore[misc]
70
+ """Top-k softmax fallback: zero outside the k highest-scoring columns."""
71
+ topk_vals, topk_idx = scores.topk(_K, dim=-1)
72
+ topk_w = torch.softmax(topk_vals, dim=-1)
73
+ weights = torch.zeros_like(scores)
74
+ weights.scatter_(-1, topk_idx, topk_w.to(dtype=weights.dtype))
75
+ return weights
76
+
77
+ _BACKEND = "topk32"
78
+
79
+
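The exact-zero property is cheap to verify when the optional `entmax` package is present; a standalone sketch (the measured fraction varies with the score distribution):

    import torch
    from entmax import entmax15  # optional dependency, as above

    scores = torch.randn(4, 16, 1024)               # (B, T, n_columns)
    weights = entmax15(scores, dim=-1)
    zero_frac = (weights == 0).float().mean().item()
    print(f"exact-zero fraction: {zero_frac:.3f}")  # typically well above 0.9
    # Each row is still a proper distribution:
    assert torch.allclose(weights.sum(-1), torch.ones(4, 16), atol=1e-5)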
80
+ class GPUEngram(nn.Module):
81
+ """GPU Engram: Sparse Modern Hopfield retrieval.
82
+
83
+ Args:
84
+ d_model: Model dimension — must match the surrounding transformer.
85
+ n_columns: Number of memory columns (key-value pairs). Safe at 32,768
86
+ with the matmul path; the old scatter-gather had an L2
87
+ cliff above ~4,096.
88
+ max_ngram: Retained for API compatibility; unused in retrieval path.
89
+ hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
90
+ during training (old behaviour, now optional). Default False.
91
  """
92
 
93
  def __init__(
 
101
  self.n_columns = n_columns
102
  self.max_ngram = max_ngram
103
  self.hebbian_boost = hebbian_boost
104
+ # Shape unchanged from original β€” existing checkpoints load cleanly.
105
  self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
106
  self.gate = nn.Linear(d_model, 1, bias=True)
107
+ nn.init.constant_(self.gate.bias, 0.0) # START OPEN
108
+ # Retained for any external code that reads these attrs.
109
  self.primes = [2654435761, 2246822519, 3266489917]
110
  self.hebbian_lr = 0.01
111
+
112
+ # ------------------------------------------------------------------
113
+ # _hash: retained for API/checkpoint compat; unused in forward below.
114
+ # ------------------------------------------------------------------
115
 
116
  def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
117
+ """N-gram hash β†’ column index (kept for backward-compat; not used in retrieval)."""
118
  B, T = token_ids.shape
119
  h = token_ids * self.primes[0]
120
  if T > 1:
 
127
  h = h ^ (shifted2 * self.primes[2])
128
  return h % self.n_columns
129
 
130
+ # ------------------------------------------------------------------
131
+ # forward
132
+ # ------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
135
+ """Hopfield retrieve + soft gate + residual.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ Args:
138
+ x: (B, T, d_model) β€” input activations.
139
+ token_ids: (B, T) — token indices. Accepted for API compatibility;
140
+ NOT used in the retrieval path (see module docstring).
141
+
142
+ Returns:
143
+ (x + alpha * retrieved, hit_rate)
144
+ - x + alpha * retrieved: (B, T, d_model)
145
+ - hit_rate: scalar tensor — fraction of gate values > 0.1
146
+ """
147
+ # ---- 1. Similarity scores (coalesced GEMM) ----------------------
148
+ # scores[b, t, c] = dot(x[b,t], memory[c])
149
+ scores = x @ self.memory.T # (B, T, n_columns)
150
+
151
+ # ---- 2. Sparse attention weights --------------------------------
152
+ # _sparse_attention is fixed at import time (entmax15 or top-k).
153
+ weights = _sparse_attention(scores) # (B, T, n_columns), many exact zeros
154
+
155
+ # ---- 3. Retrieved vector (coalesced GEMM) -----------------------
156
+ retrieved = weights @ self.memory # (B, T, d_model)
157
+
158
+ # ---- 4. Soft gate (unchanged) -----------------------------------
159
+ alpha = torch.sigmoid(self.gate(x)) # (B, T, 1)
 
 
 
 
 
 
 
 
 
160
 
161
+ # ---- 5. Optional Hebbian EMA write ------------------------------
162
  if self.training and self.hebbian_boost:
163
  with torch.no_grad():
164
+ # Reuse the hash-based indices for the write target (sparse update).
165
  indices = self._hash(token_ids)
166
+ flat_idx = indices.reshape(-1) # (B*T,)
167
+ flat_x = x.detach().reshape(-1, x.shape[-1]) # (B*T, d_model)
168
  mem_dtype = self.memory.data.dtype
169
  updates = (
170
  self.hebbian_lr * flat_x
 
172
  ).to(mem_dtype)
173
  self.memory.data.index_add_(0, flat_idx, updates)
174
 
175
+ # ---- 6. Residual + hit_rate -------------------------------------
176
  hit_rate = (alpha.detach() > 0.1).float().mean()
177
  return x + alpha * retrieved, hit_rate
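The top-k fallback is the deleted `_flat_retrieve` rearranged into GEMM form; a self-contained sketch checking the two layouts agree numerically:

    import torch

    torch.manual_seed(0)
    B, T, D, C, K = 2, 4, 8, 64, 32
    x, memory = torch.randn(B, T, D), torch.randn(C, D)

    # Old path: gather the top-k memory rows, then a batched einsum.
    scores = x @ memory.T
    vals, idx = scores.topk(K, dim=-1)
    w = torch.softmax(vals, dim=-1)
    old = torch.einsum('btk,btkd->btd', w, memory[idx])

    # New path (top-k fallback backend): dense sparse weights, then one GEMM.
    dense = torch.zeros_like(scores).scatter_(-1, idx, w)
    new = dense @ memory

    assert torch.allclose(old, new, atol=1e-5)  # same math, GEMM-friendly layout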
overlay/hydra/model.py CHANGED
@@ -469,6 +469,7 @@ class PostSemClawModel(nn.Module):
469
  # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
470
  # dtypes in the same shape group would break lerp_ dtype checks.
471
  self.wte.to(dtype=torch.bfloat16)
 
472
  self.htm_proj.to(dtype=torch.bfloat16)
473
  self.sdr_proj.to(dtype=torch.bfloat16)
474
  self.engram.to(dtype=torch.bfloat16)
 
469
  # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
470
  # dtypes in the same shape group would break lerp_ dtype checks.
471
  self.wte.to(dtype=torch.bfloat16)
472
+ self.blocks.to(dtype=torch.bfloat16)
473
  self.htm_proj.to(dtype=torch.bfloat16)
474
  self.sdr_proj.to(dtype=torch.bfloat16)
475
  self.engram.to(dtype=torch.bfloat16)
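The failure mode the comment describes reproduces in isolation; a sketch assuming, per that comment, that the optimizer's momentum update goes through `Tensor.lerp_`:

    import torch

    buf = torch.zeros(4, 4, dtype=torch.bfloat16)  # buffer from a bf16 param
    grad = torch.randn(4, 4, dtype=torch.float32)  # same-shape fp32 straggler

    try:
        buf.lerp_(grad, 0.05)  # mixed dtypes inside one shape group
    except RuntimeError as err:
        print(err)  # in-place lerp_ refuses the fp32 -> bf16 downcast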
overlay/scripts/autoresearch.py CHANGED
@@ -1,517 +1,517 @@
1
- #!/usr/bin/env python3
2
- """HYDRA Autoresearch Mutation Loop.
3
-
4
- Runs baseline training -> evaluates -> picks ONE mutation at a time ->
5
- trains -> evaluates -> keeps if quality improves AND tps >= floor.
6
- Repeats until all mutations exhausted or Ctrl+C.
7
-
8
- State persisted in .omc/autoresearch_config.json for resume support.
9
-
10
- Usage:
11
- python scripts/autoresearch.py # run full loop
12
- python scripts/autoresearch.py --dry-run # show plan, don't train
13
- python scripts/autoresearch.py --baseline # only run baseline eval
14
- """
15
-
16
- from __future__ import annotations
17
-
18
- import argparse
19
- import json
20
- import math
21
- import os
22
- import re
23
- import signal
24
- import subprocess
25
- import sys
26
- import time
27
- from pathlib import Path
28
-
29
- _PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
30
- if _PROJECT_ROOT not in sys.path:
31
- sys.path.insert(0, _PROJECT_ROOT)
32
-
33
- # ---------------------------------------------------------------------------
34
- # Mutation catalog (ordered by expected impact)
35
- # ---------------------------------------------------------------------------
36
-
37
- MUTATIONS = [
38
- # Learning dynamics — env vars verified in hydra/config.py
39
- {"name": "lr_matrix_0.012", "env": "HYDRA_MATRIX_LR=0.012"}, # default 0.12
40
- {"name": "lr_matrix_0.06", "env": "HYDRA_MATRIX_LR=0.06"}, # half default
41
- {"name": "lr_matrix_0.24", "env": "HYDRA_MATRIX_LR=0.24"}, # double default
42
- {"name": "lr_floor_50pct", "env": "HYDRA_LR_MIN_MULT=0.5"}, # default 0.0
43
- {"name": "lr_floor_20pct", "env": "HYDRA_LR_MIN_MULT=0.2"}, # default 0.0
44
- {"name": "embed_lr_0.5", "env": "HYDRA_EMBED_LR=0.5"}, # default 1.0
45
- {"name": "embed_lr_2.0", "env": "HYDRA_EMBED_LR=2.0"}, # default 1.0
46
- {"name": "unembed_lr_0.01", "env": "HYDRA_UNEMBED_LR=0.01"}, # default 0.005
47
- # Architecture — env vars verified in hydra/config.py
48
- {"name": "d_model_384", "env": "HYDRA_D_MODEL=384"}, # default 256
49
- {"name": "d_model_192", "env": "HYDRA_D_MODEL=192"}, # smaller
50
- {"name": "d_state_128", "env": "HYDRA_D_STATE=128"}, # default 64
51
- {"name": "d_state_32", "env": "HYDRA_D_STATE=32"}, # smaller
52
- {"name": "n_layer_6", "env": "HYDRA_N_LAYER=6"}, # default 4
53
- {"name": "n_layer_3", "env": "HYDRA_N_LAYER=3"}, # fewer
54
- {"name": "headdim_16", "env": "HYDRA_HEADDIM=16"}, # default 32 -> more heads
55
- {"name": "headdim_64", "env": "HYDRA_HEADDIM=64"}, # default 32 -> fewer heads
56
- {"name": "expand_3", "env": "HYDRA_EXPAND=3"}, # default 2
57
- {"name": "engram_2048", "env": "HYDRA_ENGRAM_N_COLUMNS=2048"}, # default 1024
58
- {"name": "engram_4096", "env": "HYDRA_ENGRAM_N_COLUMNS=4096"}, # default 1024
59
- {"name": "engram_512", "env": "HYDRA_ENGRAM_N_COLUMNS=512"}, # smaller
60
- # Batch size
61
- {"name": "batch_32k", "env": "HYDRA_TOTAL_BATCH=32768"}, # default 32768 (verify)
62
- {"name": "batch_16k", "env": "HYDRA_TOTAL_BATCH=16384"}, # smaller batch
63
- {"name": "batch_65k", "env": "HYDRA_TOTAL_BATCH=65536"}, # larger batch
64
- # Regularization — env vars verified in hydra/model.py + hydra/config.py
65
- {"name": "dropout_0.05", "env": "HYDRA_DROPOUT=0.05"}, # default 0.2
66
- {"name": "dropout_0.1", "env": "HYDRA_DROPOUT=0.1"}, # default 0.2
67
- {"name": "dropout_0.3", "env": "HYDRA_DROPOUT=0.3"}, # higher
68
- ]
69
-
70
- # ---------------------------------------------------------------------------
71
- # State management
72
- # ---------------------------------------------------------------------------
73
-
74
- STATE_DIR = os.path.join(_PROJECT_ROOT, ".omc")
75
- STATE_FILE = os.path.join(STATE_DIR, "autoresearch_config.json")
76
-
77
- DEFAULT_STATE = {
78
- "baseline_quality": None,
79
- "baseline_tps": None,
80
- "current_gen": 0,
81
- "mutations_tested": [],
82
- "mutations_kept": [],
83
- "tps_floor": 62000,
84
- "time_budget": 600,
85
- "history": [],
86
- }
87
-
88
-
89
- def load_state() -> dict:
90
- """Load state from disk or return default."""
91
- if os.path.exists(STATE_FILE):
92
- with open(STATE_FILE, "r") as f:
93
- state = json.load(f)
94
- # Backfill missing keys from defaults
95
- for k, v in DEFAULT_STATE.items():
96
- if k not in state:
97
- state[k] = v
98
- return state
99
- return dict(DEFAULT_STATE)
100
-
101
-
102
- def save_state(state: dict) -> None:
103
- """Persist state to disk."""
104
- os.makedirs(STATE_DIR, exist_ok=True)
105
- with open(STATE_FILE, "w") as f:
106
- json.dump(state, f, indent=2)
107
-
108
-
109
- # ---------------------------------------------------------------------------
110
- # Training subprocess
111
- # ---------------------------------------------------------------------------
112
-
113
- def build_env(extra_env: str | None = None) -> dict[str, str]:
114
- """Build environment for training subprocess."""
115
- env = os.environ.copy()
116
- # Ensure CUDA paths
117
- ld_paths = ["/usr/lib/wsl/lib", "/usr/local/cuda/lib64"]
118
- existing = env.get("LD_LIBRARY_PATH", "")
119
- for p in ld_paths:
120
- if p not in existing:
121
- existing = p + ":" + existing
122
- env["LD_LIBRARY_PATH"] = existing
123
-
124
- # Apply mutation env var
125
- if extra_env:
126
- key, val = extra_env.split("=", 1)
127
- env[key] = val
128
-
129
- return env
130
-
131
-
132
- def run_training(time_budget: int, extra_env: str | None = None) -> dict | None:
133
- """Run train.py with given time budget and optional env override.
134
-
135
- Returns dict with parsed metrics, or None on failure.
136
- """
137
- env = build_env(extra_env)
138
- env["HYDRA_TIME_BUDGET"] = str(time_budget)
139
-
140
- cmd = [os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"), "-u", "train.py"]
141
-
142
- try:
143
- proc = subprocess.Popen(
144
- cmd,
145
- cwd=_PROJECT_ROOT,
146
- env=env,
147
- stdout=subprocess.PIPE,
148
- stderr=subprocess.STDOUT,
149
- text=True,
150
- bufsize=1,
151
- )
152
- except Exception as e:
153
- print(f" [ERROR] Failed to start training: {e}")
154
- return None
155
-
156
- output_lines: list[str] = []
157
- last_step_line = ""
158
-
159
- try:
160
- for line in proc.stdout:
161
- line = line.rstrip()
162
- output_lines.append(line)
163
- if line.startswith("step="):
164
- last_step_line = line
165
- # Print progress every 50 steps
166
- m = re.search(r"step=(\d+)", line)
167
- if m and int(m.group(1)) % 50 == 0:
168
- tps_m = re.search(r"tps=(\d+)", line)
169
- bpb_m = re.search(r"bpb=([\d.]+)", line)
170
- tps = tps_m.group(1) if tps_m else "?"
171
- bpb = bpb_m.group(1) if bpb_m else "?"
172
- print(f" step={m.group(1)} tps={tps} bpb={bpb}", flush=True)
173
- elif "val_bpb" in line or "factual_english_score" in line:
174
- print(f" {line}", flush=True)
175
- except KeyboardInterrupt:
176
- proc.terminate()
177
- proc.wait()
178
- raise
179
-
180
- proc.wait()
181
- if proc.returncode != 0:
182
- print(f" [ERROR] Training exited with code {proc.returncode}")
183
- # Print last 10 lines for debugging
184
- for line in output_lines[-10:]:
185
- print(f" {line}")
186
- return None
187
-
188
- return _parse_training_output(output_lines)
189
-
190
-
191
- def _parse_training_output(lines: list[str]) -> dict:
192
- """Extract metrics from training output lines."""
193
- metrics: dict[str, float] = {}
194
-
195
- for line in lines:
196
- # Key=value pairs from summary block
197
- for key in ["val_bpb", "training_seconds", "peak_vram_mb", "mfu_percent",
198
- "total_tokens_M", "num_steps", "factual_english_score",
199
- "factual_english_hits"]:
200
- m = re.match(rf"^{key}:\s+([\d.]+)", line.strip())
201
- if m:
202
- metrics[key] = float(m.group(1))
203
-
204
- # TPS from last step line
205
- if line.startswith("step="):
206
- tps_m = re.search(r"tps=(\d+)", line)
207
- if tps_m:
208
- metrics["tps"] = float(tps_m.group(1))
209
-
210
- return metrics
211
-
212
-
213
- # ---------------------------------------------------------------------------
214
- # Eval integration
215
- # ---------------------------------------------------------------------------
216
-
217
- def run_eval_after_training(extra_env: str | None = None) -> dict | None:
218
- """Run eval_quality.py after training. Returns metrics dict or None."""
219
- env = build_env(extra_env)
220
- cmd = [
221
- os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"),
222
- os.path.join(_PROJECT_ROOT, "scripts", "eval_quality.py"),
223
- ]
224
-
225
- try:
226
- result = subprocess.run(
227
- cmd,
228
- cwd=_PROJECT_ROOT,
229
- env=env,
230
- capture_output=True,
231
- text=True,
232
- timeout=120, # 2 min max for eval
233
- )
234
- except subprocess.TimeoutExpired:
235
- print(" [ERROR] Eval timed out (120s)")
236
- return None
237
- except Exception as e:
238
- print(f" [ERROR] Eval failed: {e}")
239
- return None
240
-
241
- if result.returncode != 0:
242
- print(f" [ERROR] Eval exited with code {result.returncode}")
243
- for line in result.stdout.split("\n")[-10:]:
244
- print(f" {line}")
245
- for line in result.stderr.split("\n")[-5:]:
246
- print(f" {line}")
247
- return None
248
-
249
- # Parse key=value output
250
- metrics = {}
251
- for line in result.stdout.split("\n"):
252
- line = line.strip()
253
- m = re.match(r"^([\w]+)=([\d.eE+-]+)$", line)
254
- if m:
255
- try:
256
- metrics[m.group(1)] = float(m.group(2))
257
- except ValueError:
258
- pass
259
-
260
- return metrics if metrics else None
261
-
262
-
263
- # ---------------------------------------------------------------------------
264
- # Git operations
265
- # ---------------------------------------------------------------------------
266
-
267
- def git_commit(message: str) -> bool:
268
- """Stage all changes and commit."""
269
- try:
270
- subprocess.run(["git", "add", "-A"], cwd=_PROJECT_ROOT, check=True,
271
- capture_output=True, timeout=30)
272
- subprocess.run(
273
- ["git", "commit", "-m", message],
274
- cwd=_PROJECT_ROOT, check=True, capture_output=True, timeout=30,
275
- )
276
- return True
277
- except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
278
- print(f" [WARN] Git commit failed: {e}")
279
- return False
280
-
281
-
282
- # ---------------------------------------------------------------------------
283
- # Main loop
284
- # ---------------------------------------------------------------------------
285
-
286
- _SHUTDOWN = False
287
-
288
-
289
- def _handle_sigint(signum, frame):
290
- global _SHUTDOWN
291
- if _SHUTDOWN:
292
- print("\n[AUTORESEARCH] Double Ctrl+C β€” force exit")
293
- sys.exit(1)
294
- _SHUTDOWN = True
295
- print("\n[AUTORESEARCH] Ctrl+C received β€” finishing current gen then saving state...")
296
-
297
-
298
- def main():
299
- global _SHUTDOWN
300
- signal.signal(signal.SIGINT, _handle_sigint)
301
-
302
- parser = argparse.ArgumentParser(description="HYDRA autoresearch mutation loop")
303
- parser.add_argument("--dry-run", action="store_true", help="Show plan, don't train")
304
- parser.add_argument("--baseline", action="store_true", help="Only run baseline")
305
- parser.add_argument("--time-budget", type=int, default=600, help="Time budget per run (s)")
306
- parser.add_argument("--tps-floor", type=int, default=62000, help="Minimum acceptable TPS")
307
- args = parser.parse_args()
308
-
309
- state = load_state()
310
- state["time_budget"] = args.time_budget
311
- state["tps_floor"] = args.tps_floor
312
-
313
- tested = set(state["mutations_tested"])
314
- remaining = [m for m in MUTATIONS if m["name"] not in tested]
315
-
316
- print("=" * 70)
317
- print("HYDRA AUTORESEARCH MUTATION LOOP")
318
- print("=" * 70)
319
- print(f"Time budget per run: {state['time_budget']}s")
320
- print(f"TPS floor: {state['tps_floor']}")
321
- print(f"Current gen: {state['current_gen']}")
322
- print(f"Mutations tested: {len(tested)}/{len(MUTATIONS)}")
323
- print(f"Mutations kept: {state['mutations_kept']}")
324
- print(f"Remaining: {[m['name'] for m in remaining]}")
325
- print()
326
-
327
- if args.dry_run:
328
- print("[DRY RUN] Would test these mutations in order:")
329
- for i, m in enumerate(remaining):
330
- print(f" {i + 1}. {m['name']} ({m['env']})")
331
- return
332
-
333
- # -----------------------------------------------------------------------
334
- # Baseline (Gen 0)
335
- # -----------------------------------------------------------------------
336
- if state["baseline_quality"] is None:
337
- print("[GEN 0] Running baseline training + evaluation...")
338
- train_metrics = run_training(state["time_budget"])
339
- if train_metrics is None:
340
- print("[FAIL] Baseline training failed")
341
- save_state(state)
342
- return
343
-
344
- print("[GEN 0] Running quality evaluation...")
345
- eval_metrics = run_eval_after_training()
346
- if eval_metrics is None:
347
- print("[FAIL] Baseline eval failed")
348
- save_state(state)
349
- return
350
-
351
- baseline_tps = train_metrics.get("tps", 0)
352
- baseline_quality = eval_metrics.get("quality_score", 0)
353
-
354
- state["baseline_quality"] = baseline_quality
355
- state["baseline_tps"] = baseline_tps
356
- state["current_gen"] = 0
357
- state["history"].append({
358
- "gen": 0,
359
- "mutation": "baseline",
360
- "quality_score": baseline_quality,
361
- "baseline_score": baseline_quality,
362
- "delta": "0.0%",
363
- "tps": baseline_tps,
364
- "ppl": eval_metrics.get("ppl", 0),
365
- "bleu4": eval_metrics.get("bleu4", 0),
366
- "rouge_l": eval_metrics.get("rouge_l", 0),
367
- "factual": eval_metrics.get("factual", 0),
368
- "bpb": eval_metrics.get("bpb", 0),
369
- "repetition_rate": eval_metrics.get("repetition_rate", 0),
370
- "kept": True,
371
- })
372
- save_state(state)
373
- print(f"[GEN 0] BASELINE: quality={baseline_quality:.4f} tps={baseline_tps:.0f}")
374
-
375
- if args.baseline:
376
- return
377
- else:
378
- print(f"[RESUME] Baseline quality={state['baseline_quality']:.4f} tps={state['baseline_tps']:.0f}")
379
- if args.baseline:
380
- return
381
-
382
- # -----------------------------------------------------------------------
383
- # Mutation loop
384
- # -----------------------------------------------------------------------
385
- current_quality = state["baseline_quality"]
386
- # Track best quality so far (from last kept mutation, not just baseline)
387
- if state["history"]:
388
- kept_entries = [h for h in state["history"] if h.get("kept")]
389
- if kept_entries:
390
- current_quality = kept_entries[-1]["quality_score"]
391
-
392
- for mutation in remaining:
393
- if _SHUTDOWN:
394
- print("[AUTORESEARCH] Shutdown requested β€” saving state")
395
- save_state(state)
396
- return
397
-
398
- gen = state["current_gen"] + 1
399
- name = mutation["name"]
400
- env_str = mutation["env"]
401
-
402
- print(f"\n[GEN {gen}] Testing {name} ({env_str})...")
403
- print(f" Current best quality: {current_quality:.4f}")
404
-
405
- # Train with mutation
406
- print(f" Training ({state['time_budget']}s)...", flush=True)
407
- train_metrics = run_training(state["time_budget"], extra_env=env_str)
408
- if train_metrics is None:
409
- print(f" [SKIP] Training failed for {name}")
410
- state["mutations_tested"].append(name)
411
- state["current_gen"] = gen
412
- state["history"].append({
413
- "gen": gen, "mutation": name,
414
- "quality_score": 0, "baseline_score": current_quality,
415
- "delta": "FAIL", "tps": 0, "ppl": 0, "bleu4": 0,
416
- "rouge_l": 0, "factual": 0, "bpb": 0, "repetition_rate": 0,
417
- "kept": False,
418
- })
419
- save_state(state)
420
- continue
421
-
422
- tps = train_metrics.get("tps", 0)
423
-
424
- # TPS floor check
425
- if tps < state["tps_floor"]:
426
- print(f" [REJECT] TPS={tps:.0f} < floor={state['tps_floor']} β€” skipping eval")
427
- state["mutations_tested"].append(name)
428
- state["current_gen"] = gen
429
- state["history"].append({
430
- "gen": gen, "mutation": name,
431
- "quality_score": 0, "baseline_score": current_quality,
432
- "delta": f"TPS_FAIL({tps:.0f})", "tps": tps,
433
- "ppl": 0, "bleu4": 0, "rouge_l": 0, "factual": 0,
434
- "bpb": train_metrics.get("val_bpb", 0), "repetition_rate": 0,
435
- "kept": False,
436
- })
437
- save_state(state)
438
- continue
439
-
440
- # Evaluate
441
- print(f" Evaluating...", flush=True)
442
- eval_metrics = run_eval_after_training(extra_env=env_str)
443
- if eval_metrics is None:
444
- print(f" [SKIP] Eval failed for {name}")
445
- state["mutations_tested"].append(name)
446
- state["current_gen"] = gen
447
- state["history"].append({
448
- "gen": gen, "mutation": name,
449
- "quality_score": 0, "baseline_score": current_quality,
450
- "delta": "EVAL_FAIL", "tps": tps, "ppl": 0, "bleu4": 0,
451
- "rouge_l": 0, "factual": 0, "bpb": 0, "repetition_rate": 0,
452
- "kept": False,
453
- })
454
- save_state(state)
455
- continue
456
-
457
- quality = eval_metrics.get("quality_score", 0)
458
- delta_pct = ((quality - current_quality) / max(abs(current_quality), 1e-6)) * 100
459
- delta_str = f"{delta_pct:+.1f}%"
460
-
461
- kept = quality > current_quality and tps >= state["tps_floor"]
462
- status = "KEEP" if kept else "DISCARD"
463
-
464
- entry = {
465
- "gen": gen,
466
- "mutation": name,
467
- "quality_score": quality,
468
- "baseline_score": current_quality,
469
- "delta": delta_str,
470
- "tps": tps,
471
- "ppl": eval_metrics.get("ppl", 0),
472
- "bleu4": eval_metrics.get("bleu4", 0),
473
- "rouge_l": eval_metrics.get("rouge_l", 0),
474
- "factual": eval_metrics.get("factual", 0),
475
- "bpb": eval_metrics.get("bpb", 0),
476
- "repetition_rate": eval_metrics.get("repetition_rate", 0),
477
- "kept": kept,
478
- }
479
-
480
- print(f"\n[GEN {gen}] {name}: quality={quality:.4f} ({delta_str}) tps={tps:.0f} -> {status}")
481
-
482
- if kept:
483
- current_quality = quality
484
- state["mutations_kept"].append(name)
485
- git_commit(f"autoresearch: gen {gen} β€” {name} quality {delta_str}")
486
-
487
- state["mutations_tested"].append(name)
488
- state["current_gen"] = gen
489
- state["history"].append(entry)
490
- save_state(state)
491
-
492
- # -----------------------------------------------------------------------
493
- # Summary
494
- # -----------------------------------------------------------------------
495
- print("\n" + "=" * 70)
496
- print("AUTORESEARCH COMPLETE")
497
- print("=" * 70)
498
- print(f"Total generations: {state['current_gen']}")
499
- print(f"Mutations kept: {state['mutations_kept']}")
500
- print(f"Final quality: {current_quality:.4f}")
501
- if state["baseline_quality"]:
502
- total_delta = ((current_quality - state["baseline_quality"]) /
503
- max(abs(state["baseline_quality"]), 1e-6)) * 100
504
- print(f"Total improvement: {total_delta:+.1f}%")
505
- print()
506
-
507
- # Print history table
508
- print(f"{'Gen':>4} {'Mutation':>20} {'Quality':>8} {'Delta':>8} {'TPS':>7} {'PPL':>8} {'BPB':>7} {'Kept':>5}")
509
- print("-" * 75)
510
- for h in state["history"]:
511
- print(f"{h['gen']:4d} {h['mutation']:>20s} {h['quality_score']:8.4f} "
512
- f"{h['delta']:>8s} {h['tps']:7.0f} {h['ppl']:8.2f} "
513
- f"{h.get('bpb', 0):7.4f} {' YES' if h['kept'] else ' NO'}")
514
-
515
-
516
- if __name__ == "__main__":
517
- main()
 
1
+ #!/usr/bin/env python3
2
+ """HYDRA Autoresearch Mutation Loop.
3
+
4
+ Runs baseline training -> evaluates -> picks ONE mutation at a time ->
5
+ trains -> evaluates -> keeps if quality improves AND tps >= floor.
6
+ Repeats until all mutations exhausted or Ctrl+C.
7
+
8
+ State persisted in .omc/autoresearch_config.json for resume support.
9
+
10
+ Usage:
11
+ python scripts/autoresearch.py # run full loop
12
+ python scripts/autoresearch.py --dry-run # show plan, don't train
13
+ python scripts/autoresearch.py --baseline # only run baseline eval
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import json
20
+ import math
21
+ import os
22
+ import re
23
+ import signal
24
+ import subprocess
25
+ import sys
26
+ import time
27
+ from pathlib import Path
28
+
29
+ _PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
30
+ if _PROJECT_ROOT not in sys.path:
31
+ sys.path.insert(0, _PROJECT_ROOT)
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Mutation catalog (ordered by expected impact)
35
+ # ---------------------------------------------------------------------------
36
+
37
+ MUTATIONS = [
38
+ # Learning dynamics — env vars verified in hydra/config.py
39
+ {"name": "lr_matrix_0.012", "env": "HYDRA_MATRIX_LR=0.012"}, # default 0.12
40
+ {"name": "lr_matrix_0.06", "env": "HYDRA_MATRIX_LR=0.06"}, # half default
41
+ {"name": "lr_matrix_0.24", "env": "HYDRA_MATRIX_LR=0.24"}, # double default
42
+ {"name": "lr_floor_50pct", "env": "HYDRA_LR_MIN_MULT=0.5"}, # default 0.0
43
+ {"name": "lr_floor_20pct", "env": "HYDRA_LR_MIN_MULT=0.2"}, # default 0.0
44
+ {"name": "embed_lr_0.5", "env": "HYDRA_EMBED_LR=0.5"}, # default 1.0
45
+ {"name": "embed_lr_2.0", "env": "HYDRA_EMBED_LR=2.0"}, # default 1.0
46
+ {"name": "unembed_lr_0.01", "env": "HYDRA_UNEMBED_LR=0.01"}, # default 0.005
47
+ # Architecture — env vars verified in hydra/config.py
48
+ {"name": "d_model_384", "env": "HYDRA_D_MODEL=384"}, # default 256
49
+ {"name": "d_model_192", "env": "HYDRA_D_MODEL=192"}, # smaller
50
+ {"name": "d_state_128", "env": "HYDRA_D_STATE=128"}, # default 64
51
+ {"name": "d_state_32", "env": "HYDRA_D_STATE=32"}, # smaller
52
+ {"name": "n_layer_6", "env": "HYDRA_N_LAYER=6"}, # default 4
53
+ {"name": "n_layer_3", "env": "HYDRA_N_LAYER=3"}, # fewer
54
+ {"name": "headdim_16", "env": "HYDRA_HEADDIM=16"}, # default 32 -> more heads
55
+ {"name": "headdim_64", "env": "HYDRA_HEADDIM=64"}, # default 32 -> fewer heads
56
+ {"name": "expand_3", "env": "HYDRA_EXPAND=3"}, # default 2
57
+ {"name": "engram_2048", "env": "HYDRA_ENGRAM_N_COLUMNS=2048"}, # default 1024
58
+ {"name": "engram_4096", "env": "HYDRA_ENGRAM_N_COLUMNS=4096"}, # default 1024
59
+ {"name": "engram_512", "env": "HYDRA_ENGRAM_N_COLUMNS=512"}, # smaller
60
+ # Batch size
61
+ {"name": "batch_32k", "env": "HYDRA_TOTAL_BATCH=32768"}, # default 32768 (verify)
62
+ {"name": "batch_16k", "env": "HYDRA_TOTAL_BATCH=16384"}, # smaller batch
63
+ {"name": "batch_65k", "env": "HYDRA_TOTAL_BATCH=65536"}, # larger batch
64
+ # Regularization — env vars verified in hydra/model.py + hydra/config.py
65
+ {"name": "dropout_0.05", "env": "HYDRA_DROPOUT=0.05"}, # default 0.2
66
+ {"name": "dropout_0.1", "env": "HYDRA_DROPOUT=0.1"}, # default 0.2
67
+ {"name": "dropout_0.3", "env": "HYDRA_DROPOUT=0.3"}, # higher
68
+ ]
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # State management
72
+ # ---------------------------------------------------------------------------
73
+
74
+ STATE_DIR = os.path.join(_PROJECT_ROOT, ".omc")
75
+ STATE_FILE = os.path.join(STATE_DIR, "autoresearch_config.json")
76
+
77
+ DEFAULT_STATE = {
78
+ "baseline_quality": None,
79
+ "baseline_tps": None,
80
+ "current_gen": 0,
81
+ "mutations_tested": [],
82
+ "mutations_kept": [],
83
+ "tps_floor": 62000,
84
+ "time_budget": 600,
85
+ "history": [],
86
+ }
87
+
88
+
89
+ def load_state() -> dict:
90
+ """Load state from disk or return default."""
91
+ if os.path.exists(STATE_FILE):
92
+ with open(STATE_FILE, "r") as f:
93
+ state = json.load(f)
94
+ # Backfill missing keys from defaults
95
+ for k, v in DEFAULT_STATE.items():
96
+ if k not in state:
97
+ state[k] = v
98
+ return state
99
+ return dict(DEFAULT_STATE)
100
+
101
+
102
+ def save_state(state: dict) -> None:
103
+ """Persist state to disk."""
104
+ os.makedirs(STATE_DIR, exist_ok=True)
105
+ with open(STATE_FILE, "w") as f:
106
+ json.dump(state, f, indent=2)
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Training subprocess
111
+ # ---------------------------------------------------------------------------
112
+
113
+ def build_env(extra_env: str | None = None) -> dict[str, str]:
114
+ """Build environment for training subprocess."""
115
+ env = os.environ.copy()
116
+ # Ensure CUDA paths
117
+ ld_paths = ["/usr/lib/wsl/lib", "/usr/local/cuda/lib64"]
118
+ existing = env.get("LD_LIBRARY_PATH", "")
119
+ for p in ld_paths:
120
+ if p not in existing:
121
+ existing = p + ":" + existing
122
+ env["LD_LIBRARY_PATH"] = existing
123
+
124
+ # Apply mutation env var
125
+ if extra_env:
126
+ key, val = extra_env.split("=", 1)
127
+ env[key] = val
128
+
129
+ return env
130
+
131
+
132
+ def run_training(time_budget: int, extra_env: str | None = None) -> dict | None:
133
+ """Run train.py with given time budget and optional env override.
134
+
135
+ Returns dict with parsed metrics, or None on failure.
136
+ """
137
+ env = build_env(extra_env)
138
+ env["HYDRA_TIME_BUDGET"] = str(time_budget)
139
+
140
+ cmd = [os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"), "-u", "train.py"]
141
+
142
+ try:
143
+ proc = subprocess.Popen(
144
+ cmd,
145
+ cwd=_PROJECT_ROOT,
146
+ env=env,
147
+ stdout=subprocess.PIPE,
148
+ stderr=subprocess.STDOUT,
149
+ text=True,
150
+ bufsize=1,
151
+ )
152
+ except Exception as e:
153
+ print(f" [ERROR] Failed to start training: {e}")
154
+ return None
155
+
156
+ output_lines: list[str] = []
157
+ last_step_line = ""
158
+
159
+ try:
160
+ for line in proc.stdout:
161
+ line = line.rstrip()
162
+ output_lines.append(line)
163
+ if line.startswith("step="):
164
+ last_step_line = line
165
+ # Print progress every 50 steps
166
+ m = re.search(r"step=(\d+)", line)
167
+ if m and int(m.group(1)) % 50 == 0:
168
+ tps_m = re.search(r"tps=(\d+)", line)
169
+ bpb_m = re.search(r"bpb=([\d.]+)", line)
170
+ tps = tps_m.group(1) if tps_m else "?"
171
+ bpb = bpb_m.group(1) if bpb_m else "?"
172
+ print(f" step={m.group(1)} tps={tps} bpb={bpb}", flush=True)
173
+ elif "val_bpb" in line or "factual_english_score" in line:
174
+ print(f" {line}", flush=True)
175
+ except KeyboardInterrupt:
176
+ proc.terminate()
177
+ proc.wait()
178
+ raise
179
+
180
+ proc.wait()
181
+ if proc.returncode != 0:
182
+ print(f" [ERROR] Training exited with code {proc.returncode}")
183
+ # Print last 10 lines for debugging
184
+ for line in output_lines[-10:]:
185
+ print(f" {line}")
186
+ return None
187
+
188
+ return _parse_training_output(output_lines)
189
+
190
+
191
+ def _parse_training_output(lines: list[str]) -> dict:
192
+ """Extract metrics from training output lines."""
193
+ metrics: dict[str, float] = {}
194
+
195
+ for line in lines:
196
+ # Key=value pairs from summary block
197
+ for key in ["val_bpb", "training_seconds", "peak_vram_mb", "mfu_percent",
198
+ "total_tokens_M", "num_steps", "factual_english_score",
199
+ "factual_english_hits"]:
200
+ m = re.match(rf"^{key}:\s+([\d.]+)", line.strip())
201
+ if m:
202
+ metrics[key] = float(m.group(1))
203
+
204
+ # TPS from last step line
205
+ if line.startswith("step="):
206
+ tps_m = re.search(r"tps=(\d+)", line)
207
+ if tps_m:
208
+ metrics["tps"] = float(tps_m.group(1))
209
+
210
+ return metrics
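The regexes above assume `key=value` tokens on each progress line; a tiny illustration (the sample line is hypothetical, not captured train.py output):

    import re

    line = "step=150 tps=65321 bpb=1.0423"
    tps = re.search(r"tps=(\d+)", line)
    bpb = re.search(r"bpb=([\d.]+)", line)
    print(float(tps.group(1)), float(bpb.group(1)))  # 65321.0 1.0423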
211
+
212
+
213
+ # ---------------------------------------------------------------------------
214
+ # Eval integration
215
+ # ---------------------------------------------------------------------------
216
+
217
+ def run_eval_after_training(extra_env: str | None = None) -> dict | None:
218
+ """Run eval_quality.py after training. Returns metrics dict or None."""
219
+ env = build_env(extra_env)
220
+ cmd = [
221
+ os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"),
222
+ os.path.join(_PROJECT_ROOT, "scripts", "eval_quality.py"),
223
+ ]
224
+
225
+ try:
226
+ result = subprocess.run(
227
+ cmd,
228
+ cwd=_PROJECT_ROOT,
229
+ env=env,
230
+ capture_output=True,
231
+ text=True,
232
+ timeout=120, # 2 min max for eval
233
+ )
234
+ except subprocess.TimeoutExpired:
235
+ print(" [ERROR] Eval timed out (120s)")
236
+ return None
237
+ except Exception as e:
238
+ print(f" [ERROR] Eval failed: {e}")
239
+ return None
240
+
241
+ if result.returncode != 0:
242
+ print(f" [ERROR] Eval exited with code {result.returncode}")
243
+ for line in result.stdout.split("\n")[-10:]:
244
+ print(f" {line}")
245
+ for line in result.stderr.split("\n")[-5:]:
246
+ print(f" {line}")
247
+ return None
248
+
249
+ # Parse key=value output
250
+ metrics = {}
251
+ for line in result.stdout.split("\n"):
252
+ line = line.strip()
253
+ m = re.match(r"^([\w]+)=([\d.eE+-]+)$", line)
254
+ if m:
255
+ try:
256
+ metrics[m.group(1)] = float(m.group(2))
257
+ except ValueError:
258
+ pass
259
+
260
+ return metrics if metrics else None
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # Git operations
265
+ # ---------------------------------------------------------------------------
266
+
267
+ def git_commit(message: str) -> bool:
268
+ """Stage all changes and commit."""
269
+ try:
270
+ subprocess.run(["git", "add", "-A"], cwd=_PROJECT_ROOT, check=True,
271
+ capture_output=True, timeout=30)
272
+ subprocess.run(
273
+ ["git", "commit", "-m", message],
274
+ cwd=_PROJECT_ROOT, check=True, capture_output=True, timeout=30,
275
+ )
276
+ return True
277
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
278
+ print(f" [WARN] Git commit failed: {e}")
279
+ return False
280
+
281
+
282
+ # ---------------------------------------------------------------------------
283
+ # Main loop
284
+ # ---------------------------------------------------------------------------
285
+
286
+ _SHUTDOWN = False
287
+
288
+
289
+ def _handle_sigint(signum, frame):
290
+ global _SHUTDOWN
291
+ if _SHUTDOWN:
292
+ print("\n[AUTORESEARCH] Double Ctrl+C β€” force exit")
293
+ sys.exit(1)
294
+ _SHUTDOWN = True
295
+ print("\n[AUTORESEARCH] Ctrl+C received β€” finishing current gen then saving state...")
296
+
297
+
298
+ def main():
299
+ global _SHUTDOWN
300
+ signal.signal(signal.SIGINT, _handle_sigint)
301
+
302
+ parser = argparse.ArgumentParser(description="HYDRA autoresearch mutation loop")
303
+ parser.add_argument("--dry-run", action="store_true", help="Show plan, don't train")
304
+ parser.add_argument("--baseline", action="store_true", help="Only run baseline")
305
+ parser.add_argument("--time-budget", type=int, default=600, help="Time budget per run (s)")
306
+ parser.add_argument("--tps-floor", type=int, default=62000, help="Minimum acceptable TPS")
307
+ args = parser.parse_args()
308
+
309
+ state = load_state()
310
+ state["time_budget"] = args.time_budget
311
+ state["tps_floor"] = args.tps_floor
312
+
313
+ tested = set(state["mutations_tested"])
314
+ remaining = [m for m in MUTATIONS if m["name"] not in tested]
315
+
316
+ print("=" * 70)
317
+ print("HYDRA AUTORESEARCH MUTATION LOOP")
318
+ print("=" * 70)
319
+ print(f"Time budget per run: {state['time_budget']}s")
320
+ print(f"TPS floor: {state['tps_floor']}")
321
+ print(f"Current gen: {state['current_gen']}")
322
+ print(f"Mutations tested: {len(tested)}/{len(MUTATIONS)}")
323
+ print(f"Mutations kept: {state['mutations_kept']}")
324
+ print(f"Remaining: {[m['name'] for m in remaining]}")
325
+ print()
326
+
327
+ if args.dry_run:
328
+ print("[DRY RUN] Would test these mutations in order:")
329
+ for i, m in enumerate(remaining):
330
+ print(f" {i + 1}. {m['name']} ({m['env']})")
331
+ return
332
+
333
+ # -----------------------------------------------------------------------
334
+ # Baseline (Gen 0)
335
+ # -----------------------------------------------------------------------
336
+ if state["baseline_quality"] is None:
337
+ print("[GEN 0] Running baseline training + evaluation...")
338
+ train_metrics = run_training(state["time_budget"])
339
+ if train_metrics is None:
340
+ print("[FAIL] Baseline training failed")
341
+ save_state(state)
342
+ return
343
+
344
+ print("[GEN 0] Running quality evaluation...")
345
+ eval_metrics = run_eval_after_training()
346
+ if eval_metrics is None:
347
+ print("[FAIL] Baseline eval failed")
348
+ save_state(state)
349
+ return
350
+
351
+ baseline_tps = train_metrics.get("tps", 0)
352
+ baseline_quality = eval_metrics.get("quality_score", 0)
353
+
354
+ state["baseline_quality"] = baseline_quality
355
+ state["baseline_tps"] = baseline_tps
356
+ state["current_gen"] = 0
357
+ state["history"].append({
358
+ "gen": 0,
359
+ "mutation": "baseline",
360
+ "quality_score": baseline_quality,
361
+ "baseline_score": baseline_quality,
362
+ "delta": "0.0%",
363
+ "tps": baseline_tps,
364
+ "ppl": eval_metrics.get("ppl", 0),
365
+ "bleu4": eval_metrics.get("bleu4", 0),
366
+ "rouge_l": eval_metrics.get("rouge_l", 0),
367
+ "factual": eval_metrics.get("factual", 0),
368
+ "bpb": eval_metrics.get("bpb", 0),
369
+ "repetition_rate": eval_metrics.get("repetition_rate", 0),
370
+ "kept": True,
371
+ })
372
+ save_state(state)
373
+ print(f"[GEN 0] BASELINE: quality={baseline_quality:.4f} tps={baseline_tps:.0f}")
374
+
375
+ if args.baseline:
376
+ return
377
+ else:
378
+ print(f"[RESUME] Baseline quality={state['baseline_quality']:.4f} tps={state['baseline_tps']:.0f}")
379
+ if args.baseline:
380
+ return
381
+
382
+ # -----------------------------------------------------------------------
383
+ # Mutation loop
384
+ # -----------------------------------------------------------------------
385
+ current_quality = state["baseline_quality"]
386
+ # Track best quality so far (from last kept mutation, not just baseline)
387
+ if state["history"]:
388
+ kept_entries = [h for h in state["history"] if h.get("kept")]
389
+ if kept_entries:
390
+ current_quality = kept_entries[-1]["quality_score"]
391
+
392
+ for mutation in remaining:
393
+ if _SHUTDOWN:
394
+ print("[AUTORESEARCH] Shutdown requested β€” saving state")
395
+ save_state(state)
396
+ return
397
+
398
+ gen = state["current_gen"] + 1
399
+ name = mutation["name"]
400
+ env_str = mutation["env"]
401
+
402
+ print(f"\n[GEN {gen}] Testing {name} ({env_str})...")
403
+ print(f" Current best quality: {current_quality:.4f}")
404
+
405
+ # Train with mutation
406
+ print(f" Training ({state['time_budget']}s)...", flush=True)
407
+ train_metrics = run_training(state["time_budget"], extra_env=env_str)
408
+ if train_metrics is None:
409
+ print(f" [SKIP] Training failed for {name}")
410
+ state["mutations_tested"].append(name)
411
+ state["current_gen"] = gen
412
+ state["history"].append({
413
+ "gen": gen, "mutation": name,
414
+ "quality_score": 0, "baseline_score": current_quality,
415
+ "delta": "FAIL", "tps": 0, "ppl": 0, "bleu4": 0,
416
+ "rouge_l": 0, "factual": 0, "bpb": 0, "repetition_rate": 0,
417
+ "kept": False,
418
+ })
419
+ save_state(state)
420
+ continue
421
+
422
+ tps = train_metrics.get("tps", 0)
423
+
424
+ # TPS floor check
425
+ if tps < state["tps_floor"]:
426
+ print(f" [REJECT] TPS={tps:.0f} < floor={state['tps_floor']} β€” skipping eval")
427
+ state["mutations_tested"].append(name)
428
+ state["current_gen"] = gen
429
+ state["history"].append({
430
+ "gen": gen, "mutation": name,
431
+ "quality_score": 0, "baseline_score": current_quality,
432
+ "delta": f"TPS_FAIL({tps:.0f})", "tps": tps,
433
+ "ppl": 0, "bleu4": 0, "rouge_l": 0, "factual": 0,
434
+ "bpb": train_metrics.get("val_bpb", 0), "repetition_rate": 0,
435
+ "kept": False,
436
+ })
437
+ save_state(state)
438
+ continue
439
+
440
+ # Evaluate
441
+ print(f" Evaluating...", flush=True)
442
+ eval_metrics = run_eval_after_training(extra_env=env_str)
443
+ if eval_metrics is None:
444
+ print(f" [SKIP] Eval failed for {name}")
445
+ state["mutations_tested"].append(name)
446
+ state["current_gen"] = gen
447
+ state["history"].append({
448
+ "gen": gen, "mutation": name,
449
+ "quality_score": 0, "baseline_score": current_quality,
450
+ "delta": "EVAL_FAIL", "tps": tps, "ppl": 0, "bleu4": 0,
451
+ "rouge_l": 0, "factual": 0, "bpb": 0, "repetition_rate": 0,
452
+ "kept": False,
453
+ })
454
+ save_state(state)
455
+ continue
456
+
457
+ quality = eval_metrics.get("quality_score", 0)
458
+ delta_pct = ((quality - current_quality) / max(abs(current_quality), 1e-6)) * 100
459
+ delta_str = f"{delta_pct:+.1f}%"
460
+
461
+ kept = quality > current_quality and tps >= state["tps_floor"]
462
+ status = "KEEP" if kept else "DISCARD"
463
+
464
+ entry = {
465
+ "gen": gen,
466
+ "mutation": name,
467
+ "quality_score": quality,
468
+ "baseline_score": current_quality,
469
+ "delta": delta_str,
470
+ "tps": tps,
471
+ "ppl": eval_metrics.get("ppl", 0),
472
+ "bleu4": eval_metrics.get("bleu4", 0),
473
+ "rouge_l": eval_metrics.get("rouge_l", 0),
474
+ "factual": eval_metrics.get("factual", 0),
475
+ "bpb": eval_metrics.get("bpb", 0),
476
+ "repetition_rate": eval_metrics.get("repetition_rate", 0),
477
+ "kept": kept,
478
+ }
479
+
480
+ print(f"\n[GEN {gen}] {name}: quality={quality:.4f} ({delta_str}) tps={tps:.0f} -> {status}")
481
+
482
+ if kept:
483
+ current_quality = quality
484
+ state["mutations_kept"].append(name)
485
+ git_commit(f"autoresearch: gen {gen} β€” {name} quality {delta_str}")
486
+
487
+ state["mutations_tested"].append(name)
488
+ state["current_gen"] = gen
489
+ state["history"].append(entry)
490
+ save_state(state)
491
+
492
+ # -----------------------------------------------------------------------
493
+ # Summary
494
+ # -----------------------------------------------------------------------
495
+ print("\n" + "=" * 70)
496
+ print("AUTORESEARCH COMPLETE")
497
+ print("=" * 70)
498
+ print(f"Total generations: {state['current_gen']}")
499
+ print(f"Mutations kept: {state['mutations_kept']}")
500
+ print(f"Final quality: {current_quality:.4f}")
501
+ if state["baseline_quality"]:
502
+ total_delta = ((current_quality - state["baseline_quality"]) /
503
+ max(abs(state["baseline_quality"]), 1e-6)) * 100
504
+ print(f"Total improvement: {total_delta:+.1f}%")
505
+ print()
506
+
507
+ # Print history table
508
+ print(f"{'Gen':>4} {'Mutation':>20} {'Quality':>8} {'Delta':>8} {'TPS':>7} {'PPL':>8} {'BPB':>7} {'Kept':>5}")
509
+ print("-" * 75)
510
+ for h in state["history"]:
511
+ print(f"{h['gen']:4d} {h['mutation']:>20s} {h['quality_score']:8.4f} "
512
+ f"{h['delta']:>8s} {h['tps']:7.0f} {h['ppl']:8.2f} "
513
+ f"{h.get('bpb', 0):7.4f} {' YES' if h['kept'] else ' NO'}")
514
+
515
+
516
+ if __name__ == "__main__":
517
+ main()
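
A quick illustration of the parsing contract above. The sample lines are invented for the sketch (the real strings are whatever train.py actually prints), and it assumes `_parse_training_output` is in scope, e.g. imported from this script:

    # Hypothetical training output: one progress line plus summary "key: value" lines.
    sample = [
        "step=500 tps=64210 bpb=1.2345",
        "val_bpb: 1.2301",
        "training_seconds: 600.0",
        "peak_vram_mb: 18432",
    ]
    print(_parse_training_output(sample))
    # {'tps': 64210.0, 'val_bpb': 1.2301, 'training_seconds': 600.0, 'peak_vram_mb': 18432.0}

Downstream, the mutation loop keeps a run only if its `quality_score` beats the current best while `tps` stays at or above `--tps-floor`; everything else is recorded in the history table and discarded.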
overlay/scripts/chat.py CHANGED
"""Interactive chat REPL for HYDRA.

Usage:
    python scripts/chat.py              # auto-select best checkpoint
    python scripts/chat.py --ckpt PATH  # explicit checkpoint
    python scripts/chat.py --sft        # prefer sft_final.pt
    python scripts/chat.py --random     # skip ckpt, use random weights

HONESTY: model is ~7.5M params at d_model=256/n_layer=4. Expect incoherent
output. This REPL validates the *interface* — tokenizer roundtrip, generation
loop, stop-token handling, conversation history truncation. Coherent dialogue
is not a goal at this scale.

Slash commands:
    /reset      clear conversation history
    /quit       exit
    /temp X     set temperature (default 0.8)
    /topk K     set top-k (default 40)
    /topp P     set top-p (default 0.9)
    /max N      set max new tokens per turn (default 200)
    /rep R      set repetition penalty (default 1.1)
    /sys S      set a system prefix prepended to every turn
    /info       print current settings + checkpoint path
"""

from __future__ import annotations

import argparse
import os
import sys
import time
from dataclasses import asdict
from pathlib import Path

# Make repo root importable when invoked as `python scripts/chat.py`.
_REPO_ROOT = Path(__file__).resolve().parent.parent
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))

import torch  # noqa: E402

# Chat template — plain-text fallback (see .omc/chat_plan.md).
# If the SFT agent later reserves special tokens, redefine USER_TAG /
# ASSISTANT_TAG / END_TAG and the stop-string accordingly.
USER_TAG = "User:"
ASSISTANT_TAG = "Assistant:"
END_TAG = "\nUser:"  # stop-string matched on decoded output

CKPT_DIR = Path(os.path.expanduser("~/.cache/autoresearch/ckpts"))
CKPT_CANDIDATES_PRETRAIN = ["pretrain_final.pt", "latest.pt"]
CKPT_CANDIDATES_SFT = ["sft_final.pt"]


# ---------------------------------------------------------------------------
# Checkpoint resolution
# ---------------------------------------------------------------------------

def resolve_checkpoint(explicit: str | None, prefer_sft: bool) -> Path | None:
    """Return Path to checkpoint file, or None if nothing found.

    Order:
      1. `explicit` if provided and exists.
      2. If prefer_sft: sft_final.pt -> pretrain_final.pt -> latest.pt.
      3. Else: sft_final.pt (if exists) -> pretrain_final.pt -> latest.pt.
    """
    if explicit:
        p = Path(os.path.expanduser(explicit))
        if p.exists():
            return p
        print(f"[WARN] --ckpt {p} does not exist; falling through to auto-select.", file=sys.stderr)

    # Task spec: prefer sft_final.pt if it exists; otherwise pretrain_final.pt
    # then latest.pt. --sft just makes the preference explicit; it's already
    # the default behavior. We list SFT first in both orderings to honor the
    # spec, since the task description said "prefer sft if exists" by default.
    _ = prefer_sft  # reserved for future "pretrain-only" vs "sft-only" modes
    order = CKPT_CANDIDATES_SFT + CKPT_CANDIDATES_PRETRAIN
    for name in order:
        cand = CKPT_DIR / name
        if cand.exists():
            return cand
    return None


# ---------------------------------------------------------------------------
# Model + tokenizer loading
# ---------------------------------------------------------------------------

def load_model_and_tokenizer(ckpt_path: Path | None, device: torch.device):
    """Build model + tokenizer. If ckpt_path is None, random weights are used.

    Returns (model, tokenizer, meta) where meta is a dict with 'ckpt',
    'step', 'val_bpb' etc. for /info display.
    """
    from hydra.config import PostSemClawConfig
    from hydra.model import PostSemClawModel
    from prepare import Tokenizer

    tokenizer = Tokenizer.from_directory()
    vocab_size = tokenizer.get_vocab_size()
    print(f"[chat] Tokenizer loaded (vocab={vocab_size:,})")

    meta: dict = {"ckpt": str(ckpt_path) if ckpt_path else "<random>", "step": None, "val_bpb": None}

    # Build config. If checkpoint provides one, use it; else use env-var defaults.
    ckpt_state = None
    config_kwargs: dict = {}
    if ckpt_path is not None:
        print(f"[chat] Loading checkpoint: {ckpt_path}")
        ckpt_state = torch.load(ckpt_path, map_location=device, weights_only=False)
        cfg_dict = ckpt_state.get("config")
        if isinstance(cfg_dict, dict):
            # Filter to kwargs PostSemClawConfig actually accepts.
            allowed = set(PostSemClawConfig.__dataclass_fields__.keys())
            config_kwargs = {k: v for k, v in cfg_dict.items() if k in allowed}
        meta["step"] = ckpt_state.get("step")
        meta["val_bpb"] = ckpt_state.get("val_bpb") or ckpt_state.get("bpb")

    # Env-var defaults are applied by PostSemClawConfig field defaults; but the
    # training run builds the config explicitly from hydra.config module-level
    # constants. We mirror that here so the random-weights path aligns with
    # what train.py would instantiate for the same env.
    if not config_kwargs:
        from hydra.config import (  # noqa: E402
            D_MODEL, D_STATE, ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX,
            ENGRAM_N_COLUMNS, EXPAND, HEADDIM, N_HEADS, N_LAYER,
        )
        from prepare import MAX_SEQ_LEN  # noqa: E402
        config_kwargs = dict(
            sequence_len=MAX_SEQ_LEN,
            vocab_size=vocab_size,
            n_layer=N_LAYER,
            d_model=D_MODEL,
            d_state=D_STATE,
            headdim=HEADDIM,
            n_heads=N_HEADS,
            expand=EXPAND,
            engram_n_columns=ENGRAM_N_COLUMNS,
            engram_key_dim=ENGRAM_KEY_DIM,
            engram_layer_idx=ENGRAM_LAYER_IDX,
        )

    # Build model on meta device then materialize — matches training.py path.
    with torch.device("meta"):
        model = PostSemClawModel(PostSemClawConfig(**config_kwargs))
    model.to_empty(device=device)
    model.init_weights()

    if ckpt_state is not None and "model_state_dict" in ckpt_state:
        # strict=False: the model has non-parameter buffers (SDR retina loaded
        # from npz, HTM Rust-side state, engram EMA stats) that may not be in
        # the state_dict. missing/unexpected-key warnings are expected and OK.
        missing, unexpected = model.load_state_dict(
            ckpt_state["model_state_dict"], strict=False
        )
        if missing:
            print(f"[chat] Note: {len(missing)} missing key(s) in state_dict (expected for HTM/SDR buffers).")
        if unexpected:
            print(f"[chat] Note: {len(unexpected)} unexpected key(s) in state_dict.")
    elif ckpt_path is None:
        print("[chat] [WARN] NO CHECKPOINT — using random weights. Output will be gibberish.", file=sys.stderr)

    model.eval()
    return model, tokenizer, meta


# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------

def generate_stream(
    model,
    tokenizer,
    prompt_ids: list[int],
    *,
    max_new_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
    repetition_penalty: float,
    stop_strings: tuple[str, ...],
    max_seq_len: int,
    device: torch.device,
    rep_window: int = 64,
):
    """Yield decoded-text chunks as tokens are generated.

    Truncates `prompt_ids` to the last `max_seq_len` tokens if needed. Stops
    early when any `stop_strings` substring appears in the newly-decoded
    continuation.
    """
    from scripts.sample_utils import sample_token

    # Truncate prompt to window.
    if len(prompt_ids) > max_seq_len:
        prompt_ids = prompt_ids[-max_seq_len:]

    ctx = torch.tensor([prompt_ids], device=device, dtype=torch.long)
    generated: list[int] = []
    # Track already-streamed byte length so we can detect when the decoded
    # string has grown (BPE tokens may decode to multi-char strings mid-merge).
    streamed_chars = 0
    accumulated_text = ""

    autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)

    for _ in range(max_new_tokens):
        with torch.no_grad(), autocast_ctx:
            out = model(ctx, targets=None)
        # out shape: (1, T, vocab) or (1, vocab) depending on path.
        if out.dim() == 3:
            last_logits = out[0, -1, :]
        else:
            last_logits = out[0]

        recent = generated[-rep_window:] if generated else None
        next_id = sample_token(
            last_logits,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            recent_tokens=recent,
        )
        generated.append(next_id)

        # Decode everything so-far then diff — BPE decoding is not token-local,
        # so a per-token decode can drop bytes.
        new_text = tokenizer.decode(generated)
        delta = new_text[streamed_chars:]
        if delta:
            streamed_chars = len(new_text)
            accumulated_text = new_text
            yield delta

        # Stop-string check.
        hit_stop = any(s and s in accumulated_text for s in stop_strings)
        if hit_stop:
            break

        # Advance context. If we've filled the window, drop oldest token.
        ctx = torch.cat([ctx, torch.tensor([[next_id]], device=device, dtype=torch.long)], dim=1)
        if ctx.size(1) > max_seq_len:
            ctx = ctx[:, -max_seq_len:]

    # Final accumulated text is also returned for history tracking.
    return accumulated_text  # noqa: B901 (generator return for history)


def _consume_stream_with_print(stream_gen):
    """Iterate a generator, print each chunk, return the full text.

    Replacement for a naïve list(stream) since `generate_stream` is a generator
    that yields then returns the final text.
    """
    collected = []
    try:
        while True:
            chunk = next(stream_gen)
            collected.append(chunk)
            sys.stdout.write(chunk)
            sys.stdout.flush()
    except StopIteration as stop:
        # stop.value holds the return value of the generator.
        final = stop.value
        if final is not None:
            return final
    return "".join(collected)


# ---------------------------------------------------------------------------
# REPL
# ---------------------------------------------------------------------------

def build_prompt(system: str, history: list[tuple[str, str]], user_msg: str) -> str:
    """Assemble the text prompt fed to the tokenizer."""
    parts: list[str] = []
    if system:
        parts.append(system.rstrip() + "\n")
    for u, a in history:
        parts.append(f"{USER_TAG} {u}\n{ASSISTANT_TAG} {a}\n")
    parts.append(f"{USER_TAG} {user_msg}\n{ASSISTANT_TAG}")
    return "".join(parts)


def run_repl(
    model,
    tokenizer,
    meta: dict,
    *,
    device: torch.device,
    max_seq_len: int,
) -> None:
    settings = {
        "temperature": float(os.environ.get("HYDRA_CHAT_TEMP", "0.8")),
        "top_k": int(os.environ.get("HYDRA_CHAT_TOPK", "40")),
        "top_p": float(os.environ.get("HYDRA_CHAT_TOPP", "0.9")),
        "max_new_tokens": int(os.environ.get("HYDRA_CHAT_MAX", "200")),
        "repetition_penalty": float(os.environ.get("HYDRA_CHAT_REP", "1.1")),
        "system": os.environ.get("HYDRA_CHAT_SYSTEM", ""),
    }
    history: list[tuple[str, str]] = []

    print()
    print("=" * 60)
    print("HYDRA chat REPL")
    print(f"  checkpoint: {meta['ckpt']}")
    if meta.get("step") is not None:
        print(f"  step: {meta['step']}")
    if meta.get("val_bpb") is not None:
        print(f"  val_bpb: {meta['val_bpb']}")
    print("  type /info for settings, /quit to exit")
    print("=" * 60)
    print()

    while True:
        try:
            line = input(f"{USER_TAG} ")
        except (EOFError, KeyboardInterrupt):
            print()
            return

        line = line.rstrip()
        if not line:
            continue

        if line.startswith("/"):
            cmd, *rest = line.split(maxsplit=1)
            arg = rest[0] if rest else ""
            if cmd == "/quit" or cmd == "/exit":
                return
            elif cmd == "/reset":
                history = []
                print("[reset]")
                continue
            elif cmd == "/info":
                print(f"[info] ckpt={meta['ckpt']} settings={settings} history_turns={len(history)}")
                continue
            elif cmd == "/temp":
                try:
                    settings["temperature"] = float(arg)
                    print(f"[temp={settings['temperature']}]")
                except ValueError:
                    print(f"[err] /temp needs a float, got {arg!r}")
                continue
            elif cmd == "/topk":
                try:
                    settings["top_k"] = int(arg)
                    print(f"[topk={settings['top_k']}]")
                except ValueError:
                    print(f"[err] /topk needs an int, got {arg!r}")
                continue
            elif cmd == "/topp":
                try:
                    settings["top_p"] = float(arg)
                    print(f"[topp={settings['top_p']}]")
                except ValueError:
                    print(f"[err] /topp needs a float, got {arg!r}")
                continue
            elif cmd == "/max":
                try:
                    settings["max_new_tokens"] = int(arg)
                    print(f"[max={settings['max_new_tokens']}]")
                except ValueError:
                    print(f"[err] /max needs an int, got {arg!r}")
                continue
            elif cmd == "/rep":
                try:
                    settings["repetition_penalty"] = float(arg)
                    print(f"[rep={settings['repetition_penalty']}]")
                except ValueError:
                    print(f"[err] /rep needs a float, got {arg!r}")
                continue
            elif cmd == "/sys":
                settings["system"] = arg
                print(f"[sys set, {len(arg)} chars]")
                continue
            else:
                print(f"[err] unknown command {cmd!r}. Try /info /reset /quit.")
                continue

        # Normal chat turn.
        prompt_text = build_prompt(settings["system"], history, line)
        prompt_ids = tokenizer.encode(prompt_text)

        sys.stdout.write(f"{ASSISTANT_TAG} ")
        sys.stdout.flush()

        stream = generate_stream(
            model, tokenizer, prompt_ids,
            max_new_tokens=settings["max_new_tokens"],
            temperature=settings["temperature"],
            top_k=settings["top_k"],
            top_p=settings["top_p"],
            repetition_penalty=settings["repetition_penalty"],
            stop_strings=(END_TAG,),
            max_seq_len=max_seq_len,
            device=device,
        )
        response_text = _consume_stream_with_print(stream)
        if not response_text.endswith("\n"):
            sys.stdout.write("\n")
            sys.stdout.flush()

        # Strip trailing stop marker from the remembered history.
        clean = response_text
        if END_TAG in clean:
            clean = clean.split(END_TAG, 1)[0]
        clean = clean.strip()
        history.append((line, clean))


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    p = argparse.ArgumentParser(description="HYDRA chat REPL")
    p.add_argument("--ckpt", type=str, default=None,
                   help="Path to checkpoint (.pt). If omitted, auto-select.")
    p.add_argument("--sft", action="store_true",
                   help="Prefer an SFT checkpoint if available.")
    p.add_argument("--random", action="store_true",
                   help="Skip checkpoint load; use random weights.")
    p.add_argument("--device", type=str, default=None,
                   help="Torch device (default: cuda if available else cpu).")
    return p.parse_args(argv)


def main(argv: list[str] | None = None) -> int:
    args = _parse_args(argv)

    if args.device:
        device = torch.device(args.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
        print("[chat] [WARN] CUDA not available; HYDRA's HTM/Mamba kernels may fail on CPU.", file=sys.stderr)

    ckpt_path: Path | None
    if args.random:
        ckpt_path = None
    else:
        ckpt_path = resolve_checkpoint(args.ckpt, args.sft)

    t0 = time.time()
    model, tokenizer, meta = load_model_and_tokenizer(ckpt_path, device)
    dt = time.time() - t0
    print(f"[chat] Model ready in {dt:.1f}s on {device}")

    from prepare import MAX_SEQ_LEN
    run_repl(model, tokenizer, meta, device=device, max_seq_len=MAX_SEQ_LEN)
    return 0


if __name__ == "__main__":
    sys.exit(main())
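
One Python subtlety in the REPL above is worth spelling out: `generate_stream` both yields chunks and `return`s the final accumulated text, and a `return` inside a generator is delivered as `StopIteration.value` (PEP 380). `_consume_stream_with_print` therefore drives the generator with `next()` by hand instead of a `for` loop, which would silently discard that return value. A self-contained sketch of the same pattern (names here are illustrative, not from the repo):

    def stream_then_total():
        total = ""
        for chunk in ("Hel", "lo"):
            total += chunk
            yield chunk
        return total  # surfaces as StopIteration.value

    gen = stream_then_total()
    chunks = []
    try:
        while True:
            chunks.append(next(gen))
    except StopIteration as stop:
        final = stop.value  # "Hello"
    assert final == "".join(chunks)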
overlay/scripts/chat_eval.py CHANGED
@@ -1,300 +1,300 @@
1
- """Non-interactive chat eval for HYDRA.
2
-
3
- Runs a fixed set of prompts through the same chat template that `chat.py`
4
- uses, prints a markdown table with the response and coherence heuristics.
5
-
6
- Usage:
7
- python scripts/chat_eval.py # auto-select checkpoint
8
- python scripts/chat_eval.py --ckpt PATH
9
- python scripts/chat_eval.py --random
10
- python scripts/chat_eval.py --json out.json # also dump raw results
11
- python scripts/chat_eval.py --max 80 # cap new tokens per prompt
12
- """
13
-
14
- from __future__ import annotations
15
-
16
- import argparse
17
- import json
18
- import os
19
- import re
20
- import sys
21
- import time
22
- from pathlib import Path
23
-
24
- _REPO_ROOT = Path(__file__).resolve().parent.parent
25
- if str(_REPO_ROOT) not in sys.path:
26
- sys.path.insert(0, str(_REPO_ROOT))
27
-
28
- import torch # noqa: E402
29
-
30
- from scripts.chat import ( # noqa: E402
31
- ASSISTANT_TAG, END_TAG, USER_TAG, build_prompt,
32
- generate_stream, load_model_and_tokenizer, resolve_checkpoint,
33
- )
34
-
35
-
36
- PROMPTS: list[str] = [
37
- # Factual
38
- "What is the capital of France?",
39
- "Who wrote Romeo and Juliet?",
40
- "What is 2 plus 2?",
41
- "What color is the sky on a clear day?",
42
- # Completion
43
- "Once upon a time",
44
- "The cat sat on the",
45
- "In a hole in the ground there lived",
46
- # Instruction
47
- "Write one short sentence about rain.",
48
- "List three animals.",
49
- "Define the word 'library'.",
50
- # Conversational
51
- "Hello, how are you?",
52
- "Tell me a joke.",
53
- # Creative
54
- "Describe a sunset in one line.",
55
- "Give me a name for a pet robot.",
56
- "What is the meaning of friendship?",
57
- ]
58
-
59
- # Heuristic thresholds (printed, not enforced as pass/fail).
60
- THRESH_DISTINCT_2 = 0.30
61
- THRESH_SENT_MIN = 5
62
- THRESH_SENT_MAX = 30
63
- THRESH_EN_RATIO = 0.95
64
-
65
-
66
- # ---------------------------------------------------------------------------
67
- # Coherence heuristics
68
- # ---------------------------------------------------------------------------
69
-
70
- def _tokens(text: str) -> list[str]:
71
- return re.findall(r"[A-Za-z0-9']+", text)
72
-
73
-
74
- def distinct_2(text: str) -> float:
75
- toks = _tokens(text)
76
- if len(toks) < 2:
77
- return 0.0
78
- bigrams = [(toks[i], toks[i + 1]) for i in range(len(toks) - 1)]
79
- return len(set(bigrams)) / max(1, len(bigrams))
80
-
81
-
82
- def avg_sentence_len(text: str) -> float:
83
- sents = re.split(r"[.!?]+", text)
84
- lens = [len(_tokens(s)) for s in sents if _tokens(s)]
85
- if not lens:
86
- return 0.0
87
- return sum(lens) / len(lens)
88
-
89
-
90
- def english_char_ratio(text: str) -> float:
91
- if not text:
92
- return 0.0
93
- allowed = 0
94
- for c in text:
95
- if c.isalnum() or c.isspace() or c in ".,!?;:'\"-()[]{}/\\*#@&%+=_<>|$":
96
- allowed += 1
97
- return allowed / len(text)
98
-
99
-
100
- # ---------------------------------------------------------------------------
101
- # Runner
102
- # ---------------------------------------------------------------------------
103
-
104
- def _run_one(model, tokenizer, prompt: str, *, max_new_tokens: int, device: torch.device,
105
- max_seq_len: int, temperature: float, top_k: int, top_p: float,
106
- repetition_penalty: float) -> str:
107
- prompt_text = build_prompt(system="", history=[], user_msg=prompt)
108
- prompt_ids = tokenizer.encode(prompt_text)
109
-
110
- stream = generate_stream(
111
- model, tokenizer, prompt_ids,
112
- max_new_tokens=max_new_tokens,
113
- temperature=temperature,
114
- top_k=top_k,
115
- top_p=top_p,
116
- repetition_penalty=repetition_penalty,
117
- stop_strings=(END_TAG,),
118
- max_seq_len=max_seq_len,
119
- device=device,
120
- )
121
- collected: list[str] = []
122
- try:
123
- while True:
124
- collected.append(next(stream))
125
- except StopIteration as stop:
126
- if stop.value is not None:
127
- text = stop.value
128
- else:
129
- text = "".join(collected)
130
-
131
- if END_TAG in text:
132
- text = text.split(END_TAG, 1)[0]
133
- return text.strip()
134
-
135
-
136
- def _render_markdown(rows: list[dict]) -> str:
137
- lines = [
138
- "| # | Prompt | Response | dist-2 | sent_len | en_ratio | flags |",
139
- "|---|--------|----------|--------|----------|----------|-------|",
140
- ]
141
-
142
- def _cell(s: str, n: int = 60) -> str:
143
- s = s.replace("|", "\\|").replace("\n", " ")
144
- if len(s) > n:
145
- s = s[: n - 1] + "…"
146
- return s
147
-
148
- for i, r in enumerate(rows, 1):
149
- flags = []
150
- if r["distinct_2"] < THRESH_DISTINCT_2:
151
- flags.append("repetitive")
152
- if not (THRESH_SENT_MIN <= r["avg_sentence_len"] <= THRESH_SENT_MAX):
153
- flags.append("sent_len")
154
- if r["en_ratio"] < THRESH_EN_RATIO:
155
- flags.append("non_en")
156
- flag_str = ",".join(flags) or "ok"
157
- lines.append(
158
- f"| {i} | {_cell(r['prompt'], 40)} | {_cell(r['response'], 60)} | "
159
- f"{r['distinct_2']:.2f} | {r['avg_sentence_len']:.1f} | "
160
- f"{r['en_ratio']:.2f} | {flag_str} |"
161
- )
162
- return "\n".join(lines)
163
-
164
-
165
- # ---------------------------------------------------------------------------
166
- # CLI
167
- # ---------------------------------------------------------------------------
168
-
169
- def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
170
- p = argparse.ArgumentParser(description="HYDRA chat eval")
171
- p.add_argument("--ckpt", type=str, default=None, help="Checkpoint path.")
172
- p.add_argument("--sft", action="store_true", help="Prefer SFT checkpoint.")
173
- p.add_argument("--random", action="store_true", help="Use random weights.")
174
- p.add_argument("--max", dest="max_new_tokens", type=int, default=80)
175
- p.add_argument("--temp", dest="temperature", type=float, default=0.8)
176
- p.add_argument("--topk", dest="top_k", type=int, default=40)
177
- p.add_argument("--topp", dest="top_p", type=float, default=0.9)
178
- p.add_argument("--rep", dest="repetition_penalty", type=float, default=1.1)
179
- p.add_argument("--json", dest="json_out", type=str, default=None,
180
- help="Optional: dump raw results to this JSON path.")
181
- p.add_argument("--device", type=str, default=None)
182
- return p.parse_args(argv)
183
-
184
-
185
- def main(argv: list[str] | None = None) -> int:
186
- args = _parse_args(argv)
187
-
188
- if args.device:
189
- device = torch.device(args.device)
190
- elif torch.cuda.is_available():
191
- device = torch.device("cuda")
192
- else:
193
- device = torch.device("cpu")
194
-
195
- ckpt_path = None if args.random else resolve_checkpoint(args.ckpt, args.sft)
196
-
197
- t0 = time.time()
198
- model, tokenizer, meta = load_model_and_tokenizer(ckpt_path, device)
199
- dt_load = time.time() - t0
200
- print(f"[chat_eval] Loaded in {dt_load:.1f}s ckpt={meta['ckpt']}")
201
-
202
- from prepare import MAX_SEQ_LEN
203
-
204
- rows: list[dict] = []
205
- t_gen = time.time()
206
- for i, prompt in enumerate(PROMPTS, 1):
207
- t_start = time.time()
208
- try:
209
- resp = _run_one(
210
- model, tokenizer, prompt,
211
- max_new_tokens=args.max_new_tokens,
212
- device=device,
213
- max_seq_len=MAX_SEQ_LEN,
214
- temperature=args.temperature,
215
- top_k=args.top_k,
216
- top_p=args.top_p,
217
- repetition_penalty=args.repetition_penalty,
218
- )
219
- err = None
220
- except Exception as e: # noqa: BLE001 β€” eval must not abort mid-prompt.
221
- resp = ""
222
- err = repr(e)
223
- print(f"[chat_eval] prompt {i} failed: {err}", file=sys.stderr)
224
-
225
- rows.append({
226
- "prompt": prompt,
227
- "response": resp,
228
- "distinct_2": distinct_2(resp),
229
- "avg_sentence_len": avg_sentence_len(resp),
230
- "en_ratio": english_char_ratio(resp),
231
- "latency_s": round(time.time() - t_start, 2),
232
- "error": err,
233
- })
234
- print(f"[chat_eval] {i:2d}/{len(PROMPTS)} {rows[-1]['latency_s']:.1f}s {resp!r}")
235
-
236
- dt_gen = time.time() - t_gen
237
-
238
- print()
239
- print("## HYDRA chat_eval results")
240
- print(f"- checkpoint: `{meta['ckpt']}`")
241
- if meta.get("step") is not None:
242
- print(f"- step: {meta['step']}")
243
- if meta.get("val_bpb") is not None:
244
- print(f"- val_bpb: {meta['val_bpb']}")
245
- print(f"- prompts: {len(PROMPTS)}")
246
- print(f"- load: {dt_load:.1f}s generation: {dt_gen:.1f}s")
247
- print()
248
- print(_render_markdown(rows))
249
- print()
250
-
251
- # Summary heuristics
252
- any_empty = sum(1 for r in rows if not r["response"])
253
- any_error = sum(1 for r in rows if r["error"])
254
- mean_d2 = sum(r["distinct_2"] for r in rows) / max(1, len(rows))
255
- mean_en = sum(r["en_ratio"] for r in rows) / max(1, len(rows))
256
-
257
- print("### Aggregates")
258
- print(f"- empty responses: {any_empty}/{len(rows)}")
259
- print(f"- generation errors: {any_error}/{len(rows)}")
260
- print(f"- mean distinct-2: {mean_d2:.3f} (target > {THRESH_DISTINCT_2})")
261
- print(f"- mean en_ratio: {mean_en:.3f} (target > {THRESH_EN_RATIO})")
262
- print()
263
- print("_Quality at this model scale (~7.5M params) is NOT expected to meet thresholds; "
264
- "this eval verifies the chat interface, not dialogue coherence._")
265
-
266
- if args.json_out:
267
- out = {
268
- "meta": meta,
269
- "settings": {
270
- "max_new_tokens": args.max_new_tokens,
271
- "temperature": args.temperature,
272
- "top_k": args.top_k,
273
- "top_p": args.top_p,
274
- "repetition_penalty": args.repetition_penalty,
275
- },
276
- "rows": rows,
277
- "aggregates": {
278
- "empty": any_empty,
279
- "errors": any_error,
280
- "mean_distinct_2": mean_d2,
281
- "mean_en_ratio": mean_en,
282
- "load_s": dt_load,
283
- "gen_s": dt_gen,
284
- },
285
- }
286
- Path(args.json_out).write_text(json.dumps(out, indent=2))
287
- print(f"[chat_eval] JSON written to {args.json_out}")
288
-
289
- # Exit 0 if we loaded and generated *something* for each prompt (even if
290
- # quality was poor). Exit 1 only on load failure (caught by main's exception
291
- # propagation) or if ALL prompts returned empty strings β€” that signals a
292
- # broken generation loop, not poor quality.
293
- if any_empty == len(rows):
294
- print("[chat_eval] ALL prompts returned empty β€” generation loop is broken.", file=sys.stderr)
295
- return 1
296
- return 0
297
-
298
-
299
- if __name__ == "__main__":
300
- sys.exit(main())
 
1
+ """Non-interactive chat eval for HYDRA.
2
+
3
+ Runs a fixed set of prompts through the same chat template that `chat.py`
4
+ uses, prints a markdown table with the response and coherence heuristics.
5
+
6
+ Usage:
7
+ python scripts/chat_eval.py # auto-select checkpoint
8
+ python scripts/chat_eval.py --ckpt PATH
9
+ python scripts/chat_eval.py --random
10
+ python scripts/chat_eval.py --json out.json # also dump raw results
11
+ python scripts/chat_eval.py --max 80 # cap new tokens per prompt
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import re
20
+ import sys
21
+ import time
22
+ from pathlib import Path
23
+
24
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
25
+ if str(_REPO_ROOT) not in sys.path:
26
+ sys.path.insert(0, str(_REPO_ROOT))
27
+
28
+ import torch # noqa: E402
29
+
30
+ from scripts.chat import ( # noqa: E402
31
+ ASSISTANT_TAG, END_TAG, USER_TAG, build_prompt,
32
+ generate_stream, load_model_and_tokenizer, resolve_checkpoint,
33
+ )
34
+
35
+
36
+ PROMPTS: list[str] = [
37
+ # Factual
38
+ "What is the capital of France?",
39
+ "Who wrote Romeo and Juliet?",
40
+ "What is 2 plus 2?",
41
+ "What color is the sky on a clear day?",
42
+ # Completion
43
+ "Once upon a time",
44
+ "The cat sat on the",
45
+ "In a hole in the ground there lived",
46
+ # Instruction
47
+ "Write one short sentence about rain.",
48
+ "List three animals.",
49
+ "Define the word 'library'.",
50
+ # Conversational
51
+ "Hello, how are you?",
52
+ "Tell me a joke.",
53
+ # Creative
54
+ "Describe a sunset in one line.",
55
+ "Give me a name for a pet robot.",
56
+ "What is the meaning of friendship?",
57
+ ]
58
+
59
+ # Heuristic thresholds (printed, not enforced as pass/fail).
60
+ THRESH_DISTINCT_2 = 0.30
61
+ THRESH_SENT_MIN = 5
62
+ THRESH_SENT_MAX = 30
63
+ THRESH_EN_RATIO = 0.95
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Coherence heuristics
68
+ # ---------------------------------------------------------------------------
69
+
70
+ def _tokens(text: str) -> list[str]:
71
+ return re.findall(r"[A-Za-z0-9']+", text)
72
+
73
+
74
+ def distinct_2(text: str) -> float:
75
+ toks = _tokens(text)
76
+ if len(toks) < 2:
77
+ return 0.0
78
+ bigrams = [(toks[i], toks[i + 1]) for i in range(len(toks) - 1)]
79
+ return len(set(bigrams)) / max(1, len(bigrams))
80
+
81
+
82
+ def avg_sentence_len(text: str) -> float:
83
+ sents = re.split(r"[.!?]+", text)
84
+ lens = [len(_tokens(s)) for s in sents if _tokens(s)]
85
+ if not lens:
86
+ return 0.0
87
+ return sum(lens) / len(lens)
88
+
89
+
90
+ def english_char_ratio(text: str) -> float:
91
+ if not text:
92
+ return 0.0
93
+ allowed = 0
94
+ for c in text:
95
+ if c.isalnum() or c.isspace() or c in ".,!?;:'\"-()[]{}/\\*#@&%+=_<>|$":
96
+ allowed += 1
97
+ return allowed / len(text)
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Runner
102
+ # ---------------------------------------------------------------------------
103
+
104
+ def _run_one(model, tokenizer, prompt: str, *, max_new_tokens: int, device: torch.device,
105
+ max_seq_len: int, temperature: float, top_k: int, top_p: float,
106
+ repetition_penalty: float) -> str:
107
+ prompt_text = build_prompt(system="", history=[], user_msg=prompt)
108
+ prompt_ids = tokenizer.encode(prompt_text)
109
+
110
+ stream = generate_stream(
111
+ model, tokenizer, prompt_ids,
112
+ max_new_tokens=max_new_tokens,
113
+ temperature=temperature,
114
+ top_k=top_k,
115
+ top_p=top_p,
116
+ repetition_penalty=repetition_penalty,
117
+ stop_strings=(END_TAG,),
118
+ max_seq_len=max_seq_len,
119
+ device=device,
120
+ )
121
+ collected: list[str] = []
122
+ try:
123
+ while True:
124
+ collected.append(next(stream))
125
+ except StopIteration as stop:
126
+ if stop.value is not None:
127
+ text = stop.value
128
+ else:
129
+ text = "".join(collected)
130
+
131
+ if END_TAG in text:
132
+ text = text.split(END_TAG, 1)[0]
133
+ return text.strip()
134
+
135
+
136
+ def _render_markdown(rows: list[dict]) -> str:
137
+ lines = [
138
+ "| # | Prompt | Response | dist-2 | sent_len | en_ratio | flags |",
139
+ "|---|--------|----------|--------|----------|----------|-------|",
140
+ ]
141
+
142
+ def _cell(s: str, n: int = 60) -> str:
143
+ s = s.replace("|", "\\|").replace("\n", " ")
144
+ if len(s) > n:
145
+ s = s[: n - 1] + "…"
146
+ return s
147
+
148
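+ # Flag rows that trip a heuristic threshold; "ok" means none tripped.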
+ for i, r in enumerate(rows, 1):
149
+ flags = []
150
+ if r["distinct_2"] < THRESH_DISTINCT_2:
151
+ flags.append("repetitive")
152
+ if not (THRESH_SENT_MIN <= r["avg_sentence_len"] <= THRESH_SENT_MAX):
153
+ flags.append("sent_len")
154
+ if r["en_ratio"] < THRESH_EN_RATIO:
155
+ flags.append("non_en")
156
+ flag_str = ",".join(flags) or "ok"
157
+ lines.append(
158
+ f"| {i} | {_cell(r['prompt'], 40)} | {_cell(r['response'], 60)} | "
159
+ f"{r['distinct_2']:.2f} | {r['avg_sentence_len']:.1f} | "
160
+ f"{r['en_ratio']:.2f} | {flag_str} |"
161
+ )
162
+ return "\n".join(lines)
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # CLI
167
+ # ---------------------------------------------------------------------------
168
+
169
+ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
170
+ p = argparse.ArgumentParser(description="HYDRA chat eval")
171
+ p.add_argument("--ckpt", type=str, default=None, help="Checkpoint path.")
172
+ p.add_argument("--sft", action="store_true", help="Prefer SFT checkpoint.")
173
+ p.add_argument("--random", action="store_true", help="Use random weights.")
174
+ p.add_argument("--max", dest="max_new_tokens", type=int, default=80)
175
+ p.add_argument("--temp", dest="temperature", type=float, default=0.8)
176
+ p.add_argument("--topk", dest="top_k", type=int, default=40)
177
+ p.add_argument("--topp", dest="top_p", type=float, default=0.9)
178
+ p.add_argument("--rep", dest="repetition_penalty", type=float, default=1.1)
179
+ p.add_argument("--json", dest="json_out", type=str, default=None,
180
+ help="Optional: dump raw results to this JSON path.")
181
+ p.add_argument("--device", type=str, default=None)
182
+ return p.parse_args(argv)
183
+
184
+
185
+ def main(argv: list[str] | None = None) -> int:
186
+ args = _parse_args(argv)
187
+
188
+ if args.device:
189
+ device = torch.device(args.device)
190
+ elif torch.cuda.is_available():
191
+ device = torch.device("cuda")
192
+ else:
193
+ device = torch.device("cpu")
194
+
195
+ ckpt_path = None if args.random else resolve_checkpoint(args.ckpt, args.sft)
196
+
197
+ t0 = time.time()
198
+ model, tokenizer, meta = load_model_and_tokenizer(ckpt_path, device)
199
+ dt_load = time.time() - t0
200
+ print(f"[chat_eval] Loaded in {dt_load:.1f}s ckpt={meta['ckpt']}")
201
+
202
+ from prepare import MAX_SEQ_LEN
203
+
204
+ rows: list[dict] = []
205
+ t_gen = time.time()
206
+ for i, prompt in enumerate(PROMPTS, 1):
207
+ t_start = time.time()
208
+ try:
209
+ resp = _run_one(
210
+ model, tokenizer, prompt,
211
+ max_new_tokens=args.max_new_tokens,
212
+ device=device,
213
+ max_seq_len=MAX_SEQ_LEN,
214
+ temperature=args.temperature,
215
+ top_k=args.top_k,
216
+ top_p=args.top_p,
217
+ repetition_penalty=args.repetition_penalty,
218
+ )
219
+ err = None
220
+ except Exception as e: # noqa: BLE001 - eval must not abort mid-prompt.
221
+ resp = ""
222
+ err = repr(e)
223
+ print(f"[chat_eval] prompt {i} failed: {err}", file=sys.stderr)
224
+
225
+ rows.append({
226
+ "prompt": prompt,
227
+ "response": resp,
228
+ "distinct_2": distinct_2(resp),
229
+ "avg_sentence_len": avg_sentence_len(resp),
230
+ "en_ratio": english_char_ratio(resp),
231
+ "latency_s": round(time.time() - t_start, 2),
232
+ "error": err,
233
+ })
234
+ print(f"[chat_eval] {i:2d}/{len(PROMPTS)} {rows[-1]['latency_s']:.1f}s {resp!r}")
235
+
236
+ dt_gen = time.time() - t_gen
237
+
238
+ print()
239
+ print("## HYDRA chat_eval results")
240
+ print(f"- checkpoint: `{meta['ckpt']}`")
241
+ if meta.get("step") is not None:
242
+ print(f"- step: {meta['step']}")
243
+ if meta.get("val_bpb") is not None:
244
+ print(f"- val_bpb: {meta['val_bpb']}")
245
+ print(f"- prompts: {len(PROMPTS)}")
246
+ print(f"- load: {dt_load:.1f}s generation: {dt_gen:.1f}s")
247
+ print()
248
+ print(_render_markdown(rows))
249
+ print()
250
+
251
+ # Summary heuristics
252
+ any_empty = sum(1 for r in rows if not r["response"])
253
+ any_error = sum(1 for r in rows if r["error"])
254
+ mean_d2 = sum(r["distinct_2"] for r in rows) / max(1, len(rows))
255
+ mean_en = sum(r["en_ratio"] for r in rows) / max(1, len(rows))
256
+
257
+ print("### Aggregates")
258
+ print(f"- empty responses: {any_empty}/{len(rows)}")
259
+ print(f"- generation errors: {any_error}/{len(rows)}")
260
+ print(f"- mean distinct-2: {mean_d2:.3f} (target > {THRESH_DISTINCT_2})")
261
+ print(f"- mean en_ratio: {mean_en:.3f} (target > {THRESH_EN_RATIO})")
262
+ print()
263
+ print("_Quality at this model scale (~7.5M params) is NOT expected to meet thresholds; "
264
+ "this eval verifies the chat interface, not dialogue coherence._")
265
+
266
+ if args.json_out:
267
+ out = {
268
+ "meta": meta,
269
+ "settings": {
270
+ "max_new_tokens": args.max_new_tokens,
271
+ "temperature": args.temperature,
272
+ "top_k": args.top_k,
273
+ "top_p": args.top_p,
274
+ "repetition_penalty": args.repetition_penalty,
275
+ },
276
+ "rows": rows,
277
+ "aggregates": {
278
+ "empty": any_empty,
279
+ "errors": any_error,
280
+ "mean_distinct_2": mean_d2,
281
+ "mean_en_ratio": mean_en,
282
+ "load_s": dt_load,
283
+ "gen_s": dt_gen,
284
+ },
285
+ }
286
+ Path(args.json_out).write_text(json.dumps(out, indent=2))
287
+ print(f"[chat_eval] JSON written to {args.json_out}")
288
+
289
+ # Exit 0 if we loaded and generated *something* for each prompt (even if
290
+ # quality was poor). Exit nonzero only on load failure (the exception simply
291
+ # propagates out of main) or if ALL prompts returned empty strings - that
292
+ # signals a broken generation loop, not poor quality.
293
+ if any_empty == len(rows):
294
+ print("[chat_eval] ALL prompts returned empty β€” generation loop is broken.", file=sys.stderr)
295
+ return 1
296
+ return 0
297
+
298
+
299
+ if __name__ == "__main__":
300
+ sys.exit(main())
overlay/scripts/compile_debug.py CHANGED
@@ -1,213 +1,213 @@
1
- """Diagnostic script for torch.compile deadlock after ~500 steps.
2
-
3
- F17 investigation: validates that the _compiled_core / forward split
4
- fixes the deadlock by running forward+backward loops with compile on.
5
-
6
- Usage:
7
- LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
8
- HYDRA_TIME_BUDGET=30 HYDRA_BATCH_SIZE=8 HYDRA_TOTAL_BATCH=16384 \
9
- HYDRA_HTM_LEARN_EVERY=4 HYDRA_HESTIA_INTERVAL=9999 \
10
- .venv/bin/python -u scripts/compile_debug.py [mode]
11
-
12
- Modes:
13
- eager - no compile (baseline)
14
- model_only - compile model _compiled_core only
15
- muon_only - compile muon step only
16
- both - compile both (default)
17
- """
18
-
19
- from __future__ import annotations
20
-
21
- import gc
22
- import os
23
- import signal
24
- import sys
25
- import threading
26
- import time
27
-
28
- # Set CUDA env before torch import
29
- os.environ.setdefault("CUDA_HOME", "/usr/local/cuda")
30
- os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
31
-
32
- import torch
33
- import torch.nn as nn
34
- import torch.nn.functional as F
35
-
36
- # -------------------------------------------------------------------------
37
- # Config
38
- # -------------------------------------------------------------------------
39
- MAX_STEPS = 800
40
- WATCHDOG_TIMEOUT_S = 20 # kill if no progress for this many seconds
41
- BATCH_SIZE = int(os.environ.get("HYDRA_BATCH_SIZE", "8"))
42
- SEQ_LEN = 2048
43
- VOCAB_SIZE = 8192
44
-
45
-
46
- # -------------------------------------------------------------------------
47
- # Watchdog thread: kills process if no progress
48
- # -------------------------------------------------------------------------
49
- _last_progress = time.time()
50
- _watchdog_armed = True
51
-
52
- def _watchdog_fn():
53
- global _last_progress, _watchdog_armed
54
- while _watchdog_armed:
55
- time.sleep(1.0)
56
- elapsed = time.time() - _last_progress
57
- if elapsed > WATCHDOG_TIMEOUT_S:
58
- print(f"\n*** WATCHDOG: no progress for {elapsed:.1f}s β€” DEADLOCK DETECTED ***",
59
- flush=True)
60
- _dump_diagnostics()
61
- os.kill(os.getpid(), signal.SIGTERM)
62
- return
63
-
64
- def _dump_diagnostics():
65
- """Dump CUDA/dynamo state at deadlock time."""
66
- try:
67
- stats = torch.cuda.memory_stats()
68
- print(f" alloc_retries: {stats.get('num_alloc_retries', 'N/A')}")
69
- print(f" allocated_bytes: {stats.get('allocated_bytes.all.current', 0) / 1e6:.1f} MB")
70
- print(f" reserved_bytes: {stats.get('reserved_bytes.all.current', 0) / 1e6:.1f} MB")
71
- print(f" num_ooms: {stats.get('num_ooms', 0)}")
72
- except Exception as e:
73
- print(f" (memory_stats failed: {e})")
74
-
75
- try:
76
- import torch._dynamo.utils as du
77
- print(f" dynamo counters: {dict(du.counters)}")
78
- except Exception as e:
79
- print(f" (dynamo counters failed: {e})")
80
-
81
-
82
- def tick():
83
- global _last_progress
84
- _last_progress = time.time()
85
-
86
-
87
- # -------------------------------------------------------------------------
88
- # Test
89
- # -------------------------------------------------------------------------
90
- def run_test(mode: str) -> dict:
91
- """Run forward+backward loop with specified compile config."""
92
- print(f"\n{'='*70}")
93
- print(f"TEST MODE: {mode}")
94
- print(f"{'='*70}", flush=True)
95
-
96
- compile_model = mode in ("model_only", "both")
97
- compile_muon = mode in ("muon_only", "both")
98
-
99
- os.environ["HYDRA_MODEL_COMPILE"] = "1" if compile_model else "0"
100
- os.environ["HYDRA_MUON_COMPILE"] = "1" if compile_muon else "0"
101
- os.environ["HYDRA_ASYNC_POSTPROCESS"] = "0"
102
- os.environ["HYDRA_HESTIA_INTERVAL"] = "9999"
103
- os.environ["HYDRA_HTM_LEARN_EVERY"] = "4"
104
-
105
- # Clear cached modules for fresh env var reads
106
- for mod_name in list(sys.modules.keys()):
107
- if mod_name.startswith("hydra."):
108
- del sys.modules[mod_name]
109
-
110
- torch._dynamo.reset()
111
- torch.cuda.empty_cache()
112
- torch.cuda.reset_peak_memory_stats()
113
- gc.collect()
114
-
115
- from hydra.model import PostSemClawModel
116
- from hydra.config import PostSemClawConfig
117
-
118
- device = torch.device("cuda")
119
- config = PostSemClawConfig(
120
- d_model=256, n_layer=4, d_state=64, headdim=32, expand=2,
121
- vocab_size=VOCAB_SIZE, sequence_len=SEQ_LEN,
122
- )
123
-
124
- with torch.device("meta"):
125
- model = PostSemClawModel(config)
126
- model.to_empty(device=device)
127
- model.init_weights()
128
-
129
- optimizer = model.setup_optimizer()
130
- autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
131
-
132
- result = {"mode": mode, "max_step": 0, "tps_samples": []}
133
- alloc_retries_prev = 0
134
-
135
- tick()
136
-
137
- for step in range(MAX_STEPS):
138
- t0 = time.time()
139
-
140
- x = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN), device=device)
141
- y = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN), device=device)
142
-
143
- with autocast_ctx:
144
- loss = model(x, y)
145
- loss.backward()
146
-
147
- torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
148
- optimizer.step()
149
- model.zero_grad(set_to_none=True)
150
-
151
- torch.cuda.synchronize()
152
- dt = time.time() - t0
153
- tps = int(BATCH_SIZE * SEQ_LEN / dt)
154
-
155
- tick()
156
-
157
- stats = torch.cuda.memory_stats()
158
- retries = stats.get("num_alloc_retries", 0)
159
- retry_delta = retries - alloc_retries_prev
160
- alloc_retries_prev = retries
161
-
162
- result["max_step"] = step
163
-
164
- if step % 50 == 0 or retry_delta > 0 or step < 3:
165
- alloc_mb = stats.get("allocated_bytes.all.current", 0) / 1e6
166
- print(
167
- f" step={step:04d} tps={tps:6d} dt={dt*1000:.0f}ms "
168
- f"alloc={alloc_mb:.0f}MB retries={retries}",
169
- flush=True,
170
- )
171
- result["tps_samples"].append((step, tps))
172
-
173
- result["completed"] = True
174
- print(f"\n COMPLETED: {MAX_STEPS} steps, mode={mode}", flush=True)
175
- return result
176
-
177
-
178
- def main():
179
- print(f"torch: {torch.__version__} CUDA: {torch.version.cuda}")
180
- print(f"GPU: {torch.cuda.get_device_name()}")
181
- print(f"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
182
- print(f"Steps: {MAX_STEPS} Watchdog: {WATCHDOG_TIMEOUT_S}s")
183
-
184
- wd = threading.Thread(target=_watchdog_fn, daemon=True)
185
- wd.start()
186
-
187
- modes = sys.argv[1:] if len(sys.argv) > 1 else ["both"]
188
- results = []
189
-
190
- for mode in modes:
191
- try:
192
- r = run_test(mode)
193
- except SystemExit:
194
- print(f"\n DEADLOCK/KILLED mode={mode}", flush=True)
195
- r = {"mode": mode, "completed": False, "max_step": "?"}
196
- except Exception as e:
197
- print(f"\n ERROR mode={mode}: {e}", flush=True)
198
- r = {"mode": mode, "completed": False, "error": str(e)}
199
- results.append(r)
200
-
201
- print(f"\n{'='*70}")
202
- print("SUMMARY")
203
- print(f"{'='*70}")
204
- for r in results:
205
- status = "PASS" if r.get("completed") else "FAIL"
206
- print(f" {r['mode']:20s}: {status} (step {r.get('max_step', '?')})")
207
-
208
- global _watchdog_armed
209
- _watchdog_armed = False
210
-
211
-
212
- if __name__ == "__main__":
213
- main()
 
1
+ """Diagnostic script for torch.compile deadlock after ~500 steps.
2
+
3
+ F17 investigation: validates that the _compiled_core / forward split
4
+ fixes the deadlock by running forward+backward loops with compile on.
5
+
6
+ Usage:
7
+ LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
8
+ HYDRA_TIME_BUDGET=30 HYDRA_BATCH_SIZE=8 HYDRA_TOTAL_BATCH=16384 \
9
+ HYDRA_HTM_LEARN_EVERY=4 HYDRA_HESTIA_INTERVAL=9999 \
10
+ .venv/bin/python -u scripts/compile_debug.py [mode]
11
+
12
+ Modes:
13
+ eager - no compile (baseline)
14
+ model_only - compile model _compiled_core only
15
+ muon_only - compile muon step only
16
+ both - compile both (default)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import gc
22
+ import os
23
+ import signal
24
+ import sys
25
+ import threading
26
+ import time
27
+
28
+ # Set CUDA env before torch import
29
+ os.environ.setdefault("CUDA_HOME", "/usr/local/cuda")
30
+ os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
31
+
32
+ import torch
33
+ import torch.nn as nn
34
+ import torch.nn.functional as F
35
+
36
+ # -------------------------------------------------------------------------
37
+ # Config
38
+ # -------------------------------------------------------------------------
39
+ MAX_STEPS = 800
40
+ WATCHDOG_TIMEOUT_S = 20 # kill if no progress for this many seconds
41
+ BATCH_SIZE = int(os.environ.get("HYDRA_BATCH_SIZE", "8"))
42
+ SEQ_LEN = 2048
43
+ VOCAB_SIZE = 8192
44
+
45
+
46
+ # -------------------------------------------------------------------------
47
+ # Watchdog thread: kills process if no progress
48
+ # -------------------------------------------------------------------------
49
+ _last_progress = time.time()
50
+ _watchdog_armed = True
51
+
52
+ def _watchdog_fn():
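+ """Poll once per second; on a stall, dump diagnostics and SIGTERM the process."""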
53
+ global _last_progress, _watchdog_armed
54
+ while _watchdog_armed:
55
+ time.sleep(1.0)
56
+ elapsed = time.time() - _last_progress
57
+ if elapsed > WATCHDOG_TIMEOUT_S:
58
+ print(f"\n*** WATCHDOG: no progress for {elapsed:.1f}s β€” DEADLOCK DETECTED ***",
59
+ flush=True)
60
+ _dump_diagnostics()
61
+ os.kill(os.getpid(), signal.SIGTERM)
62
+ return
63
+
64
+ def _dump_diagnostics():
65
+ """Dump CUDA/dynamo state at deadlock time."""
66
+ try:
67
+ stats = torch.cuda.memory_stats()
68
+ print(f" alloc_retries: {stats.get('num_alloc_retries', 'N/A')}")
69
+ print(f" allocated_bytes: {stats.get('allocated_bytes.all.current', 0) / 1e6:.1f} MB")
70
+ print(f" reserved_bytes: {stats.get('reserved_bytes.all.current', 0) / 1e6:.1f} MB")
71
+ print(f" num_ooms: {stats.get('num_ooms', 0)}")
72
+ except Exception as e:
73
+ print(f" (memory_stats failed: {e})")
74
+
75
+ try:
76
+ import torch._dynamo.utils as du
77
+ print(f" dynamo counters: {dict(du.counters)}")
78
+ except Exception as e:
79
+ print(f" (dynamo counters failed: {e})")
80
+
81
+
82
+ def tick():
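+ """Record forward progress; resets the watchdog's stall timer."""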
83
+ global _last_progress
84
+ _last_progress = time.time()
85
+
86
+
87
+ # -------------------------------------------------------------------------
88
+ # Test
89
+ # -------------------------------------------------------------------------
90
+ def run_test(mode: str) -> dict:
91
+ """Run forward+backward loop with specified compile config."""
92
+ print(f"\n{'='*70}")
93
+ print(f"TEST MODE: {mode}")
94
+ print(f"{'='*70}", flush=True)
95
+
96
+ compile_model = mode in ("model_only", "both")
97
+ compile_muon = mode in ("muon_only", "both")
98
+
99
+ os.environ["HYDRA_MODEL_COMPILE"] = "1" if compile_model else "0"
100
+ os.environ["HYDRA_MUON_COMPILE"] = "1" if compile_muon else "0"
101
+ os.environ["HYDRA_ASYNC_POSTPROCESS"] = "0"
102
+ os.environ["HYDRA_HESTIA_INTERVAL"] = "9999"
103
+ os.environ["HYDRA_HTM_LEARN_EVERY"] = "4"
104
+
105
+ # Clear cached modules for fresh env var reads
106
+ for mod_name in list(sys.modules.keys()):
107
+ if mod_name.startswith("hydra."):
108
+ del sys.modules[mod_name]
109
+
110
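+ # Drop compiled graphs and CUDA caches so each mode starts from a cold state.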
+ torch._dynamo.reset()
111
+ torch.cuda.empty_cache()
112
+ torch.cuda.reset_peak_memory_stats()
113
+ gc.collect()
114
+
115
+ from hydra.model import PostSemClawModel
116
+ from hydra.config import PostSemClawConfig
117
+
118
+ device = torch.device("cuda")
119
+ config = PostSemClawConfig(
120
+ d_model=256, n_layer=4, d_state=64, headdim=32, expand=2,
121
+ vocab_size=VOCAB_SIZE, sequence_len=SEQ_LEN,
122
+ )
123
+
124
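+ # Build on the meta device (no allocation), then materialize on GPU and init.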
+ with torch.device("meta"):
125
+ model = PostSemClawModel(config)
126
+ model.to_empty(device=device)
127
+ model.init_weights()
128
+
129
+ optimizer = model.setup_optimizer()
130
+ autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
131
+
132
+ result = {"mode": mode, "max_step": 0, "tps_samples": []}
133
+ alloc_retries_prev = 0
134
+
135
+ tick()
136
+
137
+ for step in range(MAX_STEPS):
138
+ t0 = time.time()
139
+
140
+ x = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN), device=device)
141
+ y = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN), device=device)
142
+
143
+ with autocast_ctx:
144
+ loss = model(x, y)
145
+ loss.backward()
146
+
147
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
148
+ optimizer.step()
149
+ model.zero_grad(set_to_none=True)
150
+
151
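+ # Synchronize so dt measures the full step, not just the kernel launches.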
+ torch.cuda.synchronize()
152
+ dt = time.time() - t0
153
+ tps = int(BATCH_SIZE * SEQ_LEN / dt)
154
+
155
+ tick()
156
+
157
+ stats = torch.cuda.memory_stats()
158
+ retries = stats.get("num_alloc_retries", 0)
159
+ retry_delta = retries - alloc_retries_prev
160
+ alloc_retries_prev = retries
161
+
162
+ result["max_step"] = step
163
+
164
+ if step % 50 == 0 or retry_delta > 0 or step < 3:
165
+ alloc_mb = stats.get("allocated_bytes.all.current", 0) / 1e6
166
+ print(
167
+ f" step={step:04d} tps={tps:6d} dt={dt*1000:.0f}ms "
168
+ f"alloc={alloc_mb:.0f}MB retries={retries}",
169
+ flush=True,
170
+ )
171
+ result["tps_samples"].append((step, tps))
172
+
173
+ result["completed"] = True
174
+ print(f"\n COMPLETED: {MAX_STEPS} steps, mode={mode}", flush=True)
175
+ return result
176
+
177
+
178
+ def main():
179
+ print(f"torch: {torch.__version__} CUDA: {torch.version.cuda}")
180
+ print(f"GPU: {torch.cuda.get_device_name()}")
181
+ print(f"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
182
+ print(f"Steps: {MAX_STEPS} Watchdog: {WATCHDOG_TIMEOUT_S}s")
183
+
184
+ wd = threading.Thread(target=_watchdog_fn, daemon=True)
185
+ wd.start()
186
+
187
+ modes = sys.argv[1:] if len(sys.argv) > 1 else ["both"]
188
+ results = []
189
+
190
+ for mode in modes:
191
+ try:
192
+ r = run_test(mode)
193
+ except SystemExit:
194
+ print(f"\n DEADLOCK/KILLED mode={mode}", flush=True)
195
+ r = {"mode": mode, "completed": False, "max_step": "?"}
196
+ except Exception as e:
197
+ print(f"\n ERROR mode={mode}: {e}", flush=True)
198
+ r = {"mode": mode, "completed": False, "error": str(e)}
199
+ results.append(r)
200
+
201
+ print(f"\n{'='*70}")
202
+ print("SUMMARY")
203
+ print(f"{'='*70}")
204
+ for r in results:
205
+ status = "PASS" if r.get("completed") else "FAIL"
206
+ print(f" {r['mode']:20s}: {status} (step {r.get('max_step', '?')})")
207
+
208
+ global _watchdog_armed
209
+ _watchdog_armed = False
210
+
211
+
212
+ if __name__ == "__main__":
213
+ main()
overlay/scripts/dataset_audit.py CHANGED
@@ -1,241 +1,241 @@
1
- """
2
- Dataset audit - diagnostic tool for HYDRA's pretraining corpus.
3
-
4
- Usage:
5
- python scripts/dataset_audit.py # Quick audit
6
- python scripts/dataset_audit.py --sample 10 # Sample 10 shards for token counts
7
- python scripts/dataset_audit.py --full # Full tokenize of every shard (slow)
8
-
9
- Reports:
10
- - Shard count, total disk usage
11
- - Estimated total tokens (character-based + tokenized sample)
12
- - Training budget sufficiency vs 12h @ 65k tok/s = 2.8B token target
13
- - Document diversity sample
14
- - Warnings about shard ordering, shuffle, and streaming behavior
15
- """
16
- from __future__ import annotations
17
-
18
- import argparse
19
- import os
20
- import sys
21
- import time
22
- from pathlib import Path
23
-
24
- import pyarrow.parquet as pq
25
-
26
- # Resolve repo root so the script works regardless of CWD.
27
- REPO_ROOT = Path(__file__).resolve().parent.parent
28
- sys.path.insert(0, str(REPO_ROOT))
29
-
30
- from prepare import ( # noqa: E402
31
- DATA_DIR,
32
- MAX_SHARD,
33
- TOKENIZER_DIR,
34
- VAL_FILENAME,
35
- VAL_SHARD,
36
- )
37
-
38
- TARGET_TOKENS_12H = 2_800_000_000 # 65k tok/s * 12h * 3600s
39
- CHARS_PER_TOKEN_HEURISTIC = 4.0
40
-
41
-
42
- def human_bytes(n: int) -> str:
43
- for unit in ("B", "KB", "MB", "GB", "TB"):
44
- if n < 1024:
45
- return f"{n:.1f}{unit}"
46
- n /= 1024
47
- return f"{n:.1f}PB"
48
-
49
-
50
- def human_tokens(n: int | float) -> str:
51
- if n >= 1e9:
52
- return f"{n / 1e9:.2f}B"
53
- if n >= 1e6:
54
- return f"{n / 1e6:.1f}M"
55
- if n >= 1e3:
56
- return f"{n / 1e3:.1f}K"
57
- return f"{n:.0f}"
58
-
59
-
60
- def list_shards() -> tuple[list[Path], Path | None]:
61
- """Return (train_shards_sorted, val_shard_or_none)."""
62
- if not os.path.isdir(DATA_DIR):
63
- return [], None
64
- all_paths = sorted(Path(DATA_DIR).glob("shard_*.parquet"))
65
- val_path = Path(DATA_DIR) / VAL_FILENAME
66
- train = [p for p in all_paths if p.name != VAL_FILENAME]
67
- val = val_path if val_path.exists() else None
68
- return train, val
69
-
70
-
71
- def tokenized_sample(shard_path: Path, enc, row_groups: int = 5) -> tuple[int, int]:
72
- """Tokenize first N row groups of a shard. Returns (tokens, docs)."""
73
- pf = pq.ParquetFile(shard_path)
74
- tokens = 0
75
- docs = 0
76
- n = min(row_groups, pf.num_row_groups)
77
- for i in range(n):
78
- rg = pf.read_row_group(i)
79
- texts = rg.column("text").to_pylist()
80
- ids = enc.encode_ordinary_batch(texts, num_threads=8)
81
- tokens += sum(len(x) for x in ids)
82
- docs += len(texts)
83
- return tokens, docs, pf.num_row_groups
84
-
85
-
86
- def main() -> int:
87
- parser = argparse.ArgumentParser(description="Audit the HYDRA training corpus")
88
- parser.add_argument(
89
- "--sample",
90
- type=int,
91
- default=3,
92
- help="Number of shards to tokenize for token-count estimate",
93
- )
94
- parser.add_argument(
95
- "--full",
96
- action="store_true",
97
- help="Tokenize every shard (slow; gives exact total)",
98
- )
99
- args = parser.parse_args()
100
-
101
- print("=" * 72)
102
- print("HYDRA corpus audit")
103
- print("=" * 72)
104
- print(f"DATA_DIR: {DATA_DIR}")
105
- print(f"TOKENIZER_DIR: {TOKENIZER_DIR}")
106
- print(f"Source dataset: karpathy/climbmix-400b-shuffle")
107
- print(f"Max remote shard: {MAX_SHARD} (pinned val = shard_{VAL_SHARD:05d})")
108
- print()
109
-
110
- train_shards, val_shard = list_shards()
111
- if not train_shards:
112
- print("ERROR: no parquet shards found. Run `python prepare.py` first.")
113
- return 1
114
-
115
- total_disk = sum(p.stat().st_size for p in train_shards)
116
- val_disk = val_shard.stat().st_size if val_shard else 0
117
-
118
- print(f"Train shards: {len(train_shards)} ({train_shards[0].name} ... {train_shards[-1].name})")
119
- print(f"Val shard: {'present' if val_shard else 'MISSING'} ({VAL_FILENAME})")
120
- print(f"Disk (train): {human_bytes(total_disk)}")
121
- print(f"Disk (val): {human_bytes(val_disk)}")
122
- print()
123
-
124
- # Character-based pass (fast): count total chars in all shards.
125
- t0 = time.time()
126
- total_chars = 0
127
- total_docs = 0
128
- total_row_groups = 0
129
- for p in train_shards:
130
- pf = pq.ParquetFile(p)
131
- total_row_groups += pf.num_row_groups
132
- total_docs += pf.metadata.num_rows
133
- dt_meta = time.time() - t0
134
- print(f"Metadata scan: {len(train_shards)} shards in {dt_meta:.1f}s")
135
- print(f"Train documents: {total_docs:,}")
136
- print(f"Row groups: {total_row_groups:,}")
137
- print()
138
-
139
- # Tokenizer-based sampling.
140
- try:
141
- import pickle
142
-
143
- with open(os.path.join(TOKENIZER_DIR, "tokenizer.pkl"), "rb") as f:
144
- enc = pickle.load(f)
145
- print(f"Tokenizer vocab: {enc.n_vocab}")
146
- except FileNotFoundError:
147
- print("WARNING: tokenizer.pkl not found β€” skipping tokenized sample.")
148
- enc = None
149
-
150
- est_total_tokens = 0
151
- if enc is not None:
152
- if args.full:
153
- sample_shards = train_shards
154
- else:
155
- # Pick shards evenly across the range for a representative sample.
156
- n_sample = min(args.sample, len(train_shards))
157
- if n_sample == 1:
158
- sample_shards = [train_shards[0]]
159
- else:
160
- stride = max(1, len(train_shards) // n_sample)
161
- sample_shards = train_shards[::stride][:n_sample]
162
-
163
- t0 = time.time()
164
- sample_tokens = 0
165
- sample_docs = 0
166
- sample_row_groups = 0
167
- sample_shard_row_groups = 0
168
- print(f"Tokenizing sample: {len(sample_shards)} shards ...")
169
- for p in sample_shards:
170
- tok, docs, n_rg = tokenized_sample(p, enc, row_groups=5)
171
- sample_tokens += tok
172
- sample_docs += docs
173
- sample_row_groups += min(5, n_rg)
174
- sample_shard_row_groups += n_rg
175
- dt_tok = time.time() - t0
176
-
177
- tokens_per_rg = sample_tokens / max(sample_row_groups, 1)
178
- per_shard = tokens_per_rg * (sample_shard_row_groups / len(sample_shards))
179
- est_total_tokens = per_shard * len(train_shards)
180
-
181
- print(
182
- f"Sampled {sample_row_groups} row groups ({sample_docs:,} docs, "
183
- f"{sample_tokens:,} tokens) in {dt_tok:.1f}s"
184
- )
185
- print(f" tokens/row_group: {tokens_per_rg:,.0f}")
186
- print(f" tokens/shard: {per_shard:,.0f}")
187
- print(f" tokens/shard: {human_tokens(per_shard)}")
188
- else:
189
- # Fall back to character heuristic.
190
- per_shard_chars = total_disk / max(len(train_shards), 1)
191
- # Parquet compression ratio ~3x for text; decompressed ~3 * file size.
192
- # Chars per token heuristic: ~4.
193
- est_total_tokens = (total_disk * 3.0) / CHARS_PER_TOKEN_HEURISTIC
194
-
195
- print()
196
- print("-" * 72)
197
- print("Token budget analysis")
198
- print("-" * 72)
199
- print(f"Estimated total train tokens: {human_tokens(est_total_tokens)} "
200
- f"({est_total_tokens:,.0f})")
201
- print(f"12h @ 65k tok/s target: {human_tokens(TARGET_TOKENS_12H)}")
202
- ratio = est_total_tokens / TARGET_TOKENS_12H if TARGET_TOKENS_12H else 0
203
- if ratio >= 1.0:
204
- print(f" Ratio: {ratio:.1f}x ({'SUFFICIENT' if ratio >= 1.2 else 'TIGHT'})")
205
- else:
206
- print(f" Ratio: {ratio:.2f}x INSUFFICIENT β€” need {1 - ratio:.0%} more")
207
- print()
208
-
209
- # Warnings about the dataloader behavior.
210
- print("-" * 72)
211
- print("Dataloader behavior (prepare.py::_document_batches)")
212
- print("-" * 72)
213
- print("+ Infinite streaming: while True around shard list (no StopIteration)")
214
- print("+ Streams per shard, never loads full corpus into RAM")
215
- print("+ BOS-aligned best-fit packing gives document-level buffer shuffling")
216
- print("- Cross-shard order is LEXICOGRAPHIC and FIXED on every epoch")
217
- print("- Row groups / rows WITHIN a shard are read in fixed order")
218
- print(" (climbmix-400b-shuffle is pre-shuffled at source, mitigating this)")
219
- print()
220
-
221
- # Quick content diversity peek.
222
- if train_shards:
223
- print("-" * 72)
224
- print("Content sample (shard 0, first 3 docs)")
225
- print("-" * 72)
226
- pf = pq.ParquetFile(train_shards[0])
227
- rg = pf.read_row_group(0)
228
- texts = rg.column("text").to_pylist()
229
- for i, idx in enumerate([0, len(texts) // 2, len(texts) - 1]):
230
- if idx < len(texts):
231
- snippet = texts[idx][:160].replace("\n", " ")
232
- print(f" [{i}] len={len(texts[idx])}: {snippet!r}")
233
- print()
234
-
235
- print("=" * 72)
236
- print("Done.")
237
- return 0
238
-
239
-
240
- if __name__ == "__main__":
241
- raise SystemExit(main())
 
1
+ """
2
+ Dataset audit - diagnostic tool for HYDRA's pretraining corpus.
3
+
4
+ Usage:
5
+ python scripts/dataset_audit.py # Quick audit
6
+ python scripts/dataset_audit.py --sample 10 # Sample 10 shards for token counts
7
+ python scripts/dataset_audit.py --full # Full tokenize of every shard (slow)
8
+
9
+ Reports:
10
+ - Shard count, total disk usage
11
+ - Estimated total tokens (character-based + tokenized sample)
12
+ - Training budget sufficiency vs 12h @ 65k tok/s = 2.8B token target
13
+ - Document diversity sample
14
+ - Warnings about shard ordering, shuffle, and streaming behavior
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import os
20
+ import sys
21
+ import time
22
+ from pathlib import Path
23
+
24
+ import pyarrow.parquet as pq
25
+
26
+ # Resolve repo root so the script works regardless of CWD.
27
+ REPO_ROOT = Path(__file__).resolve().parent.parent
28
+ sys.path.insert(0, str(REPO_ROOT))
29
+
30
+ from prepare import ( # noqa: E402
31
+ DATA_DIR,
32
+ MAX_SHARD,
33
+ TOKENIZER_DIR,
34
+ VAL_FILENAME,
35
+ VAL_SHARD,
36
+ )
37
+
38
+ TARGET_TOKENS_12H = 2_800_000_000 # 65k tok/s * 12h * 3600s
39
+ CHARS_PER_TOKEN_HEURISTIC = 4.0
40
+
41
+
42
+ def human_bytes(n: int) -> str:
43
+ for unit in ("B", "KB", "MB", "GB", "TB"):
44
+ if n < 1024:
45
+ return f"{n:.1f}{unit}"
46
+ n /= 1024
47
+ return f"{n:.1f}PB"
48
+
49
+
50
+ def human_tokens(n: int | float) -> str:
51
+ if n >= 1e9:
52
+ return f"{n / 1e9:.2f}B"
53
+ if n >= 1e6:
54
+ return f"{n / 1e6:.1f}M"
55
+ if n >= 1e3:
56
+ return f"{n / 1e3:.1f}K"
57
+ return f"{n:.0f}"
58
+
59
+
60
+ def list_shards() -> tuple[list[Path], Path | None]:
61
+ """Return (train_shards_sorted, val_shard_or_none)."""
62
+ if not os.path.isdir(DATA_DIR):
63
+ return [], None
64
+ all_paths = sorted(Path(DATA_DIR).glob("shard_*.parquet"))
65
+ val_path = Path(DATA_DIR) / VAL_FILENAME
66
+ train = [p for p in all_paths if p.name != VAL_FILENAME]
67
+ val = val_path if val_path.exists() else None
68
+ return train, val
69
+
70
+
71
+ def tokenized_sample(shard_path: Path, enc, row_groups: int = 5) -> tuple[int, int, int]:
72
+ """Tokenize the first N row groups of a shard. Returns (tokens, docs, num_row_groups)."""
73
+ pf = pq.ParquetFile(shard_path)
74
+ tokens = 0
75
+ docs = 0
76
+ n = min(row_groups, pf.num_row_groups)
77
+ for i in range(n):
78
+ rg = pf.read_row_group(i)
79
+ texts = rg.column("text").to_pylist()
80
+ ids = enc.encode_ordinary_batch(texts, num_threads=8)
81
+ tokens += sum(len(x) for x in ids)
82
+ docs += len(texts)
83
+ return tokens, docs, pf.num_row_groups
84
+
85
+
86
+ def main() -> int:
87
+ parser = argparse.ArgumentParser(description="Audit the HYDRA training corpus")
88
+ parser.add_argument(
89
+ "--sample",
90
+ type=int,
91
+ default=3,
92
+ help="Number of shards to tokenize for token-count estimate",
93
+ )
94
+ parser.add_argument(
95
+ "--full",
96
+ action="store_true",
97
+ help="Tokenize every shard (slow; gives exact total)",
98
+ )
99
+ args = parser.parse_args()
100
+
101
+ print("=" * 72)
102
+ print("HYDRA corpus audit")
103
+ print("=" * 72)
104
+ print(f"DATA_DIR: {DATA_DIR}")
105
+ print(f"TOKENIZER_DIR: {TOKENIZER_DIR}")
106
+ print(f"Source dataset: karpathy/climbmix-400b-shuffle")
107
+ print(f"Max remote shard: {MAX_SHARD} (pinned val = shard_{VAL_SHARD:05d})")
108
+ print()
109
+
110
+ train_shards, val_shard = list_shards()
111
+ if not train_shards:
112
+ print("ERROR: no parquet shards found. Run `python prepare.py` first.")
113
+ return 1
114
+
115
+ total_disk = sum(p.stat().st_size for p in train_shards)
116
+ val_disk = val_shard.stat().st_size if val_shard else 0
117
+
118
+ print(f"Train shards: {len(train_shards)} ({train_shards[0].name} ... {train_shards[-1].name})")
119
+ print(f"Val shard: {'present' if val_shard else 'MISSING'} ({VAL_FILENAME})")
120
+ print(f"Disk (train): {human_bytes(total_disk)}")
121
+ print(f"Disk (val): {human_bytes(val_disk)}")
122
+ print()
123
+
124
+ # Metadata-only pass (fast): count documents and row groups across all shards.
125
+ t0 = time.time()
127
+ total_docs = 0
128
+ total_row_groups = 0
129
+ for p in train_shards:
130
+ pf = pq.ParquetFile(p)
131
+ total_row_groups += pf.num_row_groups
132
+ total_docs += pf.metadata.num_rows
133
+ dt_meta = time.time() - t0
134
+ print(f"Metadata scan: {len(train_shards)} shards in {dt_meta:.1f}s")
135
+ print(f"Train documents: {total_docs:,}")
136
+ print(f"Row groups: {total_row_groups:,}")
137
+ print()
138
+
139
+ # Tokenizer-based sampling.
140
+ try:
141
+ import pickle
142
+
143
+ with open(os.path.join(TOKENIZER_DIR, "tokenizer.pkl"), "rb") as f:
144
+ enc = pickle.load(f)
145
+ print(f"Tokenizer vocab: {enc.n_vocab}")
146
+ except FileNotFoundError:
147
+ print("WARNING: tokenizer.pkl not found β€” skipping tokenized sample.")
148
+ enc = None
149
+
150
+ est_total_tokens = 0
151
+ if enc is not None:
152
+ if args.full:
153
+ sample_shards = train_shards
154
+ else:
155
+ # Pick shards evenly across the range for a representative sample.
156
+ n_sample = min(args.sample, len(train_shards))
157
+ if n_sample == 1:
158
+ sample_shards = [train_shards[0]]
159
+ else:
160
+ stride = max(1, len(train_shards) // n_sample)
161
+ sample_shards = train_shards[::stride][:n_sample]
162
+
163
+ t0 = time.time()
164
+ sample_tokens = 0
165
+ sample_docs = 0
166
+ sample_row_groups = 0
167
+ sample_shard_row_groups = 0
168
+ print(f"Tokenizing sample: {len(sample_shards)} shards ...")
169
+ for p in sample_shards:
170
+ tok, docs, n_rg = tokenized_sample(p, enc, row_groups=5)
171
+ sample_tokens += tok
172
+ sample_docs += docs
173
+ sample_row_groups += min(5, n_rg)
174
+ sample_shard_row_groups += n_rg
175
+ dt_tok = time.time() - t0
176
+
177
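+ # Extrapolate: mean tokens per sampled row group, times mean row groups per
+ # shard, times the number of train shards.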
+ tokens_per_rg = sample_tokens / max(sample_row_groups, 1)
178
+ per_shard = tokens_per_rg * (sample_shard_row_groups / len(sample_shards))
179
+ est_total_tokens = per_shard * len(train_shards)
180
+
181
+ print(
182
+ f"Sampled {sample_row_groups} row groups ({sample_docs:,} docs, "
183
+ f"{sample_tokens:,} tokens) in {dt_tok:.1f}s"
184
+ )
185
+ print(f" tokens/row_group: {tokens_per_rg:,.0f}")
186
+ print(f" tokens/shard: {per_shard:,.0f}")
187
+ print(f" tokens/shard: {human_tokens(per_shard)}")
188
+ else:
189
+ # Fall back to character heuristic.
190
+ per_shard_chars = total_disk / max(len(train_shards), 1)
191
+ # Parquet compression ratio ~3x for text; decompressed ~3 * file size.
192
+ # Chars per token heuristic: ~4.
193
+ est_total_tokens = (total_disk * 3.0) / CHARS_PER_TOKEN_HEURISTIC
194
+
195
+ print()
196
+ print("-" * 72)
197
+ print("Token budget analysis")
198
+ print("-" * 72)
199
+ print(f"Estimated total train tokens: {human_tokens(est_total_tokens)} "
200
+ f"({est_total_tokens:,.0f})")
201
+ print(f"12h @ 65k tok/s target: {human_tokens(TARGET_TOKENS_12H)}")
202
+ ratio = est_total_tokens / TARGET_TOKENS_12H if TARGET_TOKENS_12H else 0
203
+ if ratio >= 1.0:
204
+ print(f" Ratio: {ratio:.1f}x ({'SUFFICIENT' if ratio >= 1.2 else 'TIGHT'})")
205
+ else:
206
+ print(f" Ratio: {ratio:.2f}x INSUFFICIENT β€” need {1 - ratio:.0%} more")
207
+ print()
208
+
209
+ # Warnings about the dataloader behavior.
210
+ print("-" * 72)
211
+ print("Dataloader behavior (prepare.py::_document_batches)")
212
+ print("-" * 72)
213
+ print("+ Infinite streaming: while True around shard list (no StopIteration)")
214
+ print("+ Streams per shard, never loads full corpus into RAM")
215
+ print("+ BOS-aligned best-fit packing gives document-level buffer shuffling")
216
+ print("- Cross-shard order is LEXICOGRAPHIC and FIXED on every epoch")
217
+ print("- Row groups / rows WITHIN a shard are read in fixed order")
218
+ print(" (climbmix-400b-shuffle is pre-shuffled at source, mitigating this)")
219
+ print()
220
+
221
+ # Quick content diversity peek.
222
+ if train_shards:
223
+ print("-" * 72)
224
+ print("Content sample (shard 0, first 3 docs)")
225
+ print("-" * 72)
226
+ pf = pq.ParquetFile(train_shards[0])
227
+ rg = pf.read_row_group(0)
228
+ texts = rg.column("text").to_pylist()
229
+ for i, idx in enumerate([0, len(texts) // 2, len(texts) - 1]):
230
+ if idx < len(texts):
231
+ snippet = texts[idx][:160].replace("\n", " ")
232
+ print(f" [{i}] len={len(texts[idx])}: {snippet!r}")
233
+ print()
234
+
235
+ print("=" * 72)
236
+ print("Done.")
237
+ return 0
238
+
239
+
240
+ if __name__ == "__main__":
241
+ raise SystemExit(main())
overlay/scripts/download_sft_data.py CHANGED
@@ -1,457 +1,457 @@
1
- """Download + tokenize instruction data for HYDRA SFT.
2
-
3
- Writes int16 token shards to `data/sft/shard_XXX.bin` plus a
4
- `data/sft/meta.json` with counts + special-token mapping.
5
-
6
- Chat format (vocab's 4 reserved special tokens are repurposed):
7
- <BOS=8188> <|user|=8189>\n{instruction}\n{input?}\n <|assistant|=8190>\n
8
- {output}<|end|=8191>\n
9
-
10
- Special-token IDs are constants derived from the tokenizer (they are the
11
- last 4 IDs in an 8192-vocab). They are stored in meta.json for the SFT
12
- script to read.
13
-
14
- Sources (tried in order):
15
- 1. yahma/alpaca-cleaned (~52K pairs via HF parquet auto-convert)
16
- 2. databricks/databricks-dolly-15k (~15K pairs)
17
- 3. Hard-coded 200 simple Q&A pairs (offline backup)
18
-
19
- Usage:
20
- python scripts/download_sft_data.py # full download
21
- python scripts/download_sft_data.py --test # small smoke run
22
- python scripts/download_sft_data.py --offline # skip network; use backup
23
- """
24
-
25
- from __future__ import annotations
26
-
27
- import argparse
28
- import json
29
- import os
30
- import pickle
31
- import sys
32
- import time
33
- from pathlib import Path
34
-
35
- import numpy as np
36
- import requests
37
-
38
- # Make `prepare` and `hydra.*` importable when run as a script
39
- _REPO_ROOT = Path(__file__).resolve().parent.parent
40
- if str(_REPO_ROOT) not in sys.path:
41
- sys.path.insert(0, str(_REPO_ROOT))
42
-
43
-
44
- # ---------------------------------------------------------------------------
45
- # Constants
46
- # ---------------------------------------------------------------------------
47
-
48
- CACHE_DIR = Path.home() / ".cache" / "autoresearch"
49
- TOKENIZER_PKL = CACHE_DIR / "tokenizer" / "tokenizer.pkl"
50
-
51
- SFT_DIR = _REPO_ROOT / "data" / "sft"
52
- SFT_DIR.mkdir(parents=True, exist_ok=True)
53
-
54
- # Reserved token repurposing - must match prepare.py SPECIAL_TOKENS list
55
- # (indices 8188-8191 in the 8192-vocab BPE).
56
- BOS_ID = 8188 # <|reserved_0|>
57
- USER_ID = 8189 # <|reserved_1|>
58
- ASSISTANT_ID = 8190 # <|reserved_2|>
59
- END_ID = 8191 # <|reserved_3|>
60
-
61
- # Shards are int16 arrays of packed token IDs.
62
- TOKENS_PER_SHARD = 1_048_576 # ~2 MB per shard
63
- DTYPE = np.int16 # vocab_size=8192 fits in int16
64
-
65
- TARGET_TOKENS_DEFAULT = 15_000_000 # ~15M instruction tokens
66
- TARGET_TOKENS_TEST = 1_500_000 # smoke run
67
-
68
- # HuggingFace auto-parquet endpoint - one file for alpaca-cleaned
69
- ALPACA_URL = (
70
- "https://huggingface.co/api/datasets/yahma/alpaca-cleaned/parquet/"
71
- "default/train/0.parquet"
72
- )
73
- DOLLY_URL = (
74
- "https://huggingface.co/api/datasets/databricks/databricks-dolly-15k/"
75
- "parquet/default/train/0.parquet"
76
- )
77
-
78
-
79
- # ---------------------------------------------------------------------------
80
- # Offline backup Q&A pairs (used only if network unavailable)
81
- # ---------------------------------------------------------------------------
82
-
83
- _BACKUP_QA = [
84
- ("What is the capital of France?", "The capital of France is Paris."),
85
- ("What is the capital of Germany?", "The capital of Germany is Berlin."),
86
- ("What is the capital of Japan?", "The capital of Japan is Tokyo."),
87
- ("What is the capital of Italy?", "The capital of Italy is Rome."),
88
- ("What is the capital of Spain?", "The capital of Spain is Madrid."),
89
- ("What is the capital of England?", "The capital of England is London."),
90
- ("What is the capital of Canada?", "The capital of Canada is Ottawa."),
91
- ("What is the capital of Australia?", "The capital of Australia is Canberra."),
92
- ("What is 2 plus 2?", "Two plus two equals four."),
93
- ("What is 5 plus 5?", "Five plus five equals ten."),
94
- ("What is 3 times 4?", "Three times four equals twelve."),
95
- ("What is 10 minus 3?", "Ten minus three equals seven."),
96
- ("Name a color.", "Red is a color. Other colors include blue, green, and yellow."),
97
- ("Name a fruit.", "Apple is a fruit. Other fruits include banana, orange, and grape."),
98
- ("Name an animal.", "A dog is an animal. Other animals include cat, horse, and cow."),
99
- ("What do birds do?", "Birds fly through the air using their wings."),
100
- ("What do fish do?", "Fish swim in water using their fins and tails."),
101
- ("What is water made of?", "Water is made of hydrogen and oxygen."),
102
- ("What is the sun?", "The sun is a star at the center of our solar system."),
103
- ("What is the moon?", "The moon is Earth's natural satellite."),
104
- ("What is a cloud?", "A cloud is a collection of tiny water droplets in the sky."),
105
- ("What is rain?", "Rain is water that falls from clouds to the ground."),
106
- ("What is snow?", "Snow is frozen water that falls from clouds in cold weather."),
107
- ("What is wind?", "Wind is moving air."),
108
- ("What is fire?", "Fire is a chemical reaction that produces heat and light."),
109
- ("What is ice?", "Ice is frozen water."),
110
- ("Say hello.", "Hello! How can I help you today?"),
111
- ("Say goodbye.", "Goodbye! Have a great day."),
112
- ("Tell me a short story.", "Once upon a time, a brave rabbit lived in the forest. The end."),
113
- ("Tell me a joke.", "Why did the chicken cross the road? To get to the other side."),
114
- ("Who wrote Hamlet?", "William Shakespeare wrote the play Hamlet."),
115
- ("Who wrote Romeo and Juliet?", "William Shakespeare wrote Romeo and Juliet."),
116
- ("Who painted the Mona Lisa?", "Leonardo da Vinci painted the Mona Lisa."),
117
- ("When did World War 2 end?", "World War 2 ended in 1945."),
118
- ("What is gravity?", "Gravity is the force that pulls objects toward the Earth."),
119
- ("What is the speed of light?", "The speed of light is approximately 300,000 kilometers per second."),
120
- ("What is the largest planet?", "Jupiter is the largest planet in our solar system."),
121
- ("What is the smallest planet?", "Mercury is the smallest planet in our solar system."),
122
- ("At what temperature does water boil?", "Water boils at 100 degrees Celsius or 212 degrees Fahrenheit."),
123
- ("At what temperature does water freeze?", "Water freezes at 0 degrees Celsius or 32 degrees Fahrenheit."),
124
- ("How many legs does a spider have?", "A spider has eight legs."),
125
- ("How many legs does an insect have?", "An insect has six legs."),
126
- ("What do plants need to grow?", "Plants need sunlight, water, soil, and air to grow."),
127
- ("What do humans eat?", "Humans eat a variety of foods including fruits, vegetables, meat, and grains."),
128
- ("What is a book?", "A book is a collection of written or printed pages bound together."),
129
- ("What is a computer?", "A computer is an electronic device that processes information."),
130
- ("What is a phone?", "A phone is a device used to communicate with people at a distance."),
131
- ("What is music?", "Music is an arrangement of sounds that is pleasing to hear."),
132
- ("What is art?", "Art is the expression of human creativity and imagination."),
133
- ("What is a language?", "A language is a system of communication used by a group of people."),
134
- ]
135
-
136
- # Duplicate to reach ~200 samples (each pair appears ~4x)
137
- BACKUP_QA = (_BACKUP_QA * 4)[:200]
138
-
139
-
140
- # ---------------------------------------------------------------------------
141
- # Tokenizer loader
142
- # ---------------------------------------------------------------------------
143
-
144
- class _TokenizerWrapper:
145
- """Minimal wrapper around the pickled tiktoken.Encoding. We avoid
146
- importing `prepare.Tokenizer` to sidestep its side effects (which
147
- touch the running pretrain's cache files)."""
148
-
149
- def __init__(self, enc):
150
- self.enc = enc
151
-
152
- def encode(self, text: str) -> list[int]:
153
- return self.enc.encode_ordinary(text)
154
-
155
- @property
156
- def vocab_size(self) -> int:
157
- return self.enc.n_vocab
158
-
159
-
160
- def load_tokenizer() -> _TokenizerWrapper:
161
- if not TOKENIZER_PKL.exists():
162
- raise FileNotFoundError(
163
- f"Tokenizer not found at {TOKENIZER_PKL}. Run `python prepare.py` "
164
- f"first."
165
- )
166
- with open(TOKENIZER_PKL, "rb") as f:
167
- enc = pickle.load(f)
168
- tok = _TokenizerWrapper(enc)
169
- assert tok.vocab_size == 8192, f"Expected vocab=8192, got {tok.vocab_size}"
170
- return tok
171
-
172
-
173
- # ---------------------------------------------------------------------------
174
- # Source downloaders
175
- # ---------------------------------------------------------------------------
176
-
177
- def _download_parquet(url: str, local_path: Path, timeout: int = 60) -> bool:
178
- """Stream-download a parquet file with retry. Returns True on success."""
179
- local_path.parent.mkdir(parents=True, exist_ok=True)
180
- tmp = local_path.with_suffix(local_path.suffix + ".tmp")
181
- for attempt in range(1, 4):
182
- try:
183
- with requests.get(url, stream=True, timeout=timeout,
184
- allow_redirects=True) as r:
185
- r.raise_for_status()
186
- with open(tmp, "wb") as f:
187
- for chunk in r.iter_content(chunk_size=1 << 20):
188
- if chunk:
189
- f.write(chunk)
190
- tmp.replace(local_path)
191
- return True
192
- except Exception as e:
193
- print(f" [net] attempt {attempt} failed: {e}", flush=True)
194
- for p in (tmp, local_path):
195
- try:
196
- p.unlink()
197
- except FileNotFoundError:
198
- pass
199
- time.sleep(2 ** attempt)
200
- return False
201
-
202
-
203
- def _iter_alpaca(local_path: Path):
204
- """Yield (instruction, input, output) from alpaca-cleaned parquet."""
205
- import pyarrow.parquet as pq
206
- pf = pq.ParquetFile(str(local_path))
207
- for rg_idx in range(pf.num_row_groups):
208
- rg = pf.read_row_group(rg_idx)
209
- instr_col = rg.column("instruction").to_pylist()
210
- input_col = rg.column("input").to_pylist()
211
- output_col = rg.column("output").to_pylist()
212
- for instruction, input_text, output in zip(instr_col, input_col, output_col):
213
- if instruction and output:
214
- yield instruction, (input_text or ""), output
215
-
216
-
217
- def _iter_dolly(local_path: Path):
218
- """Yield (instruction, input, output) from dolly-15k parquet."""
219
- import pyarrow.parquet as pq
220
- pf = pq.ParquetFile(str(local_path))
221
- # Schema: instruction, context, response, category
222
- for rg_idx in range(pf.num_row_groups):
223
- rg = pf.read_row_group(rg_idx)
224
- cols = {n: rg.column(n).to_pylist() for n in rg.schema.names}
225
- instr_col = cols.get("instruction") or cols.get("Instruction")
226
- ctx_col = cols.get("context") or cols.get("Context") or [""] * len(instr_col)
227
- resp_col = cols.get("response") or cols.get("Response")
228
- for instruction, context, response in zip(instr_col, ctx_col, resp_col):
229
- if instruction and response:
230
- yield instruction, (context or ""), response
231
-
232
-
233
- def _iter_backup():
234
- for q, a in BACKUP_QA:
235
- yield q, "", a
236
-
237
-
238
- # ---------------------------------------------------------------------------
239
- # Encoding
240
- # ---------------------------------------------------------------------------
241
-
242
- def encode_example(tok: _TokenizerWrapper, instruction: str,
243
- input_text: str, output: str) -> list[int]:
244
- """Serialize one instruction/response pair into a flat token list.
245
-
246
- Format:
247
- <BOS> <|user|> \\n {instr}\\n[{input}\\n] <|assistant|> \\n {output} <|end|> \\n
248
- """
249
- ids: list[int] = [BOS_ID, USER_ID]
250
- ids += tok.encode("\n" + instruction.strip())
251
- if input_text and input_text.strip():
252
- ids += tok.encode("\n" + input_text.strip())
253
- ids += tok.encode("\n")
254
- ids.append(ASSISTANT_ID)
255
- ids += tok.encode("\n" + output.strip())
256
- ids.append(END_ID)
257
- ids += tok.encode("\n")
258
- return ids
259
-
260
-
261
- def encode_example_with_mask(tok: _TokenizerWrapper, instruction: str,
262
- input_text: str, output: str
263
- ) -> tuple[list[int], list[int]]:
264
- """Return (tokens, mask) where mask[i]=1 means 'compute loss on token i'
265
- and mask[i]=0 means 'prompt, ignore'. The boundary is the <|assistant|>
266
- token: the assistant response (and <|end|>) contribute to loss; the
267
- user prompt does not."""
268
- prompt_ids = [BOS_ID, USER_ID] + tok.encode("\n" + instruction.strip())
269
- if input_text and input_text.strip():
270
- prompt_ids += tok.encode("\n" + input_text.strip())
271
- prompt_ids += tok.encode("\n")
272
- prompt_ids.append(ASSISTANT_ID)
273
-
274
- response_ids = tok.encode("\n" + output.strip())
275
- response_ids.append(END_ID)
276
- response_ids += tok.encode("\n")
277
-
278
- ids = prompt_ids + response_ids
279
- mask = [0] * len(prompt_ids) + [1] * len(response_ids)
280
- return ids, mask
281
-
282
-
283
- # ---------------------------------------------------------------------------
284
- # Shard writer
285
- # ---------------------------------------------------------------------------
286
-
287
- class ShardWriter:
288
- """Writes two parallel int16 files per shard:
289
- data/sft/shard_XXX.bin - token IDs
290
- data/sft/mask_XXX.bin - 0/1 loss mask
291
-
292
- Packs one example after another with no padding. At runtime, SFT builds
293
- sequences of length MAX_SEQ_LEN by slicing across these flat arrays.
294
- """
295
-
296
- def __init__(self, out_dir: Path, tokens_per_shard: int = TOKENS_PER_SHARD):
297
- self.out_dir = out_dir
298
- self.tokens_per_shard = tokens_per_shard
299
- self.shard_idx = 0
300
- self._buf_tok: list[int] = []
301
- self._buf_mask: list[int] = []
302
- self.total_tokens = 0
303
-
304
- def add(self, tokens: list[int], mask: list[int]):
305
- assert len(tokens) == len(mask)
306
- self._buf_tok.extend(tokens)
307
- self._buf_mask.extend(mask)
308
- self.total_tokens += len(tokens)
309
- while len(self._buf_tok) >= self.tokens_per_shard:
310
- self._flush_one(self.tokens_per_shard)
311
-
312
- def _flush_one(self, n: int):
313
- tok_path = self.out_dir / f"shard_{self.shard_idx:04d}.bin"
314
- mask_path = self.out_dir / f"mask_{self.shard_idx:04d}.bin"
315
- arr_tok = np.array(self._buf_tok[:n], dtype=DTYPE)
316
- arr_mask = np.array(self._buf_mask[:n], dtype=np.uint8)
317
- arr_tok.tofile(tok_path)
318
- arr_mask.tofile(mask_path)
319
- self._buf_tok = self._buf_tok[n:]
320
- self._buf_mask = self._buf_mask[n:]
321
- print(f" wrote {tok_path.name} ({n:,} tokens)", flush=True)
322
- self.shard_idx += 1
323
-
324
- def finalize(self):
325
- if self._buf_tok:
326
- self._flush_one(len(self._buf_tok))
327
-
328
-
329
- # ---------------------------------------------------------------------------
330
- # Main
331
- # ---------------------------------------------------------------------------
332
-
333
- def main():
334
- ap = argparse.ArgumentParser()
335
- ap.add_argument("--test", action="store_true",
336
- help="Small smoke run: write ~1.5M tokens and exit.")
337
- ap.add_argument("--offline", action="store_true",
338
- help="Skip network, use hard-coded backup only.")
339
- ap.add_argument("--target-tokens", type=int, default=None,
340
- help="Override target token count.")
341
- args = ap.parse_args()
342
-
343
- target = args.target_tokens or (
344
- TARGET_TOKENS_TEST if args.test else TARGET_TOKENS_DEFAULT
345
- )
346
-
347
- print(f"SFT_DIR: {SFT_DIR}")
348
- print(f"Target tokens: {target:,}")
349
- print(f"Offline mode: {args.offline}")
350
-
351
- # Clear any prior shards
352
- for p in SFT_DIR.glob("shard_*.bin"):
353
- p.unlink()
354
- for p in SFT_DIR.glob("mask_*.bin"):
355
- p.unlink()
356
-
357
- tok = load_tokenizer()
358
- print(f"Tokenizer vocab: {tok.vocab_size}")
359
- print(f"Special tokens: BOS={BOS_ID} USER={USER_ID} "
360
- f"ASSISTANT={ASSISTANT_ID} END={END_ID}")
361
-
362
- sources = [] # list of (name, iterator_fn)
363
- if not args.offline:
364
- alpaca_path = SFT_DIR / "alpaca_raw.parquet"
365
- print(f"\n[src] downloading alpaca-cleaned -> {alpaca_path.name} ...")
366
- if _download_parquet(ALPACA_URL, alpaca_path):
367
- print(f" ok ({alpaca_path.stat().st_size // (1 << 20)} MiB)")
368
- sources.append(("alpaca-cleaned", lambda: _iter_alpaca(alpaca_path)))
369
- else:
370
- print(" alpaca download FAILED, trying dolly...")
371
- dolly_path = SFT_DIR / "dolly_raw.parquet"
372
- if _download_parquet(DOLLY_URL, dolly_path):
373
- print(f" ok ({dolly_path.stat().st_size // (1 << 20)} MiB)")
374
- sources.append(("dolly-15k", lambda: _iter_dolly(dolly_path)))
375
-
376
- # Always include backup - cheap, catches tail
377
- sources.append(("backup-200", _iter_backup))
378
-
379
- if not sources:
380
- print("FATAL: no data sources available.", file=sys.stderr)
381
- sys.exit(1)
382
-
383
- # Stream-encode
384
- writer = ShardWriter(SFT_DIR)
385
- n_examples = 0
386
- n_assistant_tokens = 0
387
- source_counts = {}
388
-
389
- for src_name, src_fn in sources:
390
- print(f"\n[src] encoding {src_name} ...")
391
- src_examples = 0
392
- src_tokens = 0
393
- for (instruction, input_text, output) in src_fn():
394
- # Skip overly long outputs - 7.5M model can't use them
395
- if len(output) > 2000:
396
- output = output[:2000]
397
- ids, mask = encode_example_with_mask(tok, instruction,
398
- input_text, output)
399
- if len(ids) < 4 or len(ids) > 512:
400
- # Skip degenerate / too-long examples
401
- continue
402
- writer.add(ids, mask)
403
- n_examples += 1
404
- src_examples += 1
405
- src_tokens += len(ids)
406
- n_assistant_tokens += sum(mask)
407
- if writer.total_tokens >= target:
408
- break
409
- source_counts[src_name] = {
410
- "examples": src_examples,
411
- "tokens": src_tokens,
412
- }
413
- print(f" {src_name}: {src_examples:,} examples, {src_tokens:,} tokens")
414
- if writer.total_tokens >= target:
415
- break
416
-
417
- writer.finalize()
418
-
419
- meta = {
420
- "total_tokens": writer.total_tokens,
421
- "total_examples": n_examples,
422
- "assistant_tokens_in_loss": n_assistant_tokens,
423
- "num_shards": writer.shard_idx,
424
- "tokens_per_shard": TOKENS_PER_SHARD,
425
- "dtype": "int16",
426
- "vocab_size": tok.vocab_size,
427
- "special_tokens": {
428
- "bos": BOS_ID,
429
- "user": USER_ID,
430
- "assistant": ASSISTANT_ID,
431
- "end": END_ID,
432
- },
433
- "sources": source_counts,
434
- "format_hint": (
435
- "<BOS><|user|>\\n{instr}\\n[{input}\\n]<|assistant|>\\n"
436
- "{output}<|end|>\\n"
437
- ),
438
- }
439
- meta_path = SFT_DIR / "meta.json"
440
- with open(meta_path, "w") as f:
441
- json.dump(meta, f, indent=2)
442
-
443
- print(f"\n===== SFT data ready =====")
444
- print(f" examples: {n_examples:,}")
445
- print(f" total tokens: {writer.total_tokens:,}")
446
- print(f" loss tokens: {n_assistant_tokens:,}")
447
- print(f" shards: {writer.shard_idx}")
448
- print(f" meta: {meta_path}")
449
-
450
- if args.test and writer.total_tokens < 1_000_000:
451
- print(f"\nWARN: test mode produced only {writer.total_tokens:,} "
452
- f"tokens β€” below 1M threshold.")
453
- sys.exit(2)
454
-
455
-
456
- if __name__ == "__main__":
457
- main()
 
+ """Download + tokenize instruction data for HYDRA SFT.
+
+ Writes int16 token shards to `data/sft/shard_XXX.bin` plus a
+ `data/sft/meta.json` with counts + special-token mapping.
+
+ Chat format (vocab's 4 reserved special tokens are repurposed):
+     <BOS=8188> <|user|=8189>\n{instruction}\n{input?}\n <|assistant|=8190>\n
+     {output}<|end|=8191>\n
+
+ Special-token IDs are constants derived from the tokenizer (they are the
+ last 4 IDs in an 8192-vocab). They are stored in meta.json for the SFT
+ script to read.
+
+ Sources (tried in order):
+     1. yahma/alpaca-cleaned (~52K pairs via HF parquet auto-convert)
+     2. databricks/databricks-dolly-15k (~15K pairs)
+     3. Hard-coded 200 simple Q&A pairs (offline backup)
+
+ Usage:
+     python scripts/download_sft_data.py            # full download
+     python scripts/download_sft_data.py --test     # small smoke run
+     python scripts/download_sft_data.py --offline  # skip network; use backup
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import os
+ import pickle
+ import sys
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import requests
+
+ # Make `prepare` and `hydra.*` importable when run as a script
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
+ if str(_REPO_ROOT) not in sys.path:
+     sys.path.insert(0, str(_REPO_ROOT))
+
+
+ # ---------------------------------------------------------------------------
+ # Constants
+ # ---------------------------------------------------------------------------
+
+ CACHE_DIR = Path.home() / ".cache" / "autoresearch"
+ TOKENIZER_PKL = CACHE_DIR / "tokenizer" / "tokenizer.pkl"
+
+ SFT_DIR = _REPO_ROOT / "data" / "sft"
+ SFT_DIR.mkdir(parents=True, exist_ok=True)
+
+ # Reserved token repurposing -- must match prepare.py SPECIAL_TOKENS list
+ # (indices 8188-8191 in the 8192-vocab BPE).
+ BOS_ID = 8188        # <|reserved_0|>
+ USER_ID = 8189       # <|reserved_1|>
+ ASSISTANT_ID = 8190  # <|reserved_2|>
+ END_ID = 8191        # <|reserved_3|>
+
+ # Shards are int16 arrays of packed token IDs.
+ TOKENS_PER_SHARD = 1_048_576  # ~2 MB per shard
+ DTYPE = np.int16              # vocab_size=8192 fits in int16
+
+ TARGET_TOKENS_DEFAULT = 15_000_000  # ~15M instruction tokens
+ TARGET_TOKENS_TEST = 1_500_000      # smoke run
+
+ # HuggingFace auto-parquet endpoint -- one file for alpaca-cleaned
+ ALPACA_URL = (
+     "https://huggingface.co/api/datasets/yahma/alpaca-cleaned/parquet/"
+     "default/train/0.parquet"
+ )
+ DOLLY_URL = (
+     "https://huggingface.co/api/datasets/databricks/databricks-dolly-15k/"
+     "parquet/default/train/0.parquet"
+ )
+
+
+ # ---------------------------------------------------------------------------
+ # Offline backup Q&A pairs (used only if network unavailable)
+ # ---------------------------------------------------------------------------
+
+ _BACKUP_QA = [
+     ("What is the capital of France?", "The capital of France is Paris."),
+     ("What is the capital of Germany?", "The capital of Germany is Berlin."),
+     ("What is the capital of Japan?", "The capital of Japan is Tokyo."),
+     ("What is the capital of Italy?", "The capital of Italy is Rome."),
+     ("What is the capital of Spain?", "The capital of Spain is Madrid."),
+     ("What is the capital of England?", "The capital of England is London."),
+     ("What is the capital of Canada?", "The capital of Canada is Ottawa."),
+     ("What is the capital of Australia?", "The capital of Australia is Canberra."),
+     ("What is 2 plus 2?", "Two plus two equals four."),
+     ("What is 5 plus 5?", "Five plus five equals ten."),
+     ("What is 3 times 4?", "Three times four equals twelve."),
+     ("What is 10 minus 3?", "Ten minus three equals seven."),
+     ("Name a color.", "Red is a color. Other colors include blue, green, and yellow."),
+     ("Name a fruit.", "Apple is a fruit. Other fruits include banana, orange, and grape."),
+     ("Name an animal.", "A dog is an animal. Other animals include cat, horse, and cow."),
+     ("What do birds do?", "Birds fly through the air using their wings."),
+     ("What do fish do?", "Fish swim in water using their fins and tails."),
+     ("What is water made of?", "Water is made of hydrogen and oxygen."),
+     ("What is the sun?", "The sun is a star at the center of our solar system."),
+     ("What is the moon?", "The moon is Earth's natural satellite."),
+     ("What is a cloud?", "A cloud is a collection of tiny water droplets in the sky."),
+     ("What is rain?", "Rain is water that falls from clouds to the ground."),
+     ("What is snow?", "Snow is frozen water that falls from clouds in cold weather."),
+     ("What is wind?", "Wind is moving air."),
+     ("What is fire?", "Fire is a chemical reaction that produces heat and light."),
+     ("What is ice?", "Ice is frozen water."),
+     ("Say hello.", "Hello! How can I help you today?"),
+     ("Say goodbye.", "Goodbye! Have a great day."),
+     ("Tell me a short story.", "Once upon a time, a brave rabbit lived in the forest. The end."),
+     ("Tell me a joke.", "Why did the chicken cross the road? To get to the other side."),
+     ("Who wrote Hamlet?", "William Shakespeare wrote the play Hamlet."),
+     ("Who wrote Romeo and Juliet?", "William Shakespeare wrote Romeo and Juliet."),
+     ("Who painted the Mona Lisa?", "Leonardo da Vinci painted the Mona Lisa."),
+     ("When did World War 2 end?", "World War 2 ended in 1945."),
+     ("What is gravity?", "Gravity is the force that pulls objects toward the Earth."),
+     ("What is the speed of light?", "The speed of light is approximately 300,000 kilometers per second."),
+     ("What is the largest planet?", "Jupiter is the largest planet in our solar system."),
+     ("What is the smallest planet?", "Mercury is the smallest planet in our solar system."),
+     ("At what temperature does water boil?", "Water boils at 100 degrees Celsius or 212 degrees Fahrenheit."),
+     ("At what temperature does water freeze?", "Water freezes at 0 degrees Celsius or 32 degrees Fahrenheit."),
+     ("How many legs does a spider have?", "A spider has eight legs."),
+     ("How many legs does an insect have?", "An insect has six legs."),
+     ("What do plants need to grow?", "Plants need sunlight, water, soil, and air to grow."),
+     ("What do humans eat?", "Humans eat a variety of foods including fruits, vegetables, meat, and grains."),
+     ("What is a book?", "A book is a collection of written or printed pages bound together."),
+     ("What is a computer?", "A computer is an electronic device that processes information."),
+     ("What is a phone?", "A phone is a device used to communicate with people at a distance."),
+     ("What is music?", "Music is an arrangement of sounds that is pleasing to hear."),
+     ("What is art?", "Art is the expression of human creativity and imagination."),
+     ("What is a language?", "A language is a system of communication used by a group of people."),
+ ]
+
+ # Duplicate to reach ~200 samples (each pair appears ~4x)
+ BACKUP_QA = (_BACKUP_QA * 4)[:200]
+
+
+ # ---------------------------------------------------------------------------
+ # Tokenizer loader
+ # ---------------------------------------------------------------------------
+
+ class _TokenizerWrapper:
+     """Minimal wrapper around the pickled tiktoken.Encoding. We avoid
+     importing `prepare.Tokenizer` to sidestep its side effects (which
+     touch the running pretrain's cache files)."""
+
+     def __init__(self, enc):
+         self.enc = enc
+
+     def encode(self, text: str) -> list[int]:
+         return self.enc.encode_ordinary(text)
+
+     @property
+     def vocab_size(self) -> int:
+         return self.enc.n_vocab
+
+
+ def load_tokenizer() -> _TokenizerWrapper:
+     if not TOKENIZER_PKL.exists():
+         raise FileNotFoundError(
+             f"Tokenizer not found at {TOKENIZER_PKL}. Run `python prepare.py` "
+             f"first."
+         )
+     with open(TOKENIZER_PKL, "rb") as f:
+         enc = pickle.load(f)
+     tok = _TokenizerWrapper(enc)
+     assert tok.vocab_size == 8192, f"Expected vocab=8192, got {tok.vocab_size}"
+     return tok
+
+
+ # ---------------------------------------------------------------------------
+ # Source downloaders
+ # ---------------------------------------------------------------------------
+
+ def _download_parquet(url: str, local_path: Path, timeout: int = 60) -> bool:
+     """Stream-download a parquet file with retry. Returns True on success."""
+     local_path.parent.mkdir(parents=True, exist_ok=True)
+     tmp = local_path.with_suffix(local_path.suffix + ".tmp")
+     for attempt in range(1, 4):
+         try:
+             with requests.get(url, stream=True, timeout=timeout,
+                               allow_redirects=True) as r:
+                 r.raise_for_status()
+                 with open(tmp, "wb") as f:
+                     for chunk in r.iter_content(chunk_size=1 << 20):
+                         if chunk:
+                             f.write(chunk)
+             tmp.replace(local_path)
+             return True
+         except Exception as e:
+             print(f" [net] attempt {attempt} failed: {e}", flush=True)
+             for p in (tmp, local_path):
+                 try:
+                     p.unlink()
+                 except FileNotFoundError:
+                     pass
+             time.sleep(2 ** attempt)
+     return False
+
+
+ def _iter_alpaca(local_path: Path):
+     """Yield (instruction, input, output) from alpaca-cleaned parquet."""
+     import pyarrow.parquet as pq
+     pf = pq.ParquetFile(str(local_path))
+     for rg_idx in range(pf.num_row_groups):
+         rg = pf.read_row_group(rg_idx)
+         instr_col = rg.column("instruction").to_pylist()
+         input_col = rg.column("input").to_pylist()
+         output_col = rg.column("output").to_pylist()
+         for instruction, input_text, output in zip(instr_col, input_col, output_col):
+             if instruction and output:
+                 yield instruction, (input_text or ""), output
+
+
+ def _iter_dolly(local_path: Path):
+     """Yield (instruction, input, output) from dolly-15k parquet."""
+     import pyarrow.parquet as pq
+     pf = pq.ParquetFile(str(local_path))
+     # Schema: instruction, context, response, category
+     for rg_idx in range(pf.num_row_groups):
+         rg = pf.read_row_group(rg_idx)
+         cols = {n: rg.column(n).to_pylist() for n in rg.schema.names}
+         instr_col = cols.get("instruction") or cols.get("Instruction")
+         ctx_col = cols.get("context") or cols.get("Context") or [""] * len(instr_col)
+         resp_col = cols.get("response") or cols.get("Response")
+         for instruction, context, response in zip(instr_col, ctx_col, resp_col):
+             if instruction and response:
+                 yield instruction, (context or ""), response
+
+
+ def _iter_backup():
+     for q, a in BACKUP_QA:
+         yield q, "", a
+
+
+ # ---------------------------------------------------------------------------
+ # Encoding
+ # ---------------------------------------------------------------------------
+
+ def encode_example(tok: _TokenizerWrapper, instruction: str,
+                    input_text: str, output: str) -> list[int]:
+     """Serialize one instruction/response pair into a flat token list.
+
+     Format:
+         <BOS> <|user|> \\n {instr}\\n[{input}\\n] <|assistant|> \\n {output} <|end|> \\n
+     """
+     ids: list[int] = [BOS_ID, USER_ID]
+     ids += tok.encode("\n" + instruction.strip())
+     if input_text and input_text.strip():
+         ids += tok.encode("\n" + input_text.strip())
+     ids += tok.encode("\n")
+     ids.append(ASSISTANT_ID)
+     ids += tok.encode("\n" + output.strip())
+     ids.append(END_ID)
+     ids += tok.encode("\n")
+     return ids
+
+
+ def encode_example_with_mask(tok: _TokenizerWrapper, instruction: str,
+                              input_text: str, output: str
+                              ) -> tuple[list[int], list[int]]:
+     """Return (tokens, mask) where mask[i]=1 means 'compute loss on token i'
+     and mask[i]=0 means 'prompt, ignore'. The boundary is the <|assistant|>
+     token: the assistant response (and <|end|>) contribute to loss; the
+     user prompt does not."""
+     prompt_ids = [BOS_ID, USER_ID] + tok.encode("\n" + instruction.strip())
+     if input_text and input_text.strip():
+         prompt_ids += tok.encode("\n" + input_text.strip())
+     prompt_ids += tok.encode("\n")
+     prompt_ids.append(ASSISTANT_ID)
+
+     response_ids = tok.encode("\n" + output.strip())
+     response_ids.append(END_ID)
+     response_ids += tok.encode("\n")
+
+     ids = prompt_ids + response_ids
+     mask = [0] * len(prompt_ids) + [1] * len(response_ids)
+     return ids, mask
+
+
+ # ---------------------------------------------------------------------------
+ # Shard writer
+ # ---------------------------------------------------------------------------
+
+ class ShardWriter:
+     """Writes two parallel flat files per shard:
+         data/sft/shard_XXX.bin -- int16 token IDs
+         data/sft/mask_XXX.bin  -- uint8 0/1 loss mask
+
+     Packs one example after another with no padding. At runtime, SFT builds
+     sequences of length MAX_SEQ_LEN by slicing across these flat arrays.
+     """
+
+     def __init__(self, out_dir: Path, tokens_per_shard: int = TOKENS_PER_SHARD):
+         self.out_dir = out_dir
+         self.tokens_per_shard = tokens_per_shard
+         self.shard_idx = 0
+         self._buf_tok: list[int] = []
+         self._buf_mask: list[int] = []
+         self.total_tokens = 0
+
+     def add(self, tokens: list[int], mask: list[int]):
+         assert len(tokens) == len(mask)
+         self._buf_tok.extend(tokens)
+         self._buf_mask.extend(mask)
+         self.total_tokens += len(tokens)
+         while len(self._buf_tok) >= self.tokens_per_shard:
+             self._flush_one(self.tokens_per_shard)
+
+     def _flush_one(self, n: int):
+         tok_path = self.out_dir / f"shard_{self.shard_idx:04d}.bin"
+         mask_path = self.out_dir / f"mask_{self.shard_idx:04d}.bin"
+         arr_tok = np.array(self._buf_tok[:n], dtype=DTYPE)
+         arr_mask = np.array(self._buf_mask[:n], dtype=np.uint8)
+         arr_tok.tofile(tok_path)
+         arr_mask.tofile(mask_path)
+         self._buf_tok = self._buf_tok[n:]
+         self._buf_mask = self._buf_mask[n:]
+         print(f" wrote {tok_path.name} ({n:,} tokens)", flush=True)
+         self.shard_idx += 1
+
+     def finalize(self):
+         if self._buf_tok:
+             self._flush_one(len(self._buf_tok))
+
+
+ # ---------------------------------------------------------------------------
+ # Main
+ # ---------------------------------------------------------------------------
+
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--test", action="store_true",
+                     help="Small smoke run: write ~1.5M tokens and exit.")
+     ap.add_argument("--offline", action="store_true",
+                     help="Skip network, use hard-coded backup only.")
+     ap.add_argument("--target-tokens", type=int, default=None,
+                     help="Override target token count.")
+     args = ap.parse_args()
+
+     target = args.target_tokens or (
+         TARGET_TOKENS_TEST if args.test else TARGET_TOKENS_DEFAULT
+     )
+
+     print(f"SFT_DIR: {SFT_DIR}")
+     print(f"Target tokens: {target:,}")
+     print(f"Offline mode: {args.offline}")
+
+     # Clear any prior shards
+     for p in SFT_DIR.glob("shard_*.bin"):
+         p.unlink()
+     for p in SFT_DIR.glob("mask_*.bin"):
+         p.unlink()
+
+     tok = load_tokenizer()
+     print(f"Tokenizer vocab: {tok.vocab_size}")
+     print(f"Special tokens: BOS={BOS_ID} USER={USER_ID} "
+           f"ASSISTANT={ASSISTANT_ID} END={END_ID}")
+
+     sources = []  # list of (name, iterator_fn)
+     if not args.offline:
+         alpaca_path = SFT_DIR / "alpaca_raw.parquet"
+         print(f"\n[src] downloading alpaca-cleaned -> {alpaca_path.name} ...")
+         if _download_parquet(ALPACA_URL, alpaca_path):
+             print(f" ok ({alpaca_path.stat().st_size // (1 << 20)} MiB)")
+             sources.append(("alpaca-cleaned", lambda: _iter_alpaca(alpaca_path)))
+         else:
+             print(" alpaca download FAILED, trying dolly...")
+             dolly_path = SFT_DIR / "dolly_raw.parquet"
+             if _download_parquet(DOLLY_URL, dolly_path):
+                 print(f" ok ({dolly_path.stat().st_size // (1 << 20)} MiB)")
+                 sources.append(("dolly-15k", lambda: _iter_dolly(dolly_path)))
+
+     # Always include backup -- cheap, catches tail
+     sources.append(("backup-200", _iter_backup))
+
+     if not sources:
+         print("FATAL: no data sources available.", file=sys.stderr)
+         sys.exit(1)
+
+     # Stream-encode
+     writer = ShardWriter(SFT_DIR)
+     n_examples = 0
+     n_assistant_tokens = 0
+     source_counts = {}
+
+     for src_name, src_fn in sources:
+         print(f"\n[src] encoding {src_name} ...")
+         src_examples = 0
+         src_tokens = 0
+         for (instruction, input_text, output) in src_fn():
+             # Truncate overly long outputs -- 7.5M model can't use them
+             if len(output) > 2000:
+                 output = output[:2000]
+             ids, mask = encode_example_with_mask(tok, instruction,
+                                                  input_text, output)
+             if len(ids) < 4 or len(ids) > 512:
+                 # Skip degenerate / too-long examples
+                 continue
+             writer.add(ids, mask)
+             n_examples += 1
+             src_examples += 1
+             src_tokens += len(ids)
+             n_assistant_tokens += sum(mask)
+             if writer.total_tokens >= target:
+                 break
+         source_counts[src_name] = {
+             "examples": src_examples,
+             "tokens": src_tokens,
+         }
+         print(f" {src_name}: {src_examples:,} examples, {src_tokens:,} tokens")
+         if writer.total_tokens >= target:
+             break
+
+     writer.finalize()
+
+     meta = {
+         "total_tokens": writer.total_tokens,
+         "total_examples": n_examples,
+         "assistant_tokens_in_loss": n_assistant_tokens,
+         "num_shards": writer.shard_idx,
+         "tokens_per_shard": TOKENS_PER_SHARD,
+         "dtype": "int16",
+         "vocab_size": tok.vocab_size,
+         "special_tokens": {
+             "bos": BOS_ID,
+             "user": USER_ID,
+             "assistant": ASSISTANT_ID,
+             "end": END_ID,
+         },
+         "sources": source_counts,
+         "format_hint": (
+             "<BOS><|user|>\\n{instr}\\n[{input}\\n]<|assistant|>\\n"
+             "{output}<|end|>\\n"
+         ),
+     }
+     meta_path = SFT_DIR / "meta.json"
+     with open(meta_path, "w") as f:
+         json.dump(meta, f, indent=2)
+
+     print("\n===== SFT data ready =====")
+     print(f" examples: {n_examples:,}")
+     print(f" total tokens: {writer.total_tokens:,}")
+     print(f" loss tokens: {n_assistant_tokens:,}")
+     print(f" shards: {writer.shard_idx}")
+     print(f" meta: {meta_path}")
+
+     if args.test and writer.total_tokens < 1_000_000:
+         print(f"\nWARN: test mode produced only {writer.total_tokens:,} "
+               f"tokens -- below 1M threshold.")
+         sys.exit(2)
+
+
+ if __name__ == "__main__":
+     main()
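
For context on how these flat shard/mask pairs are consumed, here is a minimal sketch of the runtime slicing that the `ShardWriter` docstring refers to. It is illustrative only: the real consumer is `overlay/scripts/sft.py` (not shown in this hunk), and the function name and `seq_len` parameter are assumptions rather than that script's actual API.

import numpy as np

def sample_sft_window(shard_path, mask_path, seq_len, rng):
    # Memory-map the parallel arrays written by ShardWriter above:
    # int16 token IDs and a uint8 0/1 loss mask of identical length.
    tokens = np.memmap(shard_path, dtype=np.int16, mode="r")
    mask = np.memmap(mask_path, dtype=np.uint8, mode="r")
    # Examples are packed back-to-back with no padding, so a window may
    # straddle an <|end|> boundary; the mask keeps prompt tokens out of
    # the loss regardless of where the slice lands.
    start = int(rng.integers(0, len(tokens) - seq_len - 1))
    x = tokens[start:start + seq_len].astype(np.int64)          # inputs
    y = tokens[start + 1:start + seq_len + 1].astype(np.int64)  # next-token targets
    w = mask[start + 1:start + seq_len + 1].astype(np.float32)  # per-target loss weight
    return x, y, w

For example, sample_sft_window(SFT_DIR / "shard_0000.bin", SFT_DIR / "mask_0000.bin", 512, np.random.default_rng(0)) would yield one 512-token training window with its loss weights.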
overlay/scripts/eval_quality.py CHANGED
@@ -1,525 +1,525 @@
 
1
+ #!/usr/bin/env python3
2
+ """Comprehensive quality evaluation harness for HYDRA.
3
+
4
+ Computes: PPL, BLEU-1, BLEU-4, ROUGE-1, ROUGE-L, factual accuracy,
5
+ coherence metrics (distinct-2, repetition-rate, self-BLEU), and a
6
+ composite quality_score.
7
+
8
+ Usage:
9
+ python scripts/eval_quality.py # eval latest model
10
+ python scripts/eval_quality.py --checkpoint ckpt.pt # eval from checkpoint
11
+
12
+ All metrics printed as key=value (grep-friendly). Runs in <30s on RTX 3060.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import math
18
+ import os
19
+ import sys
20
+ import time
21
+ from collections import Counter
22
+ from typing import Optional
23
+
24
+ # Ensure project root is on path
25
+ _PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
26
+ if _PROJECT_ROOT not in sys.path:
27
+ sys.path.insert(0, _PROJECT_ROOT)
28
+
29
+ import torch
30
+ import torch.nn.functional as F
31
+
32
+ from hydra.config import (
33
+ D_MODEL, D_STATE, DEVICE_BATCH_SIZE, ENGRAM_KEY_DIM,
34
+ ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND, HEADDIM,
35
+ N_HEADS, N_LAYER, PostSemClawConfig,
36
+ )
37
+ from hydra.eval import FACTUAL_EVAL
38
+ from prepare import MAX_SEQ_LEN, Tokenizer, evaluate_bpb
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Eval prompts (hardcoded for reproducibility)
42
+ # ---------------------------------------------------------------------------
43
+
44
+ EVAL_PROMPTS = [
45
+ "The capital of France is",
46
+ "In 1969, humans first",
47
+ "Water boils at a temperature of",
48
+ "The theory of relativity was developed by",
49
+ "The largest planet in our solar system is",
50
+ "Photosynthesis is the process by which",
51
+ "The stock market crashed in",
52
+ "DNA stands for",
53
+ "The speed of light is approximately",
54
+ "Shakespeare wrote the play",
55
+ "The mitochondria is often called the",
56
+ "In computer science, an algorithm is",
57
+ "The chemical symbol for gold is",
58
+ "The Great Wall of China was built to",
59
+ "Gravity is a force that",
60
+ "The human heart pumps blood through",
61
+ "The Amazon rainforest is located in",
62
+ "Pi is approximately equal to",
63
+ "The first President of the United States was",
64
+ "Oxygen makes up approximately",
65
+ ]
66
+
67
+ # Reference continuations (approximate, for BLEU/ROUGE)
68
+ EVAL_REFERENCES = [
69
+ "Paris, which is also the largest city in France.",
70
+ "landed on the Moon during the Apollo 11 mission.",
71
+ "100 degrees Celsius or 212 degrees Fahrenheit at standard atmospheric pressure.",
72
+ "Albert Einstein in the early twentieth century.",
73
+ "Jupiter, which is a gas giant.",
74
+ "plants convert sunlight into chemical energy and produce oxygen.",
75
+ "1929, leading to the Great Depression.",
76
+ "deoxyribonucleic acid, which carries genetic information.",
77
+ "299,792 kilometers per second in a vacuum.",
78
+ "Romeo and Juliet, one of the most famous tragedies.",
79
+ "powerhouse of the cell because it produces energy.",
80
+ "a step by step procedure for solving a problem.",
81
+ "Au, from the Latin word aurum.",
82
+ "protect against invasions from the north.",
83
+ "attracts objects with mass toward each other.",
84
+ "the circulatory system to deliver oxygen and nutrients.",
85
+ "South America, primarily within Brazil.",
86
+ "3.14159, and it represents the ratio of circumference to diameter.",
87
+ "George Washington, who served from 1789 to 1797.",
88
+ "21 percent of the Earth's atmosphere.",
89
+ ]
90
+
91
+ COHERENCE_PROMPTS = [
92
+ "The history of science shows that",
93
+ "In modern society, technology has",
94
+ "The relationship between education and",
95
+ "Climate change is affecting the world because",
96
+ "The development of artificial intelligence has led to",
97
+ "Throughout human history, art has been",
98
+ "The economy of a nation depends on",
99
+ "Medical research has shown that",
100
+ "The role of government in society is",
101
+ "The ocean covers more than",
102
+ ]
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Manual BLEU implementation (no nltk dependency)
107
+ # ---------------------------------------------------------------------------
108
+
109
+ def _get_ngrams(tokens: list[str], n: int) -> Counter:
110
+ """Extract n-gram counts from token list."""
111
+ return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
112
+
113
+
114
+ def _modified_precision(reference_tokens: list[str], hypothesis_tokens: list[str], n: int) -> tuple[int, int]:
115
+ """Compute modified precision for n-grams."""
116
+ ref_ngrams = _get_ngrams(reference_tokens, n)
117
+ hyp_ngrams = _get_ngrams(hypothesis_tokens, n)
118
+ clipped_count = 0
119
+ total_count = 0
120
+ for ngram, count in hyp_ngrams.items():
121
+ clipped_count += min(count, ref_ngrams.get(ngram, 0))
122
+ total_count += count
123
+ return clipped_count, max(total_count, 1)
124
+
125
+
126
+ def compute_bleu(references: list[list[str]], hypotheses: list[list[str]], max_n: int = 4) -> dict[str, float]:
127
+ """Corpus-level BLEU-1 through BLEU-max_n.
128
+
129
+ Uses brevity penalty and geometric mean of modified precisions.
130
+ """
131
+ precisions = []
132
+ for n in range(1, max_n + 1):
133
+ total_clip = 0
134
+ total_count = 0
135
+ for ref, hyp in zip(references, hypotheses):
136
+ clip, count = _modified_precision(ref, hyp, n)
137
+ total_clip += clip
138
+ total_count += count
139
+ precisions.append(total_clip / max(total_count, 1))
140
+
141
+ # Brevity penalty
142
+ ref_len = sum(len(r) for r in references)
143
+ hyp_len = sum(len(h) for h in hypotheses)
144
+ if hyp_len == 0:
145
+ return {f"bleu{n}": 0.0 for n in range(1, max_n + 1)}
146
+ bp = math.exp(min(0, 1 - ref_len / hyp_len))
147
+
148
+ result = {}
149
+ for n in range(1, max_n + 1):
150
+ # Geometric mean of precisions 1..n
151
+ log_avg = sum(math.log(max(p, 1e-10)) for p in precisions[:n]) / n
152
+ result[f"bleu{n}"] = bp * math.exp(log_avg)
153
+ return result
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Manual ROUGE implementation (no rouge_score dependency)
158
+ # ---------------------------------------------------------------------------
159
+
160
+ def _lcs_length(x: list[str], y: list[str]) -> int:
161
+ """Longest common subsequence length via DP."""
162
+ m, n = len(x), len(y)
163
+ if m == 0 or n == 0:
164
+ return 0
165
+ # Space-optimized: only keep current and previous row
166
+ prev = [0] * (n + 1)
167
+ curr = [0] * (n + 1)
168
+ for i in range(1, m + 1):
169
+ for j in range(1, n + 1):
170
+ if x[i - 1] == y[j - 1]:
171
+ curr[j] = prev[j - 1] + 1
172
+ else:
173
+ curr[j] = max(prev[j], curr[j - 1])
174
+ prev, curr = curr, [0] * (n + 1)
175
+ return prev[n]
176
+
177
+
178
+ def compute_rouge(references: list[list[str]], hypotheses: list[list[str]]) -> dict[str, float]:
179
+ """Compute ROUGE-1 (unigram F1) and ROUGE-L (LCS-based F1)."""
180
+ rouge1_scores = []
181
+ rougel_scores = []
182
+
183
+ for ref, hyp in zip(references, hypotheses):
184
+ if not ref or not hyp:
185
+ rouge1_scores.append(0.0)
186
+ rougel_scores.append(0.0)
187
+ continue
188
+
189
+ # ROUGE-1: unigram overlap
190
+ ref_unigrams = Counter(ref)
191
+ hyp_unigrams = Counter(hyp)
192
+ overlap = sum((ref_unigrams & hyp_unigrams).values())
193
+ r1_precision = overlap / max(len(hyp), 1)
194
+ r1_recall = overlap / max(len(ref), 1)
195
+ r1_f1 = 2 * r1_precision * r1_recall / max(r1_precision + r1_recall, 1e-10)
196
+ rouge1_scores.append(r1_f1)
197
+
198
+ # ROUGE-L: LCS-based
199
+ lcs = _lcs_length(ref, hyp)
200
+ rl_precision = lcs / max(len(hyp), 1)
201
+ rl_recall = lcs / max(len(ref), 1)
202
+ rl_f1 = 2 * rl_precision * rl_recall / max(rl_precision + rl_recall, 1e-10)
203
+ rougel_scores.append(rl_f1)
204
+
205
+ return {
206
+ "rouge1": sum(rouge1_scores) / max(len(rouge1_scores), 1),
207
+ "rouge_l": sum(rougel_scores) / max(len(rougel_scores), 1),
208
+ }
209
+
210
+
211
+ # ---------------------------------------------------------------------------
212
+ # Greedy generation
213
+ # ---------------------------------------------------------------------------
214
+
215
+ @torch.no_grad()
216
+ def greedy_generate(model, tokenizer, prompt: str, max_new_tokens: int = 32, device: str = "cuda") -> str:
217
+ """Greedy (argmax) autoregressive generation. Deterministic."""
218
+ ids = tokenizer.encode(prompt)
219
+ x = torch.tensor([ids], device=device, dtype=torch.long)
220
+
221
+ for _ in range(max_new_tokens):
222
+ logits = model(x, targets=None)
223
+ if logits.dim() == 3:
224
+ next_logits = logits[0, -1, :]
225
+ else:
226
+ next_logits = logits[0]
227
+ next_id = next_logits.argmax().unsqueeze(0).unsqueeze(0)
228
+ x = torch.cat([x, next_id], dim=1)
229
+ if x.size(1) >= MAX_SEQ_LEN:
230
+ break
231
+
232
+ all_ids = x[0].tolist()
233
+ return tokenizer.decode(all_ids[len(ids):])
234
+
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Coherence metrics
238
+ # ---------------------------------------------------------------------------
239
+
240
+ def compute_coherence(generations: list[str]) -> dict[str, float]:
241
+ """Compute distinct-2, repetition rate, and self-BLEU across generations."""
242
+ all_bigrams = []
243
+ all_fourgrams = []
244
+ tokenized_gens = []
245
+
246
+ for gen in generations:
247
+ tokens = gen.lower().split()
248
+ tokenized_gens.append(tokens)
249
+ bigrams = [tuple(tokens[i:i + 2]) for i in range(len(tokens) - 1)]
250
+ fourgrams = [tuple(tokens[i:i + 4]) for i in range(len(tokens) - 3)]
251
+ all_bigrams.extend(bigrams)
252
+ all_fourgrams.extend(fourgrams)
253
+
254
+ # Distinct-2: fraction of unique bigrams
255
+ distinct2 = len(set(all_bigrams)) / max(len(all_bigrams), 1)
256
+
257
+ # Repetition rate: fraction of 4-grams that appear more than once
258
+ fourgram_counts = Counter(all_fourgrams)
259
+ repeated = sum(1 for c in fourgram_counts.values() if c > 1)
260
+ repetition_rate = repeated / max(len(fourgram_counts), 1)
261
+
262
+ # Self-BLEU: average BLEU of each generation against all others
263
+ # Lower = more diverse
264
+ self_bleu_scores = []
265
+ for i, hyp in enumerate(tokenized_gens):
266
+ if not hyp:
267
+ continue
268
+ others = [g for j, g in enumerate(tokenized_gens) if j != i and g]
269
+ if not others:
270
+ continue
271
+ # Average BLEU against each other generation
272
+ pair_scores = []
273
+ for ref in others:
274
+ result = compute_bleu([ref], [hyp], max_n=4)
275
+ pair_scores.append(result.get("bleu4", 0.0))
276
+ self_bleu_scores.append(sum(pair_scores) / len(pair_scores))
277
+
278
+ self_bleu = sum(self_bleu_scores) / max(len(self_bleu_scores), 1)
279
+
280
+ return {
281
+ "distinct2": distinct2,
282
+ "repetition_rate": repetition_rate,
283
+ "self_bleu": self_bleu,
284
+ }
285
+
286
+
287
+ # ---------------------------------------------------------------------------
288
+ # Factual accuracy (reuse existing probes)
289
+ # ---------------------------------------------------------------------------
290
+
291
+ def compute_factual(model, tokenizer, device: str = "cuda") -> float:
292
+ """Run factual eval probes, return accuracy [0,1]."""
293
+ model.eval()
294
+ hits = 0
295
+
296
+ with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
297
+ for prompt, answers in FACTUAL_EVAL:
298
+ ids = tokenizer.encode(prompt)
299
+ x = torch.tensor([ids], device=device, dtype=torch.long)
300
+ logits = model(x, targets=None)
301
+ if logits.dim() == 3:
302
+ last_logits = logits[0, -1, :]
303
+ else:
304
+ last_logits = logits[0]
305
+
306
+ probs = torch.softmax(last_logits.float(), dim=-1)
307
+ top_k = min(20, probs.shape[-1])
308
+ top_ids = torch.topk(probs, top_k).indices.tolist()
309
+ top_tokens = [tokenizer.decode([tid]).strip().lower() for tid in top_ids]
310
+ answers_lower = [a.lower() for a in answers]
311
+ if any(any(a in tok for a in answers_lower) for tok in top_tokens):
312
+ hits += 1
313
+
314
+ return hits / max(len(FACTUAL_EVAL), 1)
315
+
316
+
317
+ # ---------------------------------------------------------------------------
318
+ # PPL (perplexity) via existing evaluate_bpb
319
+ # ---------------------------------------------------------------------------
320
+
321
+ def compute_ppl(model, tokenizer, batch_size: int = 8) -> tuple[float, float]:
322
+ """Compute BPB and PPL. Returns (bpb, ppl)."""
323
+ import prepare as _prepare_mod
324
+ # Use smaller eval set for speed (<30s budget)
325
+ orig_eval = _prepare_mod.EVAL_TOKENS
326
+ _prepare_mod.EVAL_TOKENS = 2 * 524288 # ~1M tokens, fast
327
+ try:
328
+ with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
329
+ bpb = evaluate_bpb(model, tokenizer, batch_size)
330
+ finally:
331
+ _prepare_mod.EVAL_TOKENS = orig_eval
332
+ ppl = 2 ** bpb
333
+ return bpb, ppl
334
+
335
+
336
+ # ---------------------------------------------------------------------------
337
+ # Composite quality score
338
+ # ---------------------------------------------------------------------------
339
+
340
+ def compute_quality_score(ppl: float, bleu4: float, rouge_l: float,
341
+ factual: float, repetition_rate: float) -> float:
342
+ """Single composite metric for autoresearch optimization.
343
+
344
+ Formula rationale:
345
+ - PPL (30%): Primary language modeling metric, capped at 100
346
+ - BLEU-4 (20%): Generation quality vs references
347
+ - ROUGE-L (20%): Recall of reference content
348
+ - Factual (15%): Knowledge memorization
349
+ - 1-repetition (15%): Diversity/coherence
350
+ """
351
+ return (
352
+ 0.3 * (1 - min(ppl, 100) / 100) +
353
+ 0.2 * bleu4 +
354
+ 0.2 * rouge_l +
355
+ 0.15 * factual +
356
+ 0.15 * (1 - repetition_rate)
357
+ )
358
+
359
+
360
+ # ---------------------------------------------------------------------------
361
+ # Main evaluation entry point
362
+ # ---------------------------------------------------------------------------
363
+
364
+ def run_quality_eval(
+     model: torch.nn.Module,
+     tokenizer,
+     device: str = "cuda",
+     batch_size: int = 8,
+     verbose: bool = True,
+ ) -> dict[str, float]:
+     """Run full quality evaluation suite. Returns dict of all metrics."""
+     model.eval()
+     results: dict[str, float] = {}
+
+     t0 = time.time()
+
+     # 1. PPL / BPB
+     if verbose:
+         print("[eval] Computing PPL/BPB...", flush=True)
+     bpb, ppl = compute_ppl(model, tokenizer, batch_size)
+     results["bpb"] = bpb
+     results["ppl"] = ppl
+
+     # 2. Generate continuations for BLEU/ROUGE
+     if verbose:
+         print("[eval] Generating continuations (20 prompts, greedy)...", flush=True)
+     hypotheses_text = []
+     with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+         for prompt in EVAL_PROMPTS:
+             gen = greedy_generate(model, tokenizer, prompt, max_new_tokens=32, device=device)
+             hypotheses_text.append(gen)
+
+     # Tokenize for BLEU/ROUGE (simple whitespace split)
+     ref_tokens = [ref.lower().split() for ref in EVAL_REFERENCES]
+     hyp_tokens = [hyp.lower().split() for hyp in hypotheses_text]
+
+     # 3. BLEU
+     if verbose:
+         print("[eval] Computing BLEU...", flush=True)
+     bleu = compute_bleu(ref_tokens, hyp_tokens, max_n=4)
+     results["bleu1"] = bleu["bleu1"]
+     results["bleu4"] = bleu["bleu4"]
+
+     # 4. ROUGE
+     if verbose:
+         print("[eval] Computing ROUGE...", flush=True)
+     rouge = compute_rouge(ref_tokens, hyp_tokens)
+     results["rouge1"] = rouge["rouge1"]
+     results["rouge_l"] = rouge["rouge_l"]
+
+     # 5. Factual accuracy
+     if verbose:
+         print("[eval] Computing factual accuracy...", flush=True)
+     with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+         factual = compute_factual(model, tokenizer, device)
+     results["factual"] = factual
+
+     # 6. Coherence
+     if verbose:
+         print("[eval] Generating coherence passages (10 prompts, 64 tokens)...", flush=True)
+     coherence_gens = []
+     with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+         for prompt in COHERENCE_PROMPTS:
+             gen = greedy_generate(model, tokenizer, prompt, max_new_tokens=64, device=device)
+             coherence_gens.append(gen)
+
+     coherence = compute_coherence(coherence_gens)
+     results["distinct2"] = coherence["distinct2"]
+     results["repetition_rate"] = coherence["repetition_rate"]
+     results["self_bleu"] = coherence["self_bleu"]
+
+     # 7. Composite score
+     results["quality_score"] = compute_quality_score(
+         ppl=results["ppl"],
+         bleu4=results["bleu4"],
+         rouge_l=results["rouge_l"],
+         factual=results["factual"],
+         repetition_rate=results["repetition_rate"],
+     )
+
+     elapsed = time.time() - t0
+     results["eval_time_s"] = elapsed
+
+     # Print all metrics
+     if verbose:
+         print("\n--- Quality Evaluation Results ---")
+         for k, v in sorted(results.items()):
+             print(f"{k}={v:.6f}")
+         print("--- End Quality Evaluation ---\n")
+
+     # Print sample generations
+     print("--- Sample Generations ---")
+     for i, (prompt, gen) in enumerate(zip(EVAL_PROMPTS[:5], hypotheses_text[:5])):
+         print(f' [{i}] "{prompt}" -> "{gen.strip()[:80]}"')
+     print("--- End Sample Generations ---\n")
+
+     print("--- Coherence Samples ---")
+     for i, (prompt, gen) in enumerate(zip(COHERENCE_PROMPTS[:3], coherence_gens[:3])):
+         print(f' [{i}] "{prompt}" -> "{gen.strip()[:100]}"')
+     print("--- End Coherence Samples ---\n")
+
+     return results
+
+
+ # ---------------------------------------------------------------------------
+ # Standalone CLI
+ # ---------------------------------------------------------------------------
+
+ def _build_model_and_tokenizer(checkpoint: Optional[str] = None):
+     """Build model + tokenizer, optionally loading from checkpoint."""
+     from hydra.model import PostSemClawModel
+
+     device = torch.device("cuda")
+     tokenizer = Tokenizer.from_directory()
+     vocab_size = tokenizer.get_vocab_size()
+
+     config = PostSemClawConfig(
+         sequence_len=MAX_SEQ_LEN,
+         vocab_size=vocab_size,
+         n_layer=N_LAYER,
+         d_model=D_MODEL,
+         d_state=D_STATE,
+         headdim=HEADDIM,
+         n_heads=N_HEADS,
+         expand=EXPAND,
+         engram_n_columns=ENGRAM_N_COLUMNS,
+         engram_key_dim=ENGRAM_KEY_DIM,
+         engram_layer_idx=ENGRAM_LAYER_IDX,
+     )
+
+     with torch.device("meta"):
+         model = PostSemClawModel(config)
+     model.to_empty(device=device)
+
+     if checkpoint and os.path.exists(checkpoint):
+         print(f"[eval] Loading checkpoint: {checkpoint}")
+         state = torch.load(checkpoint, map_location=device, weights_only=True)
+         model.load_state_dict(state, strict=False)
+     else:
+         print("[eval] No checkpoint — using freshly initialized weights")
+         model.init_weights()
+
+     model.eval()
+     return model, tokenizer, device
+
+
+ def main():
+     import argparse
+     parser = argparse.ArgumentParser(description="HYDRA quality evaluation")
+     parser.add_argument("--checkpoint", type=str, default=None, help="Path to model checkpoint")
+     parser.add_argument("--batch-size", type=int, default=DEVICE_BATCH_SIZE, help="Batch size for PPL eval")
+     args = parser.parse_args()
+
+     model, tokenizer, device = _build_model_and_tokenizer(args.checkpoint)
+     results = run_quality_eval(model, tokenizer, str(device), args.batch_size, verbose=True)
+
+     # Final summary line (grep-friendly)
+     print(f"QUALITY_SCORE={results['quality_score']:.6f} PPL={results['ppl']:.3f} "
+           f"BPB={results['bpb']:.4f} BLEU4={results['bleu4']:.4f} "
+           f"ROUGE_L={results['rouge_l']:.4f} FACTUAL={results['factual']:.4f} "
+           f"REP_RATE={results['repetition_rate']:.4f}")
+
+
+ if __name__ == "__main__":
+     main()
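Aside: the summary line printed by main() above is deliberately grep-friendly, a single line of space-separated KEY=VALUE pairs. A minimal sketch (not part of the diff) of how a driver might parse it back into floats; the log-file path in the usage comment is hypothetical:

# Sketch: parse "QUALITY_SCORE=0.123456 PPL=24.510 ..." into a float dict.
def parse_summary_line(line: str) -> dict[str, float]:
    out: dict[str, float] = {}
    for pair in line.strip().split():
        key, _, value = pair.partition("=")
        if key and value:
            out[key] = float(value)
    return out

# Usage (hypothetical log path): keep the last summary line of a job log.
# with open("job.log") as fh:
#     summaries = [l for l in fh if l.startswith("QUALITY_SCORE=")]
# metrics = parse_summary_line(summaries[-1])
# print(metrics["PPL"], metrics["BLEU4"])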
overlay/scripts/fetch_corpus.py CHANGED
@@ -1,211 +1,211 @@
"""
Fetch additional training shards from karpathy/climbmix-400b-shuffle.

The repo already has ~500 shards (~31B tokens). This script is a
resumable, parallel downloader for cases where more shards are needed
(e.g., multi-day training, experiments requiring fresh-unseen data,
or when we want to split the corpus across processes).

Usage:
    # Fetch shards up to index 600 (total cap)
    python scripts/fetch_corpus.py --target-shards 600

    # Fetch a specific range
    python scripts/fetch_corpus.py --start 500 --end 800

    # Dry-run (list what would be downloaded)
    python scripts/fetch_corpus.py --target-shards 600 --dry-run

Notes:
    - Safe to run while training is active; only writes files not touched
      by the training process.
    - Resumable: skips shards already on disk.
    - Downloads to the same DATA_DIR used by prepare.py so they're picked
      up on next training launch.
"""
from __future__ import annotations

import argparse
import os
import shutil
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import requests

REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))

from prepare import BASE_URL, DATA_DIR, MAX_SHARD, VAL_SHARD  # noqa: E402


def human_bytes(n: int) -> str:
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if n < 1024:
            return f"{n:.1f}{unit}"
        n /= 1024
    return f"{n:.1f}PB"


def download_one(
    index: int, data_dir: str, timeout: int = 30, max_attempts: int = 5
) -> tuple[int, bool, int, str]:
    """
    Download a single parquet shard. Resumable + retry with exponential backoff.
    Returns (index, success, bytes_written, message).
    """
    filename = f"shard_{index:05d}.parquet"
    filepath = os.path.join(data_dir, filename)
    tmp_path = filepath + ".tmp"

    if os.path.exists(filepath):
        return index, True, 0, "already-present"

    url = f"{BASE_URL}/{filename}"
    for attempt in range(1, max_attempts + 1):
        try:
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                bytes_written = 0
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        if chunk:
                            f.write(chunk)
                            bytes_written += len(chunk)
            os.rename(tmp_path, filepath)
            return index, True, bytes_written, f"ok (attempt {attempt})"
        except (requests.RequestException, OSError) as e:
            # Clean up partial file.
            for p in (tmp_path, filepath):
                if os.path.exists(p):
                    try:
                        os.remove(p)
                    except OSError:
                        pass
            if attempt < max_attempts:
                wait = 2 ** attempt
                time.sleep(wait)
                continue
            return index, False, 0, f"failed after {max_attempts} attempts: {e}"

    return index, False, 0, "unknown failure"


def check_disk_space(required_bytes: int, data_dir: str) -> tuple[bool, int]:
    """Ensure we have at least required_bytes + 10% headroom free."""
    os.makedirs(data_dir, exist_ok=True)
    stats = shutil.disk_usage(data_dir)
    headroom = int(required_bytes * 1.1)
    return stats.free >= headroom, stats.free


def main() -> int:
    parser = argparse.ArgumentParser(
        description="Fetch additional climbmix-400b-shuffle shards"
    )
    parser.add_argument(
        "--target-shards",
        type=int,
        default=None,
        help="Total train-shard count to reach (0..target-1). Mutually exclusive with --start/--end.",
    )
    parser.add_argument("--start", type=int, default=None, help="Starting shard index (inclusive)")
    parser.add_argument("--end", type=int, default=None, help="Ending shard index (exclusive)")
    parser.add_argument("--workers", type=int, default=8, help="Parallel download workers")
    parser.add_argument(
        "--include-val",
        action="store_true",
        help="Also fetch the pinned validation shard (normally present already)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="List what would be downloaded without fetching",
    )
    args = parser.parse_args()

    # Resolve shard range.
    if args.target_shards is not None:
        if args.start is not None or args.end is not None:
            print("ERROR: --target-shards is exclusive with --start/--end")
            return 1
        ids = list(range(min(args.target_shards, MAX_SHARD)))
    else:
        start = args.start or 0
        end = args.end if args.end is not None else MAX_SHARD
        end = min(end, MAX_SHARD)
        ids = list(range(start, end))

    if args.include_val and VAL_SHARD not in ids:
        ids.append(VAL_SHARD)

    os.makedirs(DATA_DIR, exist_ok=True)
    present = set()
    for p in Path(DATA_DIR).glob("shard_*.parquet"):
        try:
            idx = int(p.stem.split("_")[1])
            present.add(idx)
        except (IndexError, ValueError):
            continue

    to_fetch = [i for i in ids if i not in present]
    if not to_fetch:
        print(f"All {len(ids)} shards already present at {DATA_DIR}")
        return 0

    # Estimate space: shards are ~88MB; leave 10% headroom.
    avg_shard_bytes = 90 * (1 << 20)  # 90MB
    required = avg_shard_bytes * len(to_fetch)
    ok, free = check_disk_space(required, DATA_DIR)
    print(f"Plan: fetch {len(to_fetch)} shards (~{human_bytes(required)}); "
          f"disk free: {human_bytes(free)}")
    if not ok:
        print("ERROR: insufficient disk space (need 1.1x required)")
        return 2

    if args.dry_run:
        preview = to_fetch[:10]
        print(
            f"Dry-run — would fetch {len(to_fetch)} shards. First {len(preview)}: {preview}"
        )
        return 0

    print(f"Downloading {len(to_fetch)} shards with {args.workers} workers...")
    t_start = time.time()
    success = 0
    failed = 0
    total_bytes = 0

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futs = {ex.submit(download_one, i, DATA_DIR): i for i in to_fetch}
        for fut in as_completed(futs):
            idx, ok, nbytes, msg = fut.result()
            if ok:
                success += 1
                total_bytes += nbytes
                if success % 10 == 0 or success == len(to_fetch):
                    elapsed = time.time() - t_start
                    rate = total_bytes / max(elapsed, 1)
                    print(
                        f" [{success}/{len(to_fetch)}] shard_{idx:05d} ok "
                        f"({human_bytes(total_bytes)} @ {human_bytes(int(rate))}/s)"
                    )
            else:
                failed += 1
                print(f" [FAIL] shard_{idx:05d}: {msg}")

    elapsed = time.time() - t_start
    print()
    print("=" * 60)
    print(f"Downloaded {success}/{len(to_fetch)} shards in {elapsed:.1f}s")
    print(f"Failed: {failed}")
    print(f"Total bytes: {human_bytes(total_bytes)}")
    print("=" * 60)

    return 0 if failed == 0 else 3


if __name__ == "__main__":
    raise SystemExit(main())
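Aside: a quick check (illustration only) of the backoff schedule in download_one above. The loop sleeps wait = 2 ** attempt before each retry, and only for attempt < max_attempts, so with the default max_attempts=5 a fully failing shard waits 2 + 4 + 8 + 16 = 30 seconds in total before the final error is returned:

max_attempts = 5
# Sleeps happen after attempts 1..4; attempt 5 returns the failure directly.
waits = [2 ** attempt for attempt in range(1, max_attempts)]
assert waits == [2, 4, 8, 16]
assert sum(waits) == 30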
overlay/scripts/grad_probe.py CHANGED
@@ -1,196 +1,196 @@
"""
Gradient flow probe for PostSemClawModel.

READ-ONLY diagnostic. Does NOT modify any source, does NOT train, does NOT
step an optimizer. Runs one forward + backward and reports, per-parameter:

    name, shape, dtype, requires_grad, grad-is-None?, |grad|.mean, |grad|.norm

Severity classification at the bottom:
    BLOCKER — requires_grad=True but p.grad is None (disconnected from graph)
    WARNING — grad present but literally zero (ops cancel, wd_init, etc.)
    WARNING — requires_grad=True but param missing from every optimizer group
    OK — everything else

Usage:
    .venv/bin/python -u scripts/grad_probe.py
"""

from __future__ import annotations

import os
import sys
from pathlib import Path

# Ensure the project root is on sys.path (so `train`, `subsystems`, `prepare`
# resolve when we run from any cwd). Probe is intentionally a thin wrapper.
HERE = Path(__file__).resolve().parent
ROOT = HERE.parent
sys.path.insert(0, str(ROOT))

# Small model config to keep the probe fast (still exercises every component).
# K=4 MTP (default), d_model=256 (default), n_layer=4 (default).
os.environ.setdefault("HYDRA_D_MODEL", "256")
os.environ.setdefault("HYDRA_N_LAYER", "4")
os.environ.setdefault("HYDRA_MTP_K", "4")

import torch  # noqa: E402

from train import PostSemClawModel, PostSemClawConfig  # noqa: E402


def main() -> int:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device != "cuda":
        print("ERROR: CUDA required (model has mamba-ssm + bf16 autocast path).")
        return 2

    cfg = PostSemClawConfig(
        sequence_len=64,
        vocab_size=8192,
        n_layer=int(os.environ["HYDRA_N_LAYER"]),
        d_model=int(os.environ["HYDRA_D_MODEL"]),
        d_state=64,
        headdim=32,
        n_heads=8,
        expand=2,
        engram_n_columns=1024,
        engram_key_dim=64,
        engram_layer_idx=1,
        sdr_n_bits=16384,
        sdr_target_active=327,
        sdr_delta_rank=32,
        sdr_som_warmup=500,
        sdr_som_interval=100,
        htm_n_columns=2048,
        htm_cells_per_column=32,
        mtp_k=int(os.environ["HYDRA_MTP_K"]),
        mtp_weight_decay=0.5,
    )

    print(f"[probe] config: d_model={cfg.d_model} n_layer={cfg.n_layer} "
          f"mtp_k={cfg.mtp_k} vocab={cfg.vocab_size}")

    torch.manual_seed(0)
    model = PostSemClawModel(cfg).to(device)
    model.init_weights()
    model.train()

    # ---- Enumerate params & optimizer group assignment ----
    all_params = list(model.named_parameters())
    print(f"[probe] total named parameters: {len(all_params)}")

    # Build optimizer to check group coverage (no step, no zero_grad).
    opt = model.setup_optimizer()
    grouped_ids: set[int] = set()
    for group in opt.param_groups:
        for p in group["params"]:
            grouped_ids.add(id(p))
    unique_param_ids = {id(p) for _, p in all_params}
    missing_from_opt = unique_param_ids - grouped_ids
    print(f"[probe] params in opt groups: {len(grouped_ids)} / unique: {len(unique_param_ids)}")
    if missing_from_opt:
        print(f"[probe] WARNING: {len(missing_from_opt)} unique params missing from opt groups")

    # Tied weight check.
    tied = model.wte.weight.data_ptr() == model.lm_head.weight.data_ptr()
    print(f"[probe] tied lm_head<->wte (data_ptr match): {tied}")

    # ---- One forward + backward under bf16 autocast ----
    B, T = 1, 64
    idx = torch.randint(0, cfg.vocab_size, (B, T), dtype=torch.long, device=device)
    tgt = torch.roll(idx, -1, dims=1)

    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(idx, targets=tgt)
    print(f"[probe] fwd loss = {float(loss.detach()):.4f}")
    loss.backward()
    torch.cuda.synchronize()

    # ---- Report ----
    blockers: list[str] = []
    zero_grads: list[str] = []
    unexpected_frozen: list[str] = []
    not_in_opt: list[str] = []
    rows: list[tuple[str, tuple, str, bool, bool, float, float]] = []

    for name, p in all_params:
        grad_is_none = p.grad is None
        if p.requires_grad and grad_is_none:
            blockers.append(name)
            rows.append((name, tuple(p.shape), str(p.dtype).replace("torch.", ""),
                         p.requires_grad, True, float("nan"), float("nan")))
            continue
        if not p.requires_grad:
            unexpected_frozen.append(name)
            rows.append((name, tuple(p.shape), str(p.dtype).replace("torch.", ""),
                         False, True, float("nan"), float("nan")))
            continue
        g = p.grad.detach().float()
        abs_mean = float(g.abs().mean().item())
        norm = float(g.norm().item())
        if abs_mean == 0.0 and norm == 0.0:
            zero_grads.append(name)
        if id(p) not in grouped_ids:
            not_in_opt.append(name)
        rows.append((name, tuple(p.shape), str(p.dtype).replace("torch.", ""),
                     p.requires_grad, False, abs_mean, norm))

    # Pretty table
    print("\n[probe] per-parameter grad table:")
    print(f" {'name':<56} {'shape':<22} {'dtype':<8} rg none {'|g|.mean':>10} {'|g|.norm':>10}")
    for name, shape, dtype, rg, none, mean, norm in rows:
        shape_s = "x".join(str(s) for s in shape)
        rg_s = "Y" if rg else "N"
        none_s = "Y" if none else "N"
        if none:
            mean_s, norm_s = " nan ", " nan "
        else:
            mean_s = f"{mean:>10.3e}"
            norm_s = f"{norm:>10.3e}"
        print(f" {name:<56} {shape_s:<22} {dtype:<8} {rg_s} {none_s} {mean_s} {norm_s}")

    # Identity checks
    print("\n[probe] identity checks:")
    print(f" id(wte.weight) = {id(model.wte.weight)}")
    print(f" id(lm_head.weight) = {id(model.lm_head.weight)}")
    print(f" same Python object = {model.wte.weight is model.lm_head.weight}")
    print(f" same storage ptr = {tied}")

    # Engram memory inspection
    print(f"\n[probe] engram.memory is nn.Parameter: "
          f"{isinstance(model.engram.memory, torch.nn.Parameter)}")
    print(f" engram.memory.requires_grad = {model.engram.memory.requires_grad}")
    if model.engram.memory.grad is None:
        print(" engram.memory.grad = None (Hebbian-only path; no autograd through detach())")
    else:
        g = model.engram.memory.grad.detach().float()
        print(f" engram.memory.grad |.mean| = {float(g.abs().mean()):.3e}")

    # Stash flag sanity: _last_sdr should be uint8, no graph
    last = getattr(model, "_last_sdr", None)
    if last is not None:
        print(f"\n[probe] model._last_sdr dtype={last.dtype}, requires_grad={last.requires_grad}")
    else:
        print("\n[probe] model._last_sdr is None (fwd didn't stash — ok if path changed)")

    # Summary
    print("\n[probe] ============ SUMMARY ============")
    print(f" BLOCKERS (requires_grad but grad is None): {len(blockers)}")
    for n in blockers:
        print(f"   - {n}")
    print(f" WARNINGS (grad is literally zero): {len(zero_grads)}")
    for n in zero_grads:
        print(f"   - {n}")
    print(f" WARNINGS (requires_grad=False): {len(unexpected_frozen)}")
    for n in unexpected_frozen:
        print(f"   - {n}")
    print(f" WARNINGS (missing from every opt group): {len(not_in_opt)}")
    for n in not_in_opt:
        print(f"   - {n}")

    return 0


if __name__ == "__main__":
    sys.exit(main())
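Aside: a minimal standalone illustration (assumes only torch) of the two tied-weight checks the probe prints, `is` for Python-object identity and Tensor.data_ptr() for shared underlying storage:

import torch

emb = torch.nn.Embedding(8, 4)
head = torch.nn.Linear(4, 8, bias=False)
head.weight = emb.weight  # tie, the same idiom used for wte <-> lm_head

assert head.weight is emb.weight                        # same Python object
assert head.weight.data_ptr() == emb.weight.data_ptr()  # same storage start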
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -211,9 +211,12 @@ def main() -> int:
     if not USE_SPACE_IMAGE:
         print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
 
+    fast_start_streaming = should_enable_fast_start_streaming(TARGET_SHARDS, TIME_BUDGET)
     if DRY_RUN:
-        if 'HYDRA_USE_NEMOTRON' not in os.environ and should_enable_fast_start_streaming(TARGET_SHARDS, TIME_BUDGET):
+        if 'HYDRA_USE_NEMOTRON' not in os.environ and fast_start_streaming:
             print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
+        if 'HYDRA_LOCAL_SHARDS_ONLY' not in os.environ and fast_start_streaming:
+            print('[launch] auto-enabled HYDRA_LOCAL_SHARDS_ONLY=0 for Nemotron streaming fast-start profile', flush=True)
         print('[launch] dry-run mode; skipping repo creation, upload, and job submission', flush=True)
         return 0
 
@@ -277,9 +280,12 @@ def main() -> int:
         'TRITON_CACHE_DIR': f'/workspace/triton_cache/{GPU_PROFILE}',
         'TRITON_CACHE_REPO': f'{routing.owner}/feather-triton-cache-{GPU_PROFILE}',
     }
-    if 'HYDRA_USE_NEMOTRON' not in os.environ and should_enable_fast_start_streaming(TARGET_SHARDS, TIME_BUDGET):
+    if 'HYDRA_USE_NEMOTRON' not in os.environ and fast_start_streaming:
        env['HYDRA_USE_NEMOTRON'] = '1'
        print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
+    if 'HYDRA_LOCAL_SHARDS_ONLY' not in os.environ and fast_start_streaming:
+        env['HYDRA_LOCAL_SHARDS_ONLY'] = '0'
+        print('[launch] auto-enabled HYDRA_LOCAL_SHARDS_ONLY=0 for Nemotron streaming fast-start profile', flush=True)
     # A10 compatibility profile: avoid known PTX/compile runtime pitfalls and
     # keep throughput path enabled. Caller can explicitly override each key by
     # setting it in the parent environment.
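Aside: the pattern in this diff is override-friendly env gating, auto-set a default only when the caller has not pinned the key in the parent environment. A minimal sketch of that pattern; `fast_start_streaming` stands in for should_enable_fast_start_streaming(...), whose definition lives outside this hunk:

import os

def apply_auto_env(env: dict[str, str], fast_start_streaming: bool) -> None:
    """Auto-enable fast-start defaults unless the caller pinned them."""
    if not fast_start_streaming:
        return
    auto = {"HYDRA_USE_NEMOTRON": "1", "HYDRA_LOCAL_SHARDS_ONLY": "0"}
    for key, value in auto.items():
        if key not in os.environ:  # explicit caller override wins
            env[key] = value
            print(f"[launch] auto-enabled {key}={value} for fast-start profile", flush=True)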
overlay/scripts/profile_forward.py CHANGED
@@ -1,87 +1,87 @@
"""Per-subsystem timing to find the tok/s bottleneck.

Runs a single forward+backward at (B=8, T=2048) and times each stage via
torch.cuda.Event. Reports ms/stage and derived tok/s budget.
"""
import os, sys, time
os.environ.setdefault("LD_LIBRARY_PATH", "/usr/lib/wsl/lib:/usr/local/cuda/lib64")
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import torch
from train import PostSemClawModel, PostSemClawConfig, MAX_SEQ_LEN

B, T = 8, MAX_SEQ_LEN

def timeit(name, fn, warmup=1, n=3):
    for _ in range(warmup):
        fn(); torch.cuda.synchronize()
    s = torch.cuda.Event(enable_timing=True); e = torch.cuda.Event(enable_timing=True)
    times = []
    for _ in range(n):
        torch.cuda.synchronize()
        s.record(); fn(); e.record(); torch.cuda.synchronize()
        times.append(s.elapsed_time(e))
    avg = sum(times)/len(times)
    print(f" {name:30s} {avg:8.2f} ms (min {min(times):.2f} max {max(times):.2f})")
    return avg

cfg = PostSemClawConfig()
model = PostSemClawModel(cfg).cuda()
model.init_weights()
model.train()
idx = torch.randint(0, cfg.vocab_size, (B, T), device="cuda", dtype=torch.long)
y = idx.clone()

print(f"== Profile at B={B} T={T} n_params={sum(p.numel() for p in model.parameters())/1e6:.1f}M ==\n")

# Warmup full forward
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
    _ = model(idx, y)
torch.cuda.synchronize()

print("Stage times (3 iter avg):\n")

# 1) wte
timeit("wte embedding", lambda: model.wte(idx).sum().item())

# 2) sdr_semantic (STE forward)
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
    timeit("sdr_semantic forward STE", lambda: model.sdr_semantic(idx).sum().item())

# 3) sdr binary_only
timeit("sdr binary_only", lambda: model.sdr_semantic.binary_only(idx).sum().item())

# 4) HTM full forward (with reset/learn)
with torch.no_grad():
    timeit("HTM forward (B=8, T=2048)", lambda: model.htm(model.sdr_semantic.binary_only(idx)).sum().item())

# 5) Mamba block stack only
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
    def _blocks():
        x = model.wte(idx)
        from train import norm
        x = norm(x)
        streams = model.mhc[0].init_streams(x)
        for i, (block, mhc_layer) in enumerate(zip(model.blocks, model.mhc)):
            def _bfn(h, _b=block): return _b(norm(h))
            streams = mhc_layer(streams, _bfn)
        x = model.mhc[-1].merge_streams(streams)
        return x.sum().item()
    timeit("Mamba+mHC blocks (n_layer=4)", _blocks)

# 6) Full forward+loss
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
    timeit("FULL forward+loss", lambda: model(idx, y).item())

# 7) Full forward+loss+backward
def full_fwd_bwd():
    model.zero_grad(set_to_none=True)
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        loss = model(idx, y)
    loss.backward()
    return loss.item()
t_full = timeit("FULL forward+backward", full_fwd_bwd)

print()
print(f"FULL step (fwd+bwd): {t_full:.0f} ms for B*T = {B*T} tokens")
print(f"tok/s per forward: {B*T / (t_full/1000):.0f}")
print(f"Expected @MFU=20% on RTX3060 (~25 TFLOPS bf16): ~{25e12*0.2 / (6*7.5e6) / 1000:.0f}k tok/s")
overlay/scripts/run_domain_expanded_pretrain.sh CHANGED
@@ -188,11 +188,7 @@ fi
 
 RESUME_PATH="$(resolve_resume_path || true)"
 
-# Only inject WSL library paths when running on WSL. Cloud containers
-# (H200/A10G HF Jobs) already have their driver paths set by entrypoint.py.
-if [[ -d /usr/lib/wsl/lib ]]; then
-  export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
-fi
+export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
 export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
 export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
overlay/scripts/sample_utils.py CHANGED
@@ -1,107 +1,107 @@
"""Shared sampling utilities for chat.py / chat_eval.py.

Pure functions: given a 1-D logits tensor (vocab_size,), return a single
sampled token id. No model/tokenizer knowledge here.
"""

from __future__ import annotations

from typing import Iterable, Optional

import torch


def apply_repetition_penalty(
    logits: torch.Tensor,
    recent_tokens: Optional[Iterable[int]],
    penalty: float,
) -> torch.Tensor:
    """Divide logits of recent positive tokens by `penalty`, multiply negatives.

    Operates in-place on a *copy* (logits is cloned first by caller if needed).
    `recent_tokens` may be any iterable of ints; duplicates are deduped internally.
    """
    if penalty == 1.0 or not recent_tokens:
        return logits
    seen = set(int(t) for t in recent_tokens)
    if not seen:
        return logits
    idx = torch.tensor(list(seen), device=logits.device, dtype=torch.long)
    vals = logits.index_select(0, idx)
    vals = torch.where(vals > 0, vals / penalty, vals * penalty)
    logits.index_copy_(0, idx, vals)
    return logits


def apply_top_k(logits: torch.Tensor, top_k: int) -> torch.Tensor:
    """Keep only the top-k logits; set the rest to -inf.

    top_k<=0 or top_k>=vocab disables the filter."""
    if top_k <= 0 or top_k >= logits.size(-1):
        return logits
    topk_vals, topk_idx = logits.topk(top_k)
    mask = torch.full_like(logits, float("-inf"))
    mask.scatter_(0, topk_idx, topk_vals)
    return mask


def apply_top_p(logits: torch.Tensor, top_p: float) -> torch.Tensor:
    """Nucleus filter: keep smallest set of tokens whose cumulative prob >= top_p."""
    if top_p >= 1.0 or top_p <= 0.0:
        return logits
    sorted_logits, sorted_idx = logits.sort(descending=True)
    cumulative_probs = sorted_logits.softmax(-1).cumsum(-1)
    mask = cumulative_probs > top_p
    # shift right so we always keep at least one token
    mask[1:] = mask[:-1].clone()
    mask[0] = False
    sorted_logits = sorted_logits.masked_fill(mask, float("-inf"))
    out = torch.full_like(logits, float("-inf"))
    out.scatter_(0, sorted_idx, sorted_logits)
    return out


def sample_token(
    logits: torch.Tensor,
    temperature: float = 1.0,
    top_k: int = 0,
    top_p: float = 1.0,
    repetition_penalty: float = 1.0,
    recent_tokens: Optional[Iterable[int]] = None,
) -> int:
    """Return a single sampled token id (Python int).

    logits: 1-D float tensor of shape (vocab_size,). fp32 or upcast-safe.
    """
    if logits.dim() != 1:
        raise ValueError(f"sample_token expects 1-D logits, got shape {tuple(logits.shape)}")

    # Work in fp32 on a clone so the caller's tensor is unchanged.
    work = logits.detach().to(torch.float32).clone()

    if repetition_penalty != 1.0 and recent_tokens is not None:
        work = apply_repetition_penalty(work, recent_tokens, repetition_penalty)

    # Temperature. Greedy when temperature <= 0.
    if temperature <= 0.0:
        return int(work.argmax().item())
    work = work / max(temperature, 1e-6)

    work = apply_top_k(work, top_k)
    work = apply_top_p(work, top_p)

    # Guard against all-(-inf) (can happen if top_k/top_p filter everything out).
    if torch.isinf(work).all():
        return int(logits.argmax().item())

    probs = torch.softmax(work, dim=-1)
    # Numerical safety — replace any NaN with 0 and renormalize.
    if torch.isnan(probs).any():
        probs = torch.nan_to_num(probs, nan=0.0)
        s = probs.sum()
        if s <= 0:
            return int(logits.argmax().item())
        probs = probs / s

    tok = torch.multinomial(probs, num_samples=1)
    return int(tok.item())
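Aside: a small usage sketch for apply_top_p above. The shift-right on the cutoff mask is what guarantees at least one token survives, even when top_p is smaller than the probability of the most likely token:

import torch

logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
filtered = apply_top_p(logits, top_p=0.05)
kept = torch.isfinite(filtered).sum().item()
assert kept == 1                    # only the single most likely token remains
assert torch.isfinite(filtered[0])  # and it is the argmax (index 0)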
overlay/scripts/sft.py CHANGED
@@ -1,559 +1,559 @@
1
- """HYDRA SFT β€” instruction fine-tune the pretrained 7.5M-param base.
2
-
3
- Mode selection:
4
- MODE=resume_from_pretrain iff ~/.cache/autoresearch/pretrain_final.pt
5
- exists AND loads cleanly into a fresh model.
6
- MODE=from_scratch otherwise (degraded fallback).
7
-
8
- Data: int16 shards written by `scripts/download_sft_data.py`, paired with
9
- uint8 loss-mask shards (1 on assistant tokens, 0 on user-prompt tokens).
10
- At runtime we pack consecutive examples into fixed-length rows; prompt
11
- positions get target=-1 so CE's `ignore_index=-1` drops them.
12
-
13
- Env vars (with defaults tuned for RTX 3060 6GB, 7.5M params):
14
- HYDRA_SFT_TIME_BUDGET 10800 SFT wall-clock budget (3h)
15
- HYDRA_SFT_SEQ_LEN 512 sequence length during SFT
16
- HYDRA_BATCH_SIZE 4 per-step device batch
17
- HYDRA_TOTAL_BATCH 8192 effective batch (grad-accum derived)
18
- HYDRA_SFT_LR_MULT 0.10 multiply pretrain LRs by this
19
- HYDRA_SFT_EVAL_INTERVAL 500 steps between sample generations
20
- HYDRA_SFT_CKPT_INTERVAL 2000 steps between interim checkpoints
21
-
22
- CLI:
23
- --dry-run load model+data, run 1 step, exit (validation path)
24
- --eval-only load `sft_final.pt`, run sample gen, exit
25
- """
26
-
27
- from __future__ import annotations
28
-
29
- import argparse
30
- import json
31
- import math
32
- import os
33
- import sys
34
- import time
35
- from dataclasses import asdict
36
- from pathlib import Path
37
-
38
- import numpy as np
39
- import torch
40
-
41
- # Repo root on path
42
- _REPO_ROOT = Path(__file__).resolve().parent.parent
43
- if str(_REPO_ROOT) not in sys.path:
44
- sys.path.insert(0, str(_REPO_ROOT))
45
-
46
- # Must import hydra.config BEFORE touching torch.cuda for CUDA env setup
47
- from hydra.config import (
48
- ADAM_BETAS, D_MODEL, D_STATE, DEVICE_BATCH_SIZE, EMBEDDING_LR,
49
- ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND,
50
- FINAL_LR_FRAC, GPU_BF16_PEAK_FLOPS, HEADDIM, MATRIX_LR, N_HEADS,
51
- N_LAYER, PostSemClawConfig, SCALAR_LR, SEED, TOTAL_BATCH_SIZE,
52
- UNEMBEDDING_LR, WARMUP_RATIO, WEIGHT_DECAY,
53
- )
54
- from hydra.model import PostSemClawModel
55
- from prepare import Tokenizer
56
-
57
- # Use line-buffered stdout
58
- try:
59
- sys.stdout.reconfigure(line_buffering=True)
60
- except Exception:
61
- pass
62
-
63
-
64
- # ---------------------------------------------------------------------------
65
- # Paths
66
- # ---------------------------------------------------------------------------
67
-
68
- CACHE_DIR = Path.home() / ".cache" / "autoresearch"
69
- PRETRAIN_CKPT = CACHE_DIR / "pretrain_final.pt"
70
- SFT_FINAL_CKPT = CACHE_DIR / "sft_final.pt"
71
- SFT_INTERIM_CKPT = CACHE_DIR / "sft_interim.pt"
72
- SFT_DATA_DIR = _REPO_ROOT / "data" / "sft"
73
-
74
-
75
- # ---------------------------------------------------------------------------
76
- # Env vars for SFT
77
- # ---------------------------------------------------------------------------
78
-
79
- SFT_TIME_BUDGET = int(os.environ.get("HYDRA_SFT_TIME_BUDGET", "10800"))
80
- SFT_SEQ_LEN = int(os.environ.get("HYDRA_SFT_SEQ_LEN", "512"))
81
- SFT_LR_MULT = float(os.environ.get("HYDRA_SFT_LR_MULT", "0.10"))
82
- SFT_EVAL_INTERVAL = int(os.environ.get("HYDRA_SFT_EVAL_INTERVAL", "500"))
83
- SFT_CKPT_INTERVAL = int(os.environ.get("HYDRA_SFT_CKPT_INTERVAL", "2000"))
84
-
85
-
86
- # ---------------------------------------------------------------------------
87
- # Data loading
88
- # ---------------------------------------------------------------------------
89
-
90
- def _load_meta() -> dict:
91
- meta_path = SFT_DATA_DIR / "meta.json"
92
- if not meta_path.exists():
93
- raise FileNotFoundError(
94
- f"SFT meta not found at {meta_path}. Run "
95
- f"`python scripts/download_sft_data.py` first."
96
- )
97
- with open(meta_path) as f:
98
- return json.load(f)
99
-
100
-
101
- def _load_shards():
102
- """Load all shard_XXX.bin + mask_XXX.bin as big flat arrays.
103
-
104
- Returns: (tokens: np.int64, mask: np.uint8)
105
- Both arrays are 1-D and the same length. Total len ~= target_tokens.
106
- """
107
- tok_shards = sorted(SFT_DATA_DIR.glob("shard_*.bin"))
108
- mask_shards = sorted(SFT_DATA_DIR.glob("mask_*.bin"))
109
- if not tok_shards:
110
- raise FileNotFoundError(f"No SFT shards in {SFT_DATA_DIR}")
111
- assert len(tok_shards) == len(mask_shards), (
112
- f"shard/mask count mismatch: {len(tok_shards)} vs {len(mask_shards)}"
113
- )
114
- tok_parts = []
115
- mask_parts = []
116
- for t, m in zip(tok_shards, mask_shards):
117
- tok_parts.append(np.fromfile(str(t), dtype=np.int16).astype(np.int64))
118
- mask_parts.append(np.fromfile(str(m), dtype=np.uint8))
119
- tokens = np.concatenate(tok_parts)
120
- mask = np.concatenate(mask_parts)
121
- assert tokens.shape == mask.shape
122
- # Guard against negative int16 values (unlikely with vocab=8192 but defensive)
123
- if tokens.min() < 0 or tokens.max() >= 8192:
124
- raise ValueError(
125
- f"Token IDs out of range: min={tokens.min()} max={tokens.max()}"
126
- )
127
- return tokens, mask
128
-
129
-
130
- def make_sft_dataloader(tokens: np.ndarray, mask: np.ndarray, B: int, T: int,
131
- device: torch.device, seed: int = 0):
132
- """Yield (x, y, epoch) forever.
133
-
134
- Each row is a slice of length T+1 sampled at a random start. We produce:
135
- x = slice[:-1] (B, T) int64 on device
136
- y = slice[1:] with mask=0 -> -1 (B, T) int64 on device
137
-
138
- The mask applies to target positions (y), not inputs. This way the
139
- chunked CE loss in model.forward sees ignore_index=-1 for prompt tokens.
140
- """
141
- N = tokens.shape[0]
142
- rng = np.random.default_rng(seed)
143
- # Pin CPU tensors; copy to GPU non-blocking.
144
- cpu_x = torch.empty(B, T, dtype=torch.long, pin_memory=True)
145
- cpu_y = torch.empty(B, T, dtype=torch.long, pin_memory=True)
146
- epoch = 1
147
- samples_drawn = 0
148
- samples_per_epoch = max(1, N // (T + 1))
149
-
150
- # Minimum loss-positions per window. If a sampled window has fewer than
151
- # this many assistant tokens, resample. Guards against all-prompt windows
152
- # producing NaN from 0/0 in the chunked CE loss.
153
- min_loss_positions = max(1, T // 32)
154
- max_resample = 8
155
-
156
- while True:
157
- for b in range(B):
158
- # Sample a starting index with a light rejection filter to ensure
159
- # the window contains enough assistant (mask=1) positions.
160
- if N <= T + 1:
161
- start = 0
162
- else:
163
- start = int(rng.integers(0, N - T - 1))
164
- for _ in range(max_resample):
165
- loss_in_window = int(mask[start + 1:start + T + 1].sum())
166
- if loss_in_window >= min_loss_positions:
167
- break
168
- start = int(rng.integers(0, N - T - 1))
169
- window_tok = tokens[start:start + T + 1]
170
- window_mask = mask[start:start + T + 1]
171
- # x = window[:-1], y = window[1:]
172
- cpu_x[b].copy_(torch.from_numpy(window_tok[:-1].astype(np.int64)))
173
- y_slice = window_tok[1:].astype(np.int64).copy()
174
- # Apply mask to targets: mask=0 (prompt) -> target=-1 (ignore)
175
- y_slice[window_mask[1:] == 0] = -1
176
- # Final guard: if no loss positions survived, force at least 1
177
- # valid target so the batch doesn't produce NaN (the rejection
178
- # filter makes this rare, but the guard is cheap).
179
- if (y_slice != -1).sum() == 0:
180
- y_slice[-1] = int(window_tok[-1])
181
- cpu_y[b].copy_(torch.from_numpy(y_slice))
182
- x = cpu_x.to(device, non_blocking=True)
183
- y = cpu_y.to(device, non_blocking=True)
184
- samples_drawn += B
185
- if samples_drawn >= samples_per_epoch:
186
- epoch += 1
187
- samples_drawn = 0
188
- yield x, y, epoch
189
-
190
-
191
- # ---------------------------------------------------------------------------
192
- # Model init + checkpoint load
193
- # ---------------------------------------------------------------------------
194
-
195
- def _peek_pretrain_config(vocab_size: int) -> PostSemClawConfig | None:
196
- """If pretrain checkpoint exists, return its saved config so we build
197
- the SFT model with matching architecture. Returns None if unavailable."""
198
- if not PRETRAIN_CKPT.exists():
199
- return None
200
- try:
201
- ckpt = torch.load(str(PRETRAIN_CKPT), map_location="cpu",
202
- weights_only=False)
203
- cfg_dict = ckpt.get("config")
204
- if cfg_dict is None:
205
- return None
206
- # Override sequence_len to SFT's (shorter context) -- architecture
207
- # is independent of sequence_len since Mamba3 is recurrent.
208
- cfg_dict = dict(cfg_dict)
209
- cfg_dict["sequence_len"] = SFT_SEQ_LEN
210
- cfg_dict["vocab_size"] = vocab_size
211
- cfg = PostSemClawConfig(**cfg_dict)
212
- return cfg
213
- except Exception as e:
214
- print(f"[model] could not peek pretrain config: {type(e).__name__}: {e}",
215
- flush=True)
216
- return None
217
-
218
-
219
- def build_model(vocab_size: int, device: torch.device) -> PostSemClawModel:
220
- # Prefer checkpoint-derived config if available (guards against env-var drift)
221
- config = _peek_pretrain_config(vocab_size)
222
- if config is None:
223
- config = PostSemClawConfig(
224
- sequence_len=SFT_SEQ_LEN,
225
- vocab_size=vocab_size,
226
- n_layer=N_LAYER,
227
- d_model=D_MODEL,
228
- d_state=D_STATE,
229
- headdim=HEADDIM,
230
- n_heads=N_HEADS,
231
- expand=EXPAND,
232
- engram_n_columns=ENGRAM_N_COLUMNS,
233
- engram_key_dim=ENGRAM_KEY_DIM,
234
- engram_layer_idx=ENGRAM_LAYER_IDX,
235
- )
236
- print(f"[model] config (from env, no ckpt): {asdict(config)}", flush=True)
237
- else:
238
- print(f"[model] config (from pretrain ckpt): {asdict(config)}", flush=True)
239
- with torch.device("meta"):
240
- model = PostSemClawModel(config)
241
- model.to_empty(device=device)
242
- model.init_weights()
243
- return model
244
-
245
-
246
- def try_load_pretrain(model: PostSemClawModel) -> tuple[bool, str]:
247
- """Attempt to load pretrain checkpoint into model. Returns (loaded, msg)."""
248
- if not PRETRAIN_CKPT.exists():
249
- return False, f"no checkpoint at {PRETRAIN_CKPT}"
250
- try:
251
- ckpt = torch.load(str(PRETRAIN_CKPT), map_location="cuda",
252
- weights_only=False)
253
- state = ckpt.get("model_state_dict", ckpt)
254
- # Use strict=False in case SDR/HTM params are excluded from state_dict
255
- # by torch.compile wrappers or similar.
256
- missing, unexpected = model.load_state_dict(state, strict=False)
257
- msg = (f"loaded {PRETRAIN_CKPT} β€” missing={len(missing)} "
258
- f"unexpected={len(unexpected)}")
259
- if missing:
260
- # Log first few missing keys to help diagnose architecture skew
261
- msg += f" first_missing={missing[:3]}"
262
- return True, msg
263
- except Exception as e:
264
- return False, f"load failed: {type(e).__name__}: {e}"
265
-
266
-
267
- # ---------------------------------------------------------------------------
268
- # Sample generation (for in-training eval prints)
269
- # ---------------------------------------------------------------------------
270
-
271
- _SAMPLE_PROMPTS = [
272
- "What is the capital of France?",
273
- "Write a haiku about winter.",
274
- "List three colors.",
275
- "How are you?",
276
- "Explain why the sky is blue in one sentence.",
277
- ]
278
-
279
-
280
- @torch.no_grad()
281
- def sample_once(model, tokenizer, meta: dict, prompt: str,
282
- max_new: int = 64, temperature: float = 0.8,
283
- top_k: int = 40) -> str:
284
- """Generate a chat-formatted reply. Stops on <|end|> or max_new tokens."""
285
- bos = meta["special_tokens"]["bos"]
286
- user = meta["special_tokens"]["user"]
287
- assistant = meta["special_tokens"]["assistant"]
288
- end = meta["special_tokens"]["end"]
289
-
290
- prompt_ids = [bos, user] + tokenizer.encode("\n" + prompt.strip())
291
- prompt_ids += tokenizer.encode("\n")
292
- prompt_ids.append(assistant)
293
- prompt_ids += tokenizer.encode("\n")
294
-
295
- ctx = torch.tensor([prompt_ids], device="cuda", dtype=torch.long)
296
- generated: list[int] = []
297
- for _ in range(max_new):
298
- with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
299
- logits = model(ctx, targets=None)
300
- last = logits[0, -1].float()
301
- if top_k and top_k < last.shape[-1]:
302
- kth = torch.topk(last, top_k).values[-1]
303
- last = torch.where(last < kth, torch.full_like(last, -1e9), last)
304
- probs = torch.softmax(last / max(temperature, 1e-6), dim=-1)
305
- next_id = int(torch.multinomial(probs, num_samples=1).item())
306
- generated.append(next_id)
307
- if next_id == end:
308
- break
309
- ctx = torch.cat(
310
- [ctx, torch.tensor([[next_id]], device="cuda", dtype=torch.long)],
311
- dim=1,
312
- )
313
- # Hard cap on ctx length (model was trained at 2048, SFT at 512,
314
- # but inference could theoretically go longer)
315
- if ctx.size(1) >= 2048:
316
- break
317
- try:
318
- text = tokenizer.decode(generated)
319
- except Exception:
320
- text = "<decode error>"
321
- return text
322
-
323
-
324
- def run_samples(model, tokenizer, meta: dict, step: int):
325
- model.eval()
326
- print(f"\n=== SFT samples @ step {step} ===", flush=True)
327
- for p in _SAMPLE_PROMPTS:
328
- try:
329
- resp = sample_once(model, tokenizer, meta, p)
330
- except Exception as e:
331
- resp = f"<sample failed: {type(e).__name__}: {e}>"
332
- # Sanitize newlines for log readability
333
- resp_clean = resp.replace("\n", " ⏎ ").replace("\r", " ")
334
- print(f" prompt: {p!r}")
335
- print(f" reply: {resp_clean!r}")
336
- print("=== end samples ===\n", flush=True)
337
- model.train()
338
-
339
-
340
- # ---------------------------------------------------------------------------
341
- # Checkpoint save
342
- # ---------------------------------------------------------------------------
343
-
344
- def save_ckpt(model, step: int, smoothed_loss: float, path: Path,
345
- mode: str, meta: dict):
346
- try:
347
- CACHE_DIR.mkdir(parents=True, exist_ok=True)
348
- payload = {
349
- "model_state_dict": model.state_dict(),
350
- "step": step,
351
- "smoothed_loss": smoothed_loss,
352
- "mode": mode,
353
- "sft_meta": meta,
354
- }
355
- torch.save(payload, str(path))
356
- print(f"[ckpt] saved {path} (step={step})", flush=True)
357
- except Exception as e:
358
- print(f"[ckpt] SAVE FAILED {path}: {type(e).__name__}: {e}", flush=True)
359
-
360
-
361
- # ---------------------------------------------------------------------------
362
- # Main
363
- # ---------------------------------------------------------------------------
364
-
365
- def main():
366
- ap = argparse.ArgumentParser()
367
- ap.add_argument("--dry-run", action="store_true",
368
- help="Load model+data, run 1 step, exit.")
369
- ap.add_argument("--eval-only", action="store_true",
370
- help="Load sft_final.pt and run sample gen.")
371
- args = ap.parse_args()
372
-
373
- t_start = time.time()
374
- torch.manual_seed(SEED + 1) # +1 so SFT draws different RNG than pretrain
375
- torch.cuda.manual_seed(SEED + 1)
376
- torch.set_float32_matmul_precision("high")
377
- device = torch.device("cuda")
378
- autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
379
-
380
- # --- Tokenizer ---
381
- tokenizer = Tokenizer.from_directory()
382
- vocab_size = tokenizer.get_vocab_size()
383
- print(f"[init] vocab: {vocab_size}", flush=True)
384
-
385
- # --- Data meta ---
386
- meta = _load_meta()
387
- print(f"[data] meta: {meta}", flush=True)
388
-
389
- # --- Model ---
390
- model = build_model(vocab_size, device)
391
- n_params = sum(p.numel() for p in model.parameters())
392
- print(f"[model] params: {n_params:,}", flush=True)
393
-
394
- loaded, msg = try_load_pretrain(model)
395
- mode = "resume_from_pretrain" if loaded else "from_scratch"
396
- print(f"[init] MODE={mode} :: {msg}", flush=True)
397
-
398
- # --- Eval-only path ---
399
- if args.eval_only:
400
- if SFT_FINAL_CKPT.exists():
401
- ckpt = torch.load(str(SFT_FINAL_CKPT), map_location=device,
402
- weights_only=False)
403
- state = ckpt.get("model_state_dict", ckpt)
404
- model.load_state_dict(state, strict=False)
405
- print(f"[eval-only] loaded {SFT_FINAL_CKPT}", flush=True)
406
- else:
407
- print(f"[eval-only] no SFT checkpoint β€” running on current weights",
408
- flush=True)
409
- run_samples(model, tokenizer, meta, step=-1)
410
- return
411
-
412
- # --- Dataloader ---
413
- print(f"[data] loading shards ...", flush=True)
414
- tokens, mask = _load_shards()
415
- print(f"[data] tokens: {len(tokens):,} loss-positions: {int(mask.sum()):,}",
416
- flush=True)
417
- B = DEVICE_BATCH_SIZE
418
- T = SFT_SEQ_LEN
419
- tokens_per_fwdbwd = B * T
420
- assert TOTAL_BATCH_SIZE % tokens_per_fwdbwd == 0, (
421
- f"TOTAL_BATCH_SIZE={TOTAL_BATCH_SIZE} not divisible by B*T={tokens_per_fwdbwd}"
422
- )
423
- grad_accum = TOTAL_BATCH_SIZE // tokens_per_fwdbwd
424
- print(f"[train] B={B} T={T} accum={grad_accum} effective_batch={TOTAL_BATCH_SIZE}",
425
- flush=True)
426
- loader = make_sft_dataloader(tokens, mask, B, T, device, seed=SEED + 7)
427
- x, y, epoch = next(loader)
428
-
429
- # --- Optimizer (scaled LRs) ---
430
- matrix_lr = MATRIX_LR * SFT_LR_MULT
431
- embed_lr = EMBEDDING_LR * SFT_LR_MULT
432
- unembed_lr = UNEMBEDDING_LR * SFT_LR_MULT
433
- scalar_lr = SCALAR_LR * SFT_LR_MULT
434
- print(f"[opt] LRs scaled by {SFT_LR_MULT}: matrix={matrix_lr:.5f} "
435
- f"embed={embed_lr:.5f} unembed={unembed_lr:.6f}", flush=True)
436
- optimizer = model.setup_optimizer(
437
- unembedding_lr=unembed_lr,
438
- embedding_lr=embed_lr,
439
- scalar_lr=scalar_lr,
440
- adam_betas=ADAM_BETAS,
441
- matrix_lr=matrix_lr,
442
- weight_decay=WEIGHT_DECAY,
443
- )
444
-
445
- # --- Dry-run path (validation) ---
446
- if args.dry_run:
447
- print("[dry-run] running 1 step ...", flush=True)
448
- with autocast_ctx:
449
- loss = model(x, y)
450
- loss_f = float(loss.item())
451
- print(f"[dry-run] step0 loss={loss_f:.4f}", flush=True)
452
- loss.backward()
453
- torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
454
- optimizer.step()
455
- model.zero_grad(set_to_none=True)
456
- if math.isnan(loss_f) or loss_f > 100:
457
- print("[dry-run] FAILED (NaN / huge loss)", flush=True)
458
- sys.exit(1)
459
- print("[dry-run] OK", flush=True)
460
- return
461
-
462
- # --- Training loop ---
463
- print(f"[train] budget={SFT_TIME_BUDGET}s eval_every={SFT_EVAL_INTERVAL} "
464
- f"ckpt_every={SFT_CKPT_INTERVAL}", flush=True)
465
- t_loop_start = time.time()
466
- smooth_loss = 0.0
467
- step = 0
468
- total_train_secs = 0.0
469
-
470
- # Warmup schedule for SFT: linear 0->1 over first 5% of budget, then cosine.
471
- sft_warmup_frac = 0.05
472
-
473
- def lr_mult(progress: float) -> float:
474
- if progress < sft_warmup_frac:
475
- return progress / sft_warmup_frac if sft_warmup_frac > 0 else 1.0
476
- decay = (progress - sft_warmup_frac) / (1.0 - sft_warmup_frac)
477
- return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * \
478
- (1 + math.cos(math.pi * decay))
479
-
480
- while True:
481
- torch.cuda.synchronize()
482
- t0 = time.time()
483
- for _ in range(grad_accum):
484
- with autocast_ctx:
485
- loss = model(x, y)
486
- train_loss_val = loss.detach()
487
- (loss / grad_accum).backward()
488
- x, y, epoch = next(loader)
489
-
490
- progress = min(total_train_secs / SFT_TIME_BUDGET, 1.0)
491
- mult = lr_mult(progress)
492
- for group in optimizer.param_groups:
493
- group["lr"] = group["initial_lr"] * mult
494
-
495
- torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
496
- optimizer.step()
497
- model.zero_grad(set_to_none=True)
498
-
499
- loss_f = float(train_loss_val.item())
500
- if math.isnan(loss_f) or loss_f > 100:
501
- print(f"[FAIL] step={step} loss={loss_f} β€” aborting", flush=True)
502
- save_ckpt(model, step, smooth_loss, SFT_INTERIM_CKPT, mode, meta)
503
- sys.exit(1)
504
-
505
- torch.cuda.synchronize()
506
- dt = time.time() - t0
507
- if step > 3:
508
- total_train_secs += dt
509
-
510
- # EMA loss (debiased)
511
- beta = 0.9
512
- smooth_loss = beta * smooth_loss + (1 - beta) * loss_f
513
- debiased = smooth_loss / (1 - beta ** (step + 1))
514
- bpt = debiased / math.log(2)
515
- tps = int(TOTAL_BATCH_SIZE / dt) if dt > 0 else 0
516
- vram_mib = torch.cuda.memory_allocated() / 1024 / 1024
517
- lr_now = optimizer.param_groups[0]["lr"]
518
- remaining = max(0, SFT_TIME_BUDGET - total_train_secs)
519
-
520
- print(
521
- f"sft_step={step:05d} loss={debiased:.4f} bpt={bpt:.3f} "
522
- f"tps={tps} dt_ms={dt*1000:.0f} lr={lr_now:.2e} "
523
- f"vram={vram_mib:.0f}MiB pct={100*progress:.1f} "
524
- f"epoch={epoch} remaining={remaining:.0f}s",
525
- flush=True,
526
- )
527
-
528
- if step > 0 and step % SFT_EVAL_INTERVAL == 0:
529
- run_samples(model, tokenizer, meta, step)
530
-
531
- if step > 0 and step % SFT_CKPT_INTERVAL == 0:
532
- save_ckpt(model, step, smooth_loss, SFT_INTERIM_CKPT, mode, meta)
533
-
534
- step += 1
535
-
536
- if step > 5 and total_train_secs >= SFT_TIME_BUDGET:
537
- break
538
-
539
- # Final samples + save
540
- run_samples(model, tokenizer, meta, step)
541
- save_ckpt(model, step, smooth_loss, SFT_FINAL_CKPT, mode, meta)
542
-
543
- total_secs = time.time() - t_start
544
- print("---", flush=True)
545
- print(f"SFT_COMPLETE mode={mode} step={step} "
546
- f"smoothed_loss={smooth_loss:.4f} total_seconds={total_secs:.0f} "
547
- f"train_seconds={total_train_secs:.0f}", flush=True)
548
-
549
-
550
- if __name__ == "__main__":
551
- try:
552
- main()
553
- except SystemExit:
554
- raise
555
- except Exception as e:
556
- import traceback
557
- print(f"SFT_FAILED {type(e).__name__}: {e}", flush=True)
558
- traceback.print_exc()
559
- sys.exit(1)
 
1
+ """HYDRA SFT β€” instruction fine-tune the pretrained 7.5M-param base.
2
+
3
+ Mode selection:
4
+ MODE=resume_from_pretrain iff ~/.cache/autoresearch/pretrain_final.pt
5
+ exists AND loads cleanly into a fresh model.
6
+ MODE=from_scratch otherwise (degraded fallback).
7
+
8
+ Data: int16 shards written by `scripts/download_sft_data.py`, paired with
9
+ uint8 loss-mask shards (1 on assistant tokens, 0 on user-prompt tokens).
10
+ At runtime we pack consecutive examples into fixed-length rows; prompt
11
+ positions get target=-1 so CE's `ignore_index=-1` drops them.
12
+
13
+ Env vars (with defaults tuned for RTX 3060 6GB, 7.5M params):
14
+ HYDRA_SFT_TIME_BUDGET 10800 SFT wall-clock budget (3h)
15
+ HYDRA_SFT_SEQ_LEN 512 sequence length during SFT
16
+ HYDRA_BATCH_SIZE 4 per-step device batch
17
+ HYDRA_TOTAL_BATCH 8192 effective batch (grad-accum derived)
18
+ HYDRA_SFT_LR_MULT 0.10 multiply pretrain LRs by this
19
+ HYDRA_SFT_EVAL_INTERVAL 500 steps between sample generations
20
+ HYDRA_SFT_CKPT_INTERVAL 2000 steps between interim checkpoints
21
+
22
+ CLI:
23
+ --dry-run load model+data, run 1 step, exit (validation path)
24
+ --eval-only load `sft_final.pt`, run sample gen, exit
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import math
32
+ import os
33
+ import sys
34
+ import time
35
+ from dataclasses import asdict
36
+ from pathlib import Path
37
+
38
+ import numpy as np
39
+ import torch
40
+
41
+ # Repo root on path
42
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
43
+ if str(_REPO_ROOT) not in sys.path:
44
+ sys.path.insert(0, str(_REPO_ROOT))
45
+
46
+ # Must import hydra.config BEFORE touching torch.cuda for CUDA env setup
47
+ from hydra.config import (
48
+ ADAM_BETAS, D_MODEL, D_STATE, DEVICE_BATCH_SIZE, EMBEDDING_LR,
49
+ ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND,
50
+ FINAL_LR_FRAC, GPU_BF16_PEAK_FLOPS, HEADDIM, MATRIX_LR, N_HEADS,
51
+ N_LAYER, PostSemClawConfig, SCALAR_LR, SEED, TOTAL_BATCH_SIZE,
52
+ UNEMBEDDING_LR, WARMUP_RATIO, WEIGHT_DECAY,
53
+ )
54
+ from hydra.model import PostSemClawModel
55
+ from prepare import Tokenizer
56
+
57
+ # Use line-buffered stdout
58
+ try:
59
+ sys.stdout.reconfigure(line_buffering=True)
60
+ except Exception:
61
+ pass
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Paths
66
+ # ---------------------------------------------------------------------------
67
+
68
+ CACHE_DIR = Path.home() / ".cache" / "autoresearch"
69
+ PRETRAIN_CKPT = CACHE_DIR / "pretrain_final.pt"
70
+ SFT_FINAL_CKPT = CACHE_DIR / "sft_final.pt"
71
+ SFT_INTERIM_CKPT = CACHE_DIR / "sft_interim.pt"
72
+ SFT_DATA_DIR = _REPO_ROOT / "data" / "sft"
73
+
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Env vars for SFT
77
+ # ---------------------------------------------------------------------------
78
+
79
+ SFT_TIME_BUDGET = int(os.environ.get("HYDRA_SFT_TIME_BUDGET", "10800"))
80
+ SFT_SEQ_LEN = int(os.environ.get("HYDRA_SFT_SEQ_LEN", "512"))
81
+ SFT_LR_MULT = float(os.environ.get("HYDRA_SFT_LR_MULT", "0.10"))
82
+ SFT_EVAL_INTERVAL = int(os.environ.get("HYDRA_SFT_EVAL_INTERVAL", "500"))
83
+ SFT_CKPT_INTERVAL = int(os.environ.get("HYDRA_SFT_CKPT_INTERVAL", "2000"))
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Data loading
88
+ # ---------------------------------------------------------------------------
89
+
90
+ def _load_meta() -> dict:
91
+ meta_path = SFT_DATA_DIR / "meta.json"
92
+ if not meta_path.exists():
93
+ raise FileNotFoundError(
94
+ f"SFT meta not found at {meta_path}. Run "
95
+ f"`python scripts/download_sft_data.py` first."
96
+ )
97
+ with open(meta_path) as f:
98
+ return json.load(f)
99
+
100
+
101
+ def _load_shards():
102
+ """Load all shard_XXX.bin + mask_XXX.bin as big flat arrays.
103
+
104
+ Returns: (tokens: np.int64, mask: np.uint8)
105
+ Both arrays are 1-D and the same length. Total len ~= target_tokens.
106
+ """
107
+ tok_shards = sorted(SFT_DATA_DIR.glob("shard_*.bin"))
108
+ mask_shards = sorted(SFT_DATA_DIR.glob("mask_*.bin"))
109
+ if not tok_shards:
110
+ raise FileNotFoundError(f"No SFT shards in {SFT_DATA_DIR}")
111
+ assert len(tok_shards) == len(mask_shards), (
112
+ f"shard/mask count mismatch: {len(tok_shards)} vs {len(mask_shards)}"
113
+ )
114
+ tok_parts = []
115
+ mask_parts = []
116
+ for t, m in zip(tok_shards, mask_shards):
117
+ tok_parts.append(np.fromfile(str(t), dtype=np.int16).astype(np.int64))
118
+ mask_parts.append(np.fromfile(str(m), dtype=np.uint8))
119
+ tokens = np.concatenate(tok_parts)
120
+ mask = np.concatenate(mask_parts)
121
+ assert tokens.shape == mask.shape
122
+ # Guard against negative int16 values (unlikely with vocab=8192 but defensive)
123
+ if tokens.min() < 0 or tokens.max() >= 8192:
124
+ raise ValueError(
125
+ f"Token IDs out of range: min={tokens.min()} max={tokens.max()}"
126
+ )
127
+ return tokens, mask
128
+
129
+
130
+ def make_sft_dataloader(tokens: np.ndarray, mask: np.ndarray, B: int, T: int,
131
+ device: torch.device, seed: int = 0):
132
+ """Yield (x, y, epoch) forever.
133
+
134
+ Each row is a slice of length T+1 sampled at a random start. We produce:
135
+ x = slice[:-1] (B, T) int64 on device
136
+ y = slice[1:] with mask=0 -> -1 (B, T) int64 on device
137
+
138
+ The mask applies to target positions (y), not inputs. This way the
139
+ chunked CE loss in model.forward sees ignore_index=-1 for prompt tokens.
140
+ """
141
+ N = tokens.shape[0]
142
+ rng = np.random.default_rng(seed)
143
+ # Pin CPU tensors; copy to GPU non-blocking.
144
+ cpu_x = torch.empty(B, T, dtype=torch.long, pin_memory=True)
145
+ cpu_y = torch.empty(B, T, dtype=torch.long, pin_memory=True)
146
+ epoch = 1
147
+ samples_drawn = 0
148
+ samples_per_epoch = max(1, N // (T + 1))
149
+
150
+ # Minimum loss-positions per window. If a sampled window has fewer than
151
+ # this many assistant tokens, resample. Guards against all-prompt windows
152
+ # producing NaN from 0/0 in the chunked CE loss.
153
+ min_loss_positions = max(1, T // 32)
154
+ max_resample = 8
155
+
156
+ while True:
157
+ for b in range(B):
158
+ # Sample a starting index with a light rejection filter to ensure
159
+ # the window contains enough assistant (mask=1) positions.
160
+ if N <= T + 1:
161
+ start = 0
162
+ else:
163
+ start = int(rng.integers(0, N - T - 1))
164
+ for _ in range(max_resample):
165
+ loss_in_window = int(mask[start + 1:start + T + 1].sum())
166
+ if loss_in_window >= min_loss_positions:
167
+ break
168
+ start = int(rng.integers(0, N - T - 1))
169
+ window_tok = tokens[start:start + T + 1]
170
+ window_mask = mask[start:start + T + 1]
171
+ # x = window[:-1], y = window[1:]
172
+ cpu_x[b].copy_(torch.from_numpy(window_tok[:-1].astype(np.int64)))
173
+ y_slice = window_tok[1:].astype(np.int64).copy()
174
+ # Apply mask to targets: mask=0 (prompt) -> target=-1 (ignore)
175
+ y_slice[window_mask[1:] == 0] = -1
176
+ # Final guard: if no loss positions survived, force at least 1
177
+ # valid target so the batch doesn't produce NaN (the rejection
178
+ # filter makes this rare, but the guard is cheap).
179
+ if (y_slice != -1).sum() == 0:
180
+ y_slice[-1] = int(window_tok[-1])
181
+ cpu_y[b].copy_(torch.from_numpy(y_slice))
182
+ x = cpu_x.to(device, non_blocking=True)
183
+ y = cpu_y.to(device, non_blocking=True)
184
+ samples_drawn += B
185
+ if samples_drawn >= samples_per_epoch:
186
+ epoch += 1
187
+ samples_drawn = 0
188
+ yield x, y, epoch
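The dataloader's pinned-buffer hand-off is the standard host-to-device overlap pattern. A minimal sketch of just that mechanic, with toy shapes; note that rewriting a pinned buffer on the host is only safe once its in-flight copy has completed, e.g. after a synchronization point:

    import torch

    if torch.cuda.is_available():
        buf = torch.empty(4, 512, dtype=torch.long, pin_memory=True)
        buf.fill_(1)                              # host-side fill
        dev = buf.to("cuda", non_blocking=True)   # async H2D from pinned memory
        torch.cuda.synchronize()                  # safe point before reusing buf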
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Model init + checkpoint load
193
+ # ---------------------------------------------------------------------------
194
+
195
+ def _peek_pretrain_config(vocab_size: int) -> PostSemClawConfig | None:
196
+ """If pretrain checkpoint exists, return its saved config so we build
197
+ the SFT model with matching architecture. Returns None if unavailable."""
198
+ if not PRETRAIN_CKPT.exists():
199
+ return None
200
+ try:
201
+ ckpt = torch.load(str(PRETRAIN_CKPT), map_location="cpu",
202
+ weights_only=False)
203
+ cfg_dict = ckpt.get("config")
204
+ if cfg_dict is None:
205
+ return None
206
+ # Override sequence_len to SFT's (shorter context) -- architecture
207
+ # is independent of sequence_len since Mamba3 is recurrent.
208
+ cfg_dict = dict(cfg_dict)
209
+ cfg_dict["sequence_len"] = SFT_SEQ_LEN
210
+ cfg_dict["vocab_size"] = vocab_size
211
+ cfg = PostSemClawConfig(**cfg_dict)
212
+ return cfg
213
+ except Exception as e:
214
+ print(f"[model] could not peek pretrain config: {type(e).__name__}: {e}",
215
+ flush=True)
216
+ return None
217
+
218
+
219
+ def build_model(vocab_size: int, device: torch.device) -> PostSemClawModel:
220
+ # Prefer checkpoint-derived config if available (guards against env-var drift)
221
+ config = _peek_pretrain_config(vocab_size)
222
+ if config is None:
223
+ config = PostSemClawConfig(
224
+ sequence_len=SFT_SEQ_LEN,
225
+ vocab_size=vocab_size,
226
+ n_layer=N_LAYER,
227
+ d_model=D_MODEL,
228
+ d_state=D_STATE,
229
+ headdim=HEADDIM,
230
+ n_heads=N_HEADS,
231
+ expand=EXPAND,
232
+ engram_n_columns=ENGRAM_N_COLUMNS,
233
+ engram_key_dim=ENGRAM_KEY_DIM,
234
+ engram_layer_idx=ENGRAM_LAYER_IDX,
235
+ )
236
+ print(f"[model] config (from env, no ckpt): {asdict(config)}", flush=True)
237
+ else:
238
+ print(f"[model] config (from pretrain ckpt): {asdict(config)}", flush=True)
239
+ with torch.device("meta"):
240
+ model = PostSemClawModel(config)
241
+ model.to_empty(device=device)
242
+ model.init_weights()
243
+ return model
244
+
245
+
246
+ def try_load_pretrain(model: PostSemClawModel) -> tuple[bool, str]:
247
+ """Attempt to load pretrain checkpoint into model. Returns (loaded, msg)."""
248
+ if not PRETRAIN_CKPT.exists():
249
+ return False, f"no checkpoint at {PRETRAIN_CKPT}"
250
+ try:
251
+ ckpt = torch.load(str(PRETRAIN_CKPT), map_location="cuda",
252
+ weights_only=False)
253
+ state = ckpt.get("model_state_dict", ckpt)
254
+ # Use strict=False in case SDR/HTM params are excluded from state_dict
255
+ # by torch.compile wrappers or similar.
256
+ missing, unexpected = model.load_state_dict(state, strict=False)
257
+ msg = (f"loaded {PRETRAIN_CKPT} β€” missing={len(missing)} "
258
+ f"unexpected={len(unexpected)}")
259
+ if missing:
260
+ # Log first few missing keys to help diagnose architecture skew
261
+ msg += f" first_missing={missing[:3]}"
262
+ return True, msg
263
+ except Exception as e:
264
+ return False, f"load failed: {type(e).__name__}: {e}"
265
+
266
+
267
+ # ---------------------------------------------------------------------------
268
+ # Sample generation (for in-training eval prints)
269
+ # ---------------------------------------------------------------------------
270
+
271
+ _SAMPLE_PROMPTS = [
272
+ "What is the capital of France?",
273
+ "Write a haiku about winter.",
274
+ "List three colors.",
275
+ "How are you?",
276
+ "Explain why the sky is blue in one sentence.",
277
+ ]
278
+
279
+
280
+ @torch.no_grad()
281
+ def sample_once(model, tokenizer, meta: dict, prompt: str,
282
+ max_new: int = 64, temperature: float = 0.8,
283
+ top_k: int = 40) -> str:
284
+ """Generate a chat-formatted reply. Stops on <|end|> or max_new tokens."""
285
+ bos = meta["special_tokens"]["bos"]
286
+ user = meta["special_tokens"]["user"]
287
+ assistant = meta["special_tokens"]["assistant"]
288
+ end = meta["special_tokens"]["end"]
289
+
290
+ prompt_ids = [bos, user] + tokenizer.encode("\n" + prompt.strip())
291
+ prompt_ids += tokenizer.encode("\n")
292
+ prompt_ids.append(assistant)
293
+ prompt_ids += tokenizer.encode("\n")
294
+
295
+ ctx = torch.tensor([prompt_ids], device="cuda", dtype=torch.long)
296
+ generated: list[int] = []
297
+ for _ in range(max_new):
298
+ with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
299
+ logits = model(ctx, targets=None)
300
+ last = logits[0, -1].float()
301
+ if top_k and top_k < last.shape[-1]:
302
+ kth = torch.topk(last, top_k).values[-1]
303
+ last = torch.where(last < kth, torch.full_like(last, -1e9), last)
304
+ probs = torch.softmax(last / max(temperature, 1e-6), dim=-1)
305
+ next_id = int(torch.multinomial(probs, num_samples=1).item())
306
+ generated.append(next_id)
307
+ if next_id == end:
308
+ break
309
+ ctx = torch.cat(
310
+ [ctx, torch.tensor([[next_id]], device="cuda", dtype=torch.long)],
311
+ dim=1,
312
+ )
313
+ # Hard cap on ctx length (model was trained at 2048, SFT at 512,
314
+ # but inference could theoretically go longer)
315
+ if ctx.size(1) >= 2048:
316
+ break
317
+ try:
318
+ text = tokenizer.decode(generated)
319
+ except Exception:
320
+ text = "<decode error>"
321
+ return text
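The top-k filter in sample_once works by thresholding at the k-th largest logit: everything below it is pushed to -1e9 so softmax assigns it effectively zero mass. On a toy logit vector:

    import torch

    last = torch.tensor([2.0, 0.5, 1.5, -1.0, 3.0])
    top_k = 2
    kth = torch.topk(last, top_k).values[-1]    # 2.0, the 2nd-largest logit
    last = torch.where(last < kth, torch.full_like(last, -1e9), last)
    probs = torch.softmax(last / 0.8, dim=-1)   # temperature 0.8
    # only indices 0 and 4 keep non-negligible probability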
322
+
323
+
324
+ def run_samples(model, tokenizer, meta: dict, step: int):
325
+ model.eval()
326
+ print(f"\n=== SFT samples @ step {step} ===", flush=True)
327
+ for p in _SAMPLE_PROMPTS:
328
+ try:
329
+ resp = sample_once(model, tokenizer, meta, p)
330
+ except Exception as e:
331
+ resp = f"<sample failed: {type(e).__name__}: {e}>"
332
+ # Sanitize newlines for log readability
333
+ resp_clean = resp.replace("\n", " ⏎ ").replace("\r", " ")
334
+ print(f" prompt: {p!r}")
335
+ print(f" reply: {resp_clean!r}")
336
+ print("=== end samples ===\n", flush=True)
337
+ model.train()
338
+
339
+
340
+ # ---------------------------------------------------------------------------
341
+ # Checkpoint save
342
+ # ---------------------------------------------------------------------------
343
+
344
+ def save_ckpt(model, step: int, smoothed_loss: float, path: Path,
345
+ mode: str, meta: dict):
346
+ try:
347
+ CACHE_DIR.mkdir(parents=True, exist_ok=True)
348
+ payload = {
349
+ "model_state_dict": model.state_dict(),
350
+ "step": step,
351
+ "smoothed_loss": smoothed_loss,
352
+ "mode": mode,
353
+ "sft_meta": meta,
354
+ }
355
+ torch.save(payload, str(path))
356
+ print(f"[ckpt] saved {path} (step={step})", flush=True)
357
+ except Exception as e:
358
+ print(f"[ckpt] SAVE FAILED {path}: {type(e).__name__}: {e}", flush=True)
359
+
360
+
361
+ # ---------------------------------------------------------------------------
362
+ # Main
363
+ # ---------------------------------------------------------------------------
364
+
365
+ def main():
366
+ ap = argparse.ArgumentParser()
367
+ ap.add_argument("--dry-run", action="store_true",
368
+ help="Load model+data, run 1 step, exit.")
369
+ ap.add_argument("--eval-only", action="store_true",
370
+ help="Load sft_final.pt and run sample gen.")
371
+ args = ap.parse_args()
372
+
373
+ t_start = time.time()
374
+ torch.manual_seed(SEED + 1) # +1 so SFT draws different RNG than pretrain
375
+ torch.cuda.manual_seed(SEED + 1)
376
+ torch.set_float32_matmul_precision("high")
377
+ device = torch.device("cuda")
378
+ autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
379
+
380
+ # --- Tokenizer ---
381
+ tokenizer = Tokenizer.from_directory()
382
+ vocab_size = tokenizer.get_vocab_size()
383
+ print(f"[init] vocab: {vocab_size}", flush=True)
384
+
385
+ # --- Data meta ---
386
+ meta = _load_meta()
387
+ print(f"[data] meta: {meta}", flush=True)
388
+
389
+ # --- Model ---
390
+ model = build_model(vocab_size, device)
391
+ n_params = sum(p.numel() for p in model.parameters())
392
+ print(f"[model] params: {n_params:,}", flush=True)
393
+
394
+ loaded, msg = try_load_pretrain(model)
395
+ mode = "resume_from_pretrain" if loaded else "from_scratch"
396
+ print(f"[init] MODE={mode} :: {msg}", flush=True)
397
+
398
+ # --- Eval-only path ---
399
+ if args.eval_only:
400
+ if SFT_FINAL_CKPT.exists():
401
+ ckpt = torch.load(str(SFT_FINAL_CKPT), map_location=device,
402
+ weights_only=False)
403
+ state = ckpt.get("model_state_dict", ckpt)
404
+ model.load_state_dict(state, strict=False)
405
+ print(f"[eval-only] loaded {SFT_FINAL_CKPT}", flush=True)
406
+ else:
407
+ print(f"[eval-only] no SFT checkpoint β€” running on current weights",
408
+ flush=True)
409
+ run_samples(model, tokenizer, meta, step=-1)
410
+ return
411
+
412
+ # --- Dataloader ---
413
+ print(f"[data] loading shards ...", flush=True)
414
+ tokens, mask = _load_shards()
415
+ print(f"[data] tokens: {len(tokens):,} loss-positions: {int(mask.sum()):,}",
416
+ flush=True)
417
+ B = DEVICE_BATCH_SIZE
418
+ T = SFT_SEQ_LEN
419
+ tokens_per_fwdbwd = B * T
420
+ assert TOTAL_BATCH_SIZE % tokens_per_fwdbwd == 0, (
421
+ f"TOTAL_BATCH_SIZE={TOTAL_BATCH_SIZE} not divisible by B*T={tokens_per_fwdbwd}"
422
+ )
423
+ grad_accum = TOTAL_BATCH_SIZE // tokens_per_fwdbwd
424
+ print(f"[train] B={B} T={T} accum={grad_accum} effective_batch={TOTAL_BATCH_SIZE}",
425
+ flush=True)
426
+ loader = make_sft_dataloader(tokens, mask, B, T, device, seed=SEED + 7)
427
+ x, y, epoch = next(loader)
428
+
429
+ # --- Optimizer (scaled LRs) ---
430
+ matrix_lr = MATRIX_LR * SFT_LR_MULT
431
+ embed_lr = EMBEDDING_LR * SFT_LR_MULT
432
+ unembed_lr = UNEMBEDDING_LR * SFT_LR_MULT
433
+ scalar_lr = SCALAR_LR * SFT_LR_MULT
434
+ print(f"[opt] LRs scaled by {SFT_LR_MULT}: matrix={matrix_lr:.5f} "
435
+ f"embed={embed_lr:.5f} unembed={unembed_lr:.6f}", flush=True)
436
+ optimizer = model.setup_optimizer(
437
+ unembedding_lr=unembed_lr,
438
+ embedding_lr=embed_lr,
439
+ scalar_lr=scalar_lr,
440
+ adam_betas=ADAM_BETAS,
441
+ matrix_lr=matrix_lr,
442
+ weight_decay=WEIGHT_DECAY,
443
+ )
444
+
445
+ # --- Dry-run path (validation) ---
446
+ if args.dry_run:
447
+ print("[dry-run] running 1 step ...", flush=True)
448
+ with autocast_ctx:
449
+ loss = model(x, y)
450
+ loss_f = float(loss.item())
451
+ print(f"[dry-run] step0 loss={loss_f:.4f}", flush=True)
452
+ loss.backward()
453
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
454
+ optimizer.step()
455
+ model.zero_grad(set_to_none=True)
456
+ if math.isnan(loss_f) or loss_f > 100:
457
+ print("[dry-run] FAILED (NaN / huge loss)", flush=True)
458
+ sys.exit(1)
459
+ print("[dry-run] OK", flush=True)
460
+ return
461
+
462
+ # --- Training loop ---
463
+ print(f"[train] budget={SFT_TIME_BUDGET}s eval_every={SFT_EVAL_INTERVAL} "
464
+ f"ckpt_every={SFT_CKPT_INTERVAL}", flush=True)
465
+ t_loop_start = time.time()
466
+ smooth_loss = 0.0
467
+ step = 0
468
+ total_train_secs = 0.0
469
+
470
+ # Warmup schedule for SFT: linear 0->1 over first 5% of budget, then cosine.
471
+ sft_warmup_frac = 0.05
472
+
473
+ def lr_mult(progress: float) -> float:
474
+ if progress < sft_warmup_frac:
475
+ return progress / sft_warmup_frac if sft_warmup_frac > 0 else 1.0
476
+ decay = (progress - sft_warmup_frac) / (1.0 - sft_warmup_frac)
477
+ return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * \
478
+ (1 + math.cos(math.pi * decay))
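Sanity-checking the schedule at its endpoints (FINAL_LR_FRAC = 0.1 is assumed here for illustration; the real value lives in hydra.config):

    import math

    FINAL_LR_FRAC = 0.1   # assumption for this sketch
    warmup = 0.05

    def lr_mult(p):
        if p < warmup:
            return p / warmup
        d = (p - warmup) / (1.0 - warmup)
        return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * (1 + math.cos(math.pi * d))

    assert lr_mult(0.0) == 0.0                          # cold start
    assert abs(lr_mult(0.05) - 1.0) < 1e-12             # full LR at warmup end
    assert abs(lr_mult(1.0) - FINAL_LR_FRAC) < 1e-12    # cosine floor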
479
+
480
+ while True:
481
+ torch.cuda.synchronize()
482
+ t0 = time.time()
483
+ for _ in range(grad_accum):
484
+ with autocast_ctx:
485
+ loss = model(x, y)
486
+ train_loss_val = loss.detach()
487
+ (loss / grad_accum).backward()
488
+ x, y, epoch = next(loader)
489
+
490
+ progress = min(total_train_secs / SFT_TIME_BUDGET, 1.0)
491
+ mult = lr_mult(progress)
492
+ for group in optimizer.param_groups:
493
+ group["lr"] = group["initial_lr"] * mult
494
+
495
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
496
+ optimizer.step()
497
+ model.zero_grad(set_to_none=True)
498
+
499
+ loss_f = float(train_loss_val.item())
500
+ if math.isnan(loss_f) or loss_f > 100:
501
+ print(f"[FAIL] step={step} loss={loss_f} β€” aborting", flush=True)
502
+ save_ckpt(model, step, smooth_loss, SFT_INTERIM_CKPT, mode, meta)
503
+ sys.exit(1)
504
+
505
+ torch.cuda.synchronize()
506
+ dt = time.time() - t0
507
+ if step > 3:
508
+ total_train_secs += dt
509
+
510
+ # EMA loss (debiased)
511
+ beta = 0.9
512
+ smooth_loss = beta * smooth_loss + (1 - beta) * loss_f
513
+ debiased = smooth_loss / (1 - beta ** (step + 1))
514
+ bpt = debiased / math.log(2)
515
+ tps = int(TOTAL_BATCH_SIZE / dt) if dt > 0 else 0
516
+ vram_mib = torch.cuda.memory_allocated() / 1024 / 1024
517
+ lr_now = optimizer.param_groups[0]["lr"]
518
+ remaining = max(0, SFT_TIME_BUDGET - total_train_secs)
519
+
520
+ print(
521
+ f"sft_step={step:05d} loss={debiased:.4f} bpt={bpt:.3f} "
522
+ f"tps={tps} dt_ms={dt*1000:.0f} lr={lr_now:.2e} "
523
+ f"vram={vram_mib:.0f}MiB pct={100*progress:.1f} "
524
+ f"epoch={epoch} remaining={remaining:.0f}s",
525
+ flush=True,
526
+ )
527
+
528
+ if step > 0 and step % SFT_EVAL_INTERVAL == 0:
529
+ run_samples(model, tokenizer, meta, step)
530
+
531
+ if step > 0 and step % SFT_CKPT_INTERVAL == 0:
532
+ save_ckpt(model, step, smooth_loss, SFT_INTERIM_CKPT, mode, meta)
533
+
534
+ step += 1
535
+
536
+ if step > 5 and total_train_secs >= SFT_TIME_BUDGET:
537
+ break
538
+
539
+ # Final samples + save
540
+ run_samples(model, tokenizer, meta, step)
541
+ save_ckpt(model, step, smooth_loss, SFT_FINAL_CKPT, mode, meta)
542
+
543
+ total_secs = time.time() - t_start
544
+ print("---", flush=True)
545
+ print(f"SFT_COMPLETE mode={mode} step={step} "
546
+ f"smoothed_loss={smooth_loss:.4f} total_seconds={total_secs:.0f} "
547
+ f"train_seconds={total_train_secs:.0f}", flush=True)
548
+
549
+
550
+ if __name__ == "__main__":
551
+ try:
552
+ main()
553
+ except SystemExit:
554
+ raise
555
+ except Exception as e:
556
+ import traceback
557
+ print(f"SFT_FAILED {type(e).__name__}: {e}", flush=True)
558
+ traceback.print_exc()
559
+ sys.exit(1)
overlay/scripts/sft_orchestrator.sh CHANGED
@@ -1,165 +1,165 @@
1
- #!/usr/bin/env bash
2
- #
3
- # SFT orchestrator: waits for pretrain (train.py) to either complete or
4
- # reach the 8h budget, then kicks off SFT.
5
- #
6
- # Behavior:
7
- # - Polls for `train.py` process every 60 s
8
- # - Exits the wait loop on either:
9
- # (a) no train.py process found (pretrain completed naturally), or
10
- # (b) 8h elapsed since this script started
11
- # - Sends SIGTERM first (graceful -- triggers checkpoint-save patch if
12
- # applied), waits 30 s, then SIGKILL as fallback
13
- # - Invokes `scripts/download_sft_data.py` if shards don't exist
14
- # - Launches `scripts/sft.py` in the background with tuned env vars
15
- # - Redirects all output to `run_sft.log`
16
- #
17
- # Re-entrant: safe to invoke even if pretrain has already exited.
18
- # Does NOT re-launch if SFT is already running.
19
- #
20
- # Usage (typical):
21
- # cd /home/mikeb/work/feather
22
- # nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 &
23
- # disown
24
-
25
- set -u # error on unset vars, but don't -e (we handle failures explicitly)
26
-
27
- REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
28
- cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }
29
-
30
- PY="$REPO_ROOT/.venv/bin/python"
31
- if [ ! -x "$PY" ]; then
32
- echo "[orchestrator] ERROR: python not found at $PY" >&2
33
- exit 1
34
- fi
35
-
36
- LOG_FILE="$REPO_ROOT/run_sft.log"
37
- DATA_LOG="$REPO_ROOT/run_sft_download.log"
38
- MAX_WAIT_SECONDS=28800 # 8 hours
39
- POLL_INTERVAL=60
40
- GRACEFUL_SHUTDOWN_WAIT=30
41
-
42
- log() {
43
- echo "[orchestrator $(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"
44
- }
45
-
46
- # ---------------------------------------------------------------------------
47
- # Stage 1: wait for pretrain
48
- # ---------------------------------------------------------------------------
49
-
50
- log "starting; max wait = ${MAX_WAIT_SECONDS}s"
51
-
52
- # Guard against double-launch
53
- if pgrep -f "scripts/sft.py" > /dev/null; then
54
- log "SFT is already running β€” exiting orchestrator to avoid conflict"
55
- exit 0
56
- fi
57
-
58
- T_START=$(date +%s)
59
- while true; do
60
- NOW=$(date +%s)
61
- ELAPSED=$((NOW - T_START))
62
-
63
- if [ $ELAPSED -ge $MAX_WAIT_SECONDS ]; then
64
- log "reached 8h wait cap (${ELAPSED}s) β€” will kill pretrain"
65
- break
66
- fi
67
-
68
- # Count train.py processes owned by current user (not orchestrator/sft.py)
69
- PRETRAIN_PIDS=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
70
- # Strip pid of this script if pgrep matched something spurious
71
- PRETRAIN_PIDS=$(echo "$PRETRAIN_PIDS" | sed "s/\b$$\b//g" | xargs)
72
-
73
- if [ -z "$PRETRAIN_PIDS" ]; then
74
- log "no train.py process found β€” pretrain already exited"
75
- break
76
- fi
77
-
78
- # Log a status every 10 polls (~10 min)
79
- if [ $((ELAPSED / POLL_INTERVAL % 10)) -eq 0 ]; then
80
- log "waiting... elapsed=${ELAPSED}s pretrain PIDs: $PRETRAIN_PIDS"
81
- fi
82
-
83
- sleep $POLL_INTERVAL
84
- done
85
-
86
- # ---------------------------------------------------------------------------
87
- # Stage 2: kill any remaining pretrain processes
88
- # ---------------------------------------------------------------------------
89
-
90
- PRETRAIN_PIDS=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
91
- if [ -n "$PRETRAIN_PIDS" ]; then
92
- log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
93
- for pid in $PRETRAIN_PIDS; do
94
- kill -TERM "$pid" 2>/dev/null || true
95
- done
96
-
97
- # Wait for graceful shutdown (gives the checkpoint-save patch time to run)
98
- for _ in $(seq 1 $GRACEFUL_SHUTDOWN_WAIT); do
99
- REMAINING=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
100
- if [ -z "$REMAINING" ]; then break; fi
101
- sleep 1
102
- done
103
-
104
- # Force-kill any stragglers
105
- REMAINING=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
106
- if [ -n "$REMAINING" ]; then
107
- log "force-killing stragglers: $REMAINING"
108
- for pid in $REMAINING; do
109
- kill -9 "$pid" 2>/dev/null || true
110
- done
111
- sleep 5
112
- fi
113
- fi
114
-
115
- # ---------------------------------------------------------------------------
116
- # Stage 3: ensure SFT data exists
117
- # ---------------------------------------------------------------------------
118
-
119
- META_JSON="$REPO_ROOT/data/sft/meta.json"
120
- if [ ! -f "$META_JSON" ]; then
121
- log "no SFT data found β€” running download_sft_data.py"
122
- LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
123
- "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
124
- > "$DATA_LOG" 2>&1
125
- DL_RC=$?
126
- if [ $DL_RC -ne 0 ] || [ ! -f "$META_JSON" ]; then
127
- log "ERROR: SFT data download failed (rc=$DL_RC)"
128
- log " last 20 lines of $DATA_LOG:"
129
- tail -20 "$DATA_LOG" 2>/dev/null | sed 's/^/ /'
130
- exit 2
131
- fi
132
- log "SFT data ready"
133
- else
134
- log "SFT data already present at $META_JSON"
135
- fi
136
-
137
- # ---------------------------------------------------------------------------
138
- # Stage 4: launch SFT
139
- # ---------------------------------------------------------------------------
140
-
141
- # Guard: if we somehow got here and SFT is now running, don't double-launch.
142
- if pgrep -f "scripts/sft.py" > /dev/null; then
143
- log "SFT is already running β€” skipping launch"
144
- exit 0
145
- fi
146
-
147
- log "launching SFT (log -> $LOG_FILE)"
148
-
149
- export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
150
- export HYDRA_SFT_TIME_BUDGET="${HYDRA_SFT_TIME_BUDGET:-10800}"
151
- export HYDRA_BATCH_SIZE="${HYDRA_BATCH_SIZE:-4}"
152
- export HYDRA_TOTAL_BATCH="${HYDRA_TOTAL_BATCH:-8192}"
153
- export HYDRA_SFT_SEQ_LEN="${HYDRA_SFT_SEQ_LEN:-512}"
154
- export HYDRA_SFT_LR_MULT="${HYDRA_SFT_LR_MULT:-0.10}"
155
- export HYDRA_SFT_EVAL_INTERVAL="${HYDRA_SFT_EVAL_INTERVAL:-500}"
156
- export HYDRA_SFT_CKPT_INTERVAL="${HYDRA_SFT_CKPT_INTERVAL:-2000}"
157
- export HYDRA_DROPOUT="${HYDRA_DROPOUT:-0.1}"
158
-
159
- nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" \
160
- > "$LOG_FILE" 2>&1 &
161
- SFT_PID=$!
162
- disown $SFT_PID 2>/dev/null || true
163
-
164
- log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
165
- log "monitor with: tail -f $LOG_FILE"
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # SFT orchestrator: waits for pretrain (train.py) to either complete or
4
+ # reach the 8h budget, then kicks off SFT.
5
+ #
6
+ # Behavior:
7
+ # - Polls for `train.py` process every 60 s
8
+ # - Exits the wait loop on either:
9
+ # (a) no train.py process found (pretrain completed naturally), or
10
+ # (b) 8h elapsed since this script started
11
+ # - Sends SIGTERM first (graceful -- triggers checkpoint-save patch if
12
+ # applied), waits 30 s, then SIGKILL as fallback
13
+ # - Invokes `scripts/download_sft_data.py` if shards don't exist
14
+ # - Launches `scripts/sft.py` in the background with tuned env vars
15
+ # - Redirects all output to `run_sft.log`
16
+ #
17
+ # Re-entrant: safe to invoke even if pretrain has already exited.
18
+ # Does NOT re-launch if SFT is already running.
19
+ #
20
+ # Usage (typical):
21
+ # cd /home/mikeb/work/feather
22
+ # nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 &
23
+ # disown
24
+
25
+ set -u # error on unset vars, but don't -e (we handle failures explicitly)
26
+
27
+ REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
28
+ cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }
29
+
30
+ PY="$REPO_ROOT/.venv/bin/python"
31
+ if [ ! -x "$PY" ]; then
32
+ echo "[orchestrator] ERROR: python not found at $PY" >&2
33
+ exit 1
34
+ fi
35
+
36
+ LOG_FILE="$REPO_ROOT/run_sft.log"
37
+ DATA_LOG="$REPO_ROOT/run_sft_download.log"
38
+ MAX_WAIT_SECONDS=28800 # 8 hours
39
+ POLL_INTERVAL=60
40
+ GRACEFUL_SHUTDOWN_WAIT=30
41
+
42
+ log() {
43
+ echo "[orchestrator $(date -u '+%Y-%m-%dT%H:%M:%SZ')] $*"
44
+ }
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Stage 1: wait for pretrain
48
+ # ---------------------------------------------------------------------------
49
+
50
+ log "starting; max wait = ${MAX_WAIT_SECONDS}s"
51
+
52
+ # Guard against double-launch
53
+ if pgrep -f "scripts/sft.py" > /dev/null; then
54
+ log "SFT is already running β€” exiting orchestrator to avoid conflict"
55
+ exit 0
56
+ fi
57
+
58
+ T_START=$(date +%s)
59
+ while true; do
60
+ NOW=$(date +%s)
61
+ ELAPSED=$((NOW - T_START))
62
+
63
+ if [ $ELAPSED -ge $MAX_WAIT_SECONDS ]; then
64
+ log "reached 8h wait cap (${ELAPSED}s) β€” will kill pretrain"
65
+ break
66
+ fi
67
+
68
+ # Count train.py processes owned by current user (not orchestrator/sft.py)
69
+ PRETRAIN_PIDS=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
70
+ # Strip pid of this script if pgrep matched something spurious
71
+ PRETRAIN_PIDS=$(echo "$PRETRAIN_PIDS" | sed "s/\b$$\b//g" | xargs)
72
+
73
+ if [ -z "$PRETRAIN_PIDS" ]; then
74
+ log "no train.py process found β€” pretrain already exited"
75
+ break
76
+ fi
77
+
78
+ # Log a status every 10 polls (~10 min)
79
+ if [ $((ELAPSED / POLL_INTERVAL % 10)) -eq 0 ]; then
80
+ log "waiting... elapsed=${ELAPSED}s pretrain PIDs: $PRETRAIN_PIDS"
81
+ fi
82
+
83
+ sleep $POLL_INTERVAL
84
+ done
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Stage 2: kill any remaining pretrain processes
88
+ # ---------------------------------------------------------------------------
89
+
90
+ PRETRAIN_PIDS=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
91
+ if [ -n "$PRETRAIN_PIDS" ]; then
92
+ log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
93
+ for pid in $PRETRAIN_PIDS; do
94
+ kill -TERM "$pid" 2>/dev/null || true
95
+ done
96
+
97
+ # Wait for graceful shutdown (gives the checkpoint-save patch time to run)
98
+ for _ in $(seq 1 $GRACEFUL_SHUTDOWN_WAIT); do
99
+ REMAINING=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
100
+ if [ -z "$REMAINING" ]; then break; fi
101
+ sleep 1
102
+ done
103
+
104
+ # Force-kill any stragglers
105
+ REMAINING=$(pgrep -u "$USER" -f "train\.py" 2>/dev/null | tr '\n' ' ')
106
+ if [ -n "$REMAINING" ]; then
107
+ log "force-killing stragglers: $REMAINING"
108
+ for pid in $REMAINING; do
109
+ kill -9 "$pid" 2>/dev/null || true
110
+ done
111
+ sleep 5
112
+ fi
113
+ fi
114
+
115
+ # ---------------------------------------------------------------------------
116
+ # Stage 3: ensure SFT data exists
117
+ # ---------------------------------------------------------------------------
118
+
119
+ META_JSON="$REPO_ROOT/data/sft/meta.json"
120
+ if [ ! -f "$META_JSON" ]; then
121
+ log "no SFT data found β€” running download_sft_data.py"
122
+ LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
123
+ "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
124
+ > "$DATA_LOG" 2>&1
125
+ DL_RC=$?
126
+ if [ $DL_RC -ne 0 ] || [ ! -f "$META_JSON" ]; then
127
+ log "ERROR: SFT data download failed (rc=$DL_RC)"
128
+ log " last 20 lines of $DATA_LOG:"
129
+ tail -20 "$DATA_LOG" 2>/dev/null | sed 's/^/ /'
130
+ exit 2
131
+ fi
132
+ log "SFT data ready"
133
+ else
134
+ log "SFT data already present at $META_JSON"
135
+ fi
136
+
137
+ # ---------------------------------------------------------------------------
138
+ # Stage 4: launch SFT
139
+ # ---------------------------------------------------------------------------
140
+
141
+ # Guard: if we somehow got here and SFT is now running, don't double-launch.
142
+ if pgrep -f "scripts/sft.py" > /dev/null; then
143
+ log "SFT is already running β€” skipping launch"
144
+ exit 0
145
+ fi
146
+
147
+ log "launching SFT (log -> $LOG_FILE)"
148
+
149
+ export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
150
+ export HYDRA_SFT_TIME_BUDGET="${HYDRA_SFT_TIME_BUDGET:-10800}"
151
+ export HYDRA_BATCH_SIZE="${HYDRA_BATCH_SIZE:-4}"
152
+ export HYDRA_TOTAL_BATCH="${HYDRA_TOTAL_BATCH:-8192}"
153
+ export HYDRA_SFT_SEQ_LEN="${HYDRA_SFT_SEQ_LEN:-512}"
154
+ export HYDRA_SFT_LR_MULT="${HYDRA_SFT_LR_MULT:-0.10}"
155
+ export HYDRA_SFT_EVAL_INTERVAL="${HYDRA_SFT_EVAL_INTERVAL:-500}"
156
+ export HYDRA_SFT_CKPT_INTERVAL="${HYDRA_SFT_CKPT_INTERVAL:-2000}"
157
+ export HYDRA_DROPOUT="${HYDRA_DROPOUT:-0.1}"
158
+
159
+ nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" \
160
+ > "$LOG_FILE" 2>&1 &
161
+ SFT_PID=$!
162
+ disown $SFT_PID 2>/dev/null || true
163
+
164
+ log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
165
+ log "monitor with: tail -f $LOG_FILE"
overlay/subsystems/fused_sdr_project.py CHANGED
@@ -114,6 +114,13 @@ class FusedSDRProject(torch.autograd.Function):
114
 
115
  out = torch.empty(P, D, device=active.device, dtype=sdr_proj_weight.dtype)
116
 
117
  BLOCK_D = min(256, triton.next_power_of_2(D))
118
  grid = (P * triton.cdiv(D, BLOCK_D),)
119
 
 
114
 
115
  out = torch.empty(P, D, device=active.device, dtype=sdr_proj_weight.dtype)
116
 
117
+ if not active.is_cuda:
118
+ # Local CPU validation has no Triton driver. Keep the same custom
119
+ # autograd contract but use a deterministic gather+sum fallback.
120
+ out = wt[active].sum(dim=1).to(dtype=sdr_proj_weight.dtype)
121
+ ctx.save_for_backward(active, token_ids, sdr_proj_weight, delta_u, delta_v)
122
+ return out.view(B, T, D)
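The fallback's gather+sum states densely what the Triton kernel computes per position: each output row is the sum of the weight rows selected by that position's active SDR bits. A standalone shape sketch with toy sizes, where `wt` stands in for the (n_bits, D) projection matrix implied by the surrounding code:

    import torch

    BT, K, n_bits, D = 6, 4, 16, 8                 # positions, active bits, dims
    wt = torch.randn(n_bits, D)
    active = torch.randint(0, n_bits, (BT, K))     # K active bit indices per row
    out = wt[active].sum(dim=1)                    # (BT, K, D) -> (BT, D)
    assert out.shape == (BT, D)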
123
+
124
  BLOCK_D = min(256, triton.next_power_of_2(D))
125
  grid = (P * triton.cdiv(D, BLOCK_D),)
126
 
overlay/subsystems/htm.py CHANGED
@@ -99,13 +99,19 @@ class HTMLayer(nn.Module):
99
  self._forward_counter = 0
100
  # GPU backend gate. Default: auto-detect -- use GPU when the pyo3
101
  # module was built with --features gpu AND CUDA is actually usable.
102
  if use_gpu is None:
103
- use_gpu = _HTM_HAS_GPU and torch.cuda.is_available()
104
  elif use_gpu and not _HTM_HAS_GPU:
105
  raise RuntimeError(
106
  "HTMLayer(use_gpu=True) but htm_rust was not built with "
107
  "--features gpu. Re-run `maturin develop --features gpu`."
108
  )
109
  self._use_gpu = bool(use_gpu)
110
  cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
111
  self._region_cls = cls
 
99
  self._forward_counter = 0
100
  # GPU backend gate. Default: auto-detect -- use GPU when the pyo3
101
  # module was built with --features gpu AND CUDA is actually usable.
102
+ # HYDRA_FORCE_HTM_CPU=1 is an operational safety valve for paid remote
103
+ # canaries when the compiled CUDA HTM backend is present but unstable on
104
+ # a specific hardware/runtime combination.
105
+ force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
106
  if use_gpu is None:
107
+ use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
108
  elif use_gpu and not _HTM_HAS_GPU:
109
  raise RuntimeError(
110
  "HTMLayer(use_gpu=True) but htm_rust was not built with "
111
  "--features gpu. Re-run `maturin develop --features gpu`."
112
  )
113
+ elif use_gpu and force_cpu:
114
+ use_gpu = False
115
  self._use_gpu = bool(use_gpu)
116
  cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
117
  self._region_cls = cls
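Spelled out as a standalone truth function, the resulting gate looks like this (a sketch mirroring the branch above, not the module itself). Operationally, HYDRA_FORCE_HTM_CPU=1 downgrades even an explicit use_gpu=True rather than raising:

    def resolve_use_gpu(requested, has_gpu_build, cuda_ok, force_cpu):
        # requested: None = auto-detect, True/False = explicit caller choice
        if requested is None:
            return (not force_cpu) and has_gpu_build and cuda_ok
        if requested and not has_gpu_build:
            raise RuntimeError("rebuild htm_rust with --features gpu")
        if requested and force_cpu:
            return False          # env safety valve wins over explicit True
        return requested

    assert resolve_use_gpu(None, True, True, force_cpu=True) is False
    assert resolve_use_gpu(True, True, True, force_cpu=True) is False
    assert resolve_use_gpu(None, True, True, force_cpu=False) is True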
overlay/subsystems/sdr_semantic.py CHANGED
@@ -46,19 +46,10 @@ class _SDRSTE(torch.autograd.Function):
46
  flat_grad = grad_out.reshape(B * T, n_bits).to(delta_v.dtype)
47
  flat_ids = token_ids.reshape(B * T)
48
  V = delta_u.shape[0]
49
- R = delta_u.shape[1] # delta_rank -- typically 32
50
- # OOM fix: old code allocated (V, n_bits) = 4GB buffer via index_add.
51
- # Instead, project to rank-R space first (small), then scatter.
52
- # grad_delta_u[t, r] = sum_{pos: flat_ids[pos]=t} (flat_grad[pos] @ delta_v[r])
53
- # = index_add(V, R, flat_ids, flat_grad @ delta_v.T)
54
- projected = flat_grad @ delta_v.t() # (B*T, R) -- ~1MB at B=8,T=1024,R=32
55
- per_tok_u = torch.zeros(V, R, device=flat_grad.device, dtype=delta_v.dtype)
56
- per_tok_u.index_add_(0, flat_ids, projected)
57
- grad_delta_u = per_tok_u # (V, R) -- ~8MB at V=65536
58
- # grad_delta_v = sum_{pos} delta_u[flat_ids[pos]]^T @ flat_grad[pos]
59
- # = delta_u[flat_ids].T @ flat_grad -- no intermediate buffer
60
- gathered_u = delta_u[flat_ids] # (B*T, R) -- ~1MB
61
- grad_delta_v = gathered_u.t() @ flat_grad # (R, n_bits) -- ~2MB
62
  return None, grad_delta_u, grad_delta_v, None
63
 
64
 
@@ -249,25 +240,12 @@ class SemanticFoldingSDR(nn.Module):
249
  sdr_binary = sdr_binary.view(B, T, self.n_bits)
250
  return _SDRSTE.apply(sdr_binary, self.delta_u, self.delta_v, token_ids)
251
 
252
- @torch.no_grad()
253
- def active_indices(self, token_ids: torch.Tensor) -> torch.Tensor:
254
- """Compact int16 Reality Buffer view: (B,T,K) active retina offsets.
255
-
256
- This is the production discrete bridge for Cantor/Engram routing. It
257
- avoids reconstructing dense (B,T,n_bits) masks when consumers only need
258
- the L0 support set.
259
- """
260
- if token_ids.dim() != 2:
261
- raise ValueError(f"expected (B, T) token_ids, got shape {tuple(token_ids.shape)}")
262
- B, T = token_ids.shape
263
- return self._retina_indices[token_ids.reshape(-1)].view(B, T, self.target_active)
264
-
265
  @torch.no_grad()
266
  def binary_only(self, token_ids: torch.Tensor) -> torch.Tensor:
267
  """uint8 retina view β€” no STE, no autocast cost. For HTM/consumers that
268
  only need the binary pattern. Reconstructs dense from CSR indices."""
269
  B, T = token_ids.shape
270
- idx = self.active_indices(token_ids).reshape(B * T, self.target_active)
271
  sdr = torch.zeros(
272
  B * T, self.n_bits, dtype=torch.uint8, device=token_ids.device,
273
  )
 
46
  flat_grad = grad_out.reshape(B * T, n_bits).to(delta_v.dtype)
47
  flat_ids = token_ids.reshape(B * T)
48
  V = delta_u.shape[0]
49
+ per_tok = torch.zeros(V, n_bits, device=flat_grad.device, dtype=delta_v.dtype)
50
+ per_tok.index_add_(0, flat_ids, flat_grad)
51
+ grad_delta_u = per_tok @ delta_v.t()
52
+ grad_delta_v = delta_u.t() @ per_tok
 
  return None, grad_delta_u, grad_delta_v, None
54
 
55
 
 
240
  sdr_binary = sdr_binary.view(B, T, self.n_bits)
241
  return _SDRSTE.apply(sdr_binary, self.delta_u, self.delta_v, token_ids)
242
 
243
  @torch.no_grad()
244
  def binary_only(self, token_ids: torch.Tensor) -> torch.Tensor:
245
  """uint8 retina view β€” no STE, no autocast cost. For HTM/consumers that
246
  only need the binary pattern. Reconstructs dense from CSR indices."""
247
  B, T = token_ids.shape
248
+ idx = self._retina_indices[token_ids.reshape(-1)] # (B*T, K) int16
249
  sdr = torch.zeros(
250
  B * T, self.n_bits, dtype=torch.uint8, device=token_ids.device,
251
  )
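The hunk ends before the densification itself; the standard way to turn such a (B*T, K) index view into the zeroed (B*T, n_bits) buffer is a row-wise scatter. A sketch, not necessarily the module's exact continuation:

    import torch

    B, T, K, n_bits = 2, 3, 4, 16
    idx = torch.randint(0, n_bits, (B * T, K))   # stand-in for the retina view
    sdr = torch.zeros(B * T, n_bits, dtype=torch.uint8)
    sdr.scatter_(1, idx.long(), 1)               # set the K active bits per row
    sdr = sdr.view(B, T, n_bits)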