Jackoatmon committed on
Commit
ebe0fed
·
verified ·
1 Parent(s): 8a0b9b5

Update Feather a10g-large training runtime image

Browse files
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -151,6 +151,7 @@ def main() -> int:
151
  'HYDRA_PROFILE_STEPS': PROFILE_STEPS,
152
  'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
153
  'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
 
154
  'PYTHONUNBUFFERED': '1',
155
  'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
156
  'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
 
151
  'HYDRA_PROFILE_STEPS': PROFILE_STEPS,
152
  'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
153
  'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
154
+ 'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
155
  'PYTHONUNBUFFERED': '1',
156
  'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
157
  'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
overlay/subsystems/__pycache__/htm.cpython-312.pyc CHANGED
Binary files a/overlay/subsystems/__pycache__/htm.cpython-312.pyc and b/overlay/subsystems/__pycache__/htm.cpython-312.pyc differ
 
overlay/subsystems/htm.py CHANGED
@@ -93,20 +93,25 @@ class HTMLayer(nn.Module):
93
  # instead of every forward cuts HTM cost ~2x. Hebbian learning still
94
  # converges since the EMA accumulates over many calls. Env:
95
  # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
96
- import os as _os
97
- self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
98
- self._forward_counter = 0
99
- # GPU backend gate. Default: auto-detect — use GPU when the pyo3
100
- # module was built with --features gpu AND CUDA is actually usable.
101
- if use_gpu is None:
102
- use_gpu = _HTM_HAS_GPU and torch.cuda.is_available()
103
- elif use_gpu and not _HTM_HAS_GPU:
104
- raise RuntimeError(
105
- "HTMLayer(use_gpu=True) but htm_rust was not built with "
106
- "--features gpu. Re-run `maturin develop --features gpu`."
107
- )
108
- self._use_gpu = bool(use_gpu)
109
- cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
 
 
 
 
 
110
  self._region_cls = cls
111
  self._regions = [
112
  cls(input_bits, n_columns, cells_per_column, seed + i)
 
93
  # instead of every forward cuts HTM cost ~2x. Hebbian learning still
94
  # converges since the EMA accumulates over many calls. Env:
95
  # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
96
+ import os as _os
97
+ self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
98
+ self._forward_counter = 0
99
+ force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
100
+ # GPU backend gate. Default: auto-detect use GPU when the pyo3
101
+ # module was built with --features gpu AND CUDA is actually usable.
102
+ if use_gpu is None:
103
+ use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
104
+ elif use_gpu and not _HTM_HAS_GPU:
105
+ raise RuntimeError(
106
+ "HTMLayer(use_gpu=True) but htm_rust was not built with "
107
+ "--features gpu. Re-run `maturin develop --features gpu`."
108
+ )
109
+ if force_cpu:
110
+ use_gpu = False
111
+ self._use_gpu = bool(use_gpu)
112
+ if force_cpu:
113
+ print("[htm] HYDRA_FORCE_HTM_CPU=1 -> using CPU HTM backend", flush=True)
114
+ cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
115
  self._region_cls = cls
116
  self._regions = [
117
  cls(input_bits, n_columns, cells_per_column, seed + i)