Spaces:
Runtime error
Runtime error
Update Feather a10g-large training runtime image
Browse files
overlay/scripts/launch_feather_hf_job.py
CHANGED
|
@@ -151,6 +151,7 @@ def main() -> int:
|
|
| 151 |
'HYDRA_PROFILE_STEPS': PROFILE_STEPS,
|
| 152 |
'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
|
| 153 |
'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
|
|
|
|
| 154 |
'PYTHONUNBUFFERED': '1',
|
| 155 |
'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
|
| 156 |
'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
|
|
|
|
| 151 |
'HYDRA_PROFILE_STEPS': PROFILE_STEPS,
|
| 152 |
'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
|
| 153 |
'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
|
| 154 |
+
'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
|
| 155 |
'PYTHONUNBUFFERED': '1',
|
| 156 |
'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
|
| 157 |
'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
|
overlay/subsystems/__pycache__/htm.cpython-312.pyc
CHANGED
|
Binary files a/overlay/subsystems/__pycache__/htm.cpython-312.pyc and b/overlay/subsystems/__pycache__/htm.cpython-312.pyc differ
|
|
|
overlay/subsystems/htm.py
CHANGED
|
@@ -93,20 +93,25 @@ class HTMLayer(nn.Module):
|
|
| 93 |
# instead of every forward cuts HTM cost ~2x. Hebbian learning still
|
| 94 |
# converges since the EMA accumulates over many calls. Env:
|
| 95 |
# HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward; values below 1 are clamped to 1).
|
| 96 |
-
import os as _os
|
| 97 |
-
self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
|
| 98 |
-
self._forward_counter = 0
|
| 99 |
-
|
| 100 |
-
#
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
"
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
self._region_cls = cls
|
| 111 |
self._regions = [
|
| 112 |
cls(input_bits, n_columns, cells_per_column, seed + i)
|
|
|
|
| 93 |
# instead of every forward cuts HTM cost ~2x. Hebbian learning still
|
| 94 |
# converges since the EMA accumulates over many calls. Env:
|
| 95 |
# HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward; values below 1 are clamped to 1).
|
| 96 |
+
import os as _os
|
| 97 |
+
self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
|
| 98 |
+
self._forward_counter = 0
|
| 99 |
+
force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
|
| 100 |
+
# GPU backend gate. Default: auto-detect — use GPU when the pyo3
|
| 101 |
+
# module was built with --features gpu AND CUDA is actually usable.
|
| 102 |
+
if use_gpu is None:
|
| 103 |
+
use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
|
| 104 |
+
elif use_gpu and not _HTM_HAS_GPU:
|
| 105 |
+
raise RuntimeError(
|
| 106 |
+
"HTMLayer(use_gpu=True) but htm_rust was not built with "
|
| 107 |
+
"--features gpu. Re-run `maturin develop --features gpu`."
|
| 108 |
+
)
|
| 109 |
+
if force_cpu:
|
| 110 |
+
use_gpu = False
|
| 111 |
+
self._use_gpu = bool(use_gpu)
|
| 112 |
+
if force_cpu:
|
| 113 |
+
print("[htm] HYDRA_FORCE_HTM_CPU=1 -> using CPU HTM backend", flush=True)
|
| 114 |
+
cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
|
| 115 |
self._region_cls = cls
|
| 116 |
self._regions = [
|
| 117 |
cls(input_bits, n_columns, cells_per_column, seed + i)
|