Jackoatmon committed on
Commit
ebe0fed
·
verified ·
1 Parent(s): 8a0b9b5

Update Feather a10g-large training runtime image

Browse files
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -151,6 +151,7 @@ def main() -> int:
151
  'HYDRA_PROFILE_STEPS': PROFILE_STEPS,
152
  'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
153
  'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
 
154
  'PYTHONUNBUFFERED': '1',
155
  'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
156
  'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
 
151
  'HYDRA_PROFILE_STEPS': PROFILE_STEPS,
152
  'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
153
  'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
154
+ 'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
155
  'PYTHONUNBUFFERED': '1',
156
  'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
157
  'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
overlay/subsystems/__pycache__/htm.cpython-312.pyc CHANGED
Binary files a/overlay/subsystems/__pycache__/htm.cpython-312.pyc and b/overlay/subsystems/__pycache__/htm.cpython-312.pyc differ
 
overlay/subsystems/htm.py CHANGED
@@ -93,20 +93,25 @@ class HTMLayer(nn.Module):
93
  # instead of every forward cuts HTM cost ~2x. Hebbian learning still
94
  # converges since the EMA accumulates over many calls. Env:
95
  # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
96
- import os as _os
97
- self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
98
- self._forward_counter = 0
99
- # GPU backend gate. Default: auto-detect — use GPU when the pyo3
100
- # module was built with --features gpu AND CUDA is actually usable.
101
- if use_gpu is None:
102
- use_gpu = _HTM_HAS_GPU and torch.cuda.is_available()
103
- elif use_gpu and not _HTM_HAS_GPU:
104
- raise RuntimeError(
105
- "HTMLayer(use_gpu=True) but htm_rust was not built with "
106
- "--features gpu. Re-run `maturin develop --features gpu`."
107
- )
108
- self._use_gpu = bool(use_gpu)
109
- cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
 
 
 
 
 
110
  self._region_cls = cls
111
  self._regions = [
112
  cls(input_bits, n_columns, cells_per_column, seed + i)
 
93
  # instead of every forward cuts HTM cost ~2x. Hebbian learning still
94
  # converges since the EMA accumulates over many calls. Env:
95
  # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
96
+ import os as _os
97
+ self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
98
+ self._forward_counter = 0
99
+ force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
100
+ # GPU backend gate. Default: auto-detect use GPU when the pyo3
101
+ # module was built with --features gpu AND CUDA is actually usable.
102
+ if use_gpu is None:
103
+ use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
104
+ elif use_gpu and not _HTM_HAS_GPU:
105
+ raise RuntimeError(
106
+ "HTMLayer(use_gpu=True) but htm_rust was not built with "
107
+ "--features gpu. Re-run `maturin develop --features gpu`."
108
+ )
109
+ if force_cpu:
110
+ use_gpu = False
111
+ self._use_gpu = bool(use_gpu)
112
+ if force_cpu:
113
+ print("[htm] HYDRA_FORCE_HTM_CPU=1 -> using CPU HTM backend", flush=True)
114
+ cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
115
  self._region_cls = cls
116
  self._regions = [
117
  cls(input_bits, n_columns, cells_per_column, seed + i)