Jackoatmon committed on
Commit
383a8e9
·
verified ·
1 Parent(s): ebe0fed

Update Feather a10g-large training runtime image

Browse files
__pycache__/entrypoint.cpython-312.pyc CHANGED
Binary files a/__pycache__/entrypoint.cpython-312.pyc and b/__pycache__/entrypoint.cpython-312.pyc differ
 
entrypoint.py CHANGED
@@ -169,6 +169,7 @@ def run_job_mode() -> int:
169
  os.environ.setdefault('HYDRA_PROFILE_STEPS', '3')
170
  os.environ.setdefault('HYDRA_MID_VAL_INTERVAL', '100')
171
  os.environ.setdefault('HYDRA_MUON_COMPILE', '0' if os.environ.get('FEATHER_CLEAN_REBUILD', '0') == '1' else '1')
 
172
  os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
173
  os.environ.setdefault('FEATHER_GPU_PROFILE', 'a10g-large')
174
  os.environ.setdefault('FEATHER_HF_OWNER', HF_OWNER)
@@ -179,6 +180,7 @@ def run_job_mode() -> int:
179
  print(f"[job] gpu_profile={os.environ['FEATHER_GPU_PROFILE']} htm_cuda_arch={os.environ['HTM_CUDA_ARCH']} torch_cuda_arch={os.environ['TORCH_CUDA_ARCH_LIST']}", flush=True)
180
  print(f"[job] profile_steps={os.environ['HYDRA_PROFILE_STEPS']} mid_val_interval={os.environ['HYDRA_MID_VAL_INTERVAL']}", flush=True)
181
  print(f"[job] clean_rebuild={os.environ.get('FEATHER_CLEAN_REBUILD', '0')} muon_compile={os.environ.get('HYDRA_MUON_COMPILE')}", flush=True)
 
182
 
183
  # CUDA readiness was kicked at module import via _early_cuda_kick. Keep
184
  # the wait as a second safety net — no-op if CUDA already ready.
 
169
  os.environ.setdefault('HYDRA_PROFILE_STEPS', '3')
170
  os.environ.setdefault('HYDRA_MID_VAL_INTERVAL', '100')
171
  os.environ.setdefault('HYDRA_MUON_COMPILE', '0' if os.environ.get('FEATHER_CLEAN_REBUILD', '0') == '1' else '1')
172
+ os.environ.setdefault('HYDRA_HYENA_LAYERS', ','.join(str(i) for i in range(int(os.environ.get('HYDRA_N_LAYER', '4')))))
173
  os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
174
  os.environ.setdefault('FEATHER_GPU_PROFILE', 'a10g-large')
175
  os.environ.setdefault('FEATHER_HF_OWNER', HF_OWNER)
 
180
  print(f"[job] gpu_profile={os.environ['FEATHER_GPU_PROFILE']} htm_cuda_arch={os.environ['HTM_CUDA_ARCH']} torch_cuda_arch={os.environ['TORCH_CUDA_ARCH_LIST']}", flush=True)
181
  print(f"[job] profile_steps={os.environ['HYDRA_PROFILE_STEPS']} mid_val_interval={os.environ['HYDRA_MID_VAL_INTERVAL']}", flush=True)
182
  print(f"[job] clean_rebuild={os.environ.get('FEATHER_CLEAN_REBUILD', '0')} muon_compile={os.environ.get('HYDRA_MUON_COMPILE')}", flush=True)
183
+ print(f"[job] hyena_layers={os.environ.get('HYDRA_HYENA_LAYERS')}", flush=True)
184
 
185
  # CUDA readiness was kicked at module import via _early_cuda_kick. Keep
186
  # the wait as a second safety net — no-op if CUDA already ready.
overlay/scripts/__pycache__/launch_feather_hf_job.cpython-312.pyc CHANGED
Binary files a/overlay/scripts/__pycache__/launch_feather_hf_job.cpython-312.pyc and b/overlay/scripts/__pycache__/launch_feather_hf_job.cpython-312.pyc differ
 
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -13,6 +13,7 @@ GPU_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-large')
13
  GPU_PROFILE = os.environ.get('FEATHER_GPU_PROFILE', GPU_FLAVOR)
14
  HF_OWNER = os.environ.get('FEATHER_HF_OWNER', os.environ.get('HF_OWNER', 'GAInTech'))
15
  JOB_NAMESPACE = os.environ.get('FEATHER_HF_JOB_NAMESPACE', HF_OWNER)
 
16
  GPU_ARCH_BY_FLAVOR = {
17
  'a10g-small': ('sm_86', '8.6'),
18
  'a10g-large': ('sm_86', '8.6'),
@@ -112,6 +113,7 @@ def main() -> int:
112
  print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
113
  print(f'[launch] profile_steps={PROFILE_STEPS} mid_val_interval={MID_VAL_INTERVAL}', flush=True)
114
  print(f'[launch] clean_rebuild={FORCE_CLEAN_REBUILD}', flush=True)
 
115
  print(f'[launch] flavor={GPU_FLAVOR} profile={GPU_PROFILE} htm_cuda_arch={HTM_CUDA_ARCH} torch_cuda_arch={TORCH_CUDA_ARCH}', flush=True)
116
  print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
117
  if not USE_SPACE_IMAGE:
@@ -152,6 +154,7 @@ def main() -> int:
152
  'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
153
  'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
154
  'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
 
155
  'PYTHONUNBUFFERED': '1',
156
  'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
157
  'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
 
13
  GPU_PROFILE = os.environ.get('FEATHER_GPU_PROFILE', GPU_FLAVOR)
14
  HF_OWNER = os.environ.get('FEATHER_HF_OWNER', os.environ.get('HF_OWNER', 'GAInTech'))
15
  JOB_NAMESPACE = os.environ.get('FEATHER_HF_JOB_NAMESPACE', HF_OWNER)
16
+ DEFAULT_HYENA_LAYERS = ','.join(str(i) for i in range(int(os.environ.get('HYDRA_N_LAYER', '4'))))
17
  GPU_ARCH_BY_FLAVOR = {
18
  'a10g-small': ('sm_86', '8.6'),
19
  'a10g-large': ('sm_86', '8.6'),
 
113
  print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
114
  print(f'[launch] profile_steps={PROFILE_STEPS} mid_val_interval={MID_VAL_INTERVAL}', flush=True)
115
  print(f'[launch] clean_rebuild={FORCE_CLEAN_REBUILD}', flush=True)
116
+ print(f'[launch] hyena_layers={os.environ.get("HYDRA_HYENA_LAYERS", DEFAULT_HYENA_LAYERS)}', flush=True)
117
  print(f'[launch] flavor={GPU_FLAVOR} profile={GPU_PROFILE} htm_cuda_arch={HTM_CUDA_ARCH} torch_cuda_arch={TORCH_CUDA_ARCH}', flush=True)
118
  print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
119
  if not USE_SPACE_IMAGE:
 
154
  'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
155
  'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
156
  'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
157
+ 'HYDRA_HYENA_LAYERS': os.environ.get('HYDRA_HYENA_LAYERS', DEFAULT_HYENA_LAYERS),
158
  'PYTHONUNBUFFERED': '1',
159
  'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
160
  'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),