Spaces:
Runtime error
Runtime error
Update Feather a10g-large training runtime image
Browse files
__pycache__/entrypoint.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/entrypoint.cpython-312.pyc and b/__pycache__/entrypoint.cpython-312.pyc differ
|
|
|
entrypoint.py
CHANGED
|
@@ -169,6 +169,7 @@ def run_job_mode() -> int:
|
|
| 169 |
os.environ.setdefault('HYDRA_PROFILE_STEPS', '3')
|
| 170 |
os.environ.setdefault('HYDRA_MID_VAL_INTERVAL', '100')
|
| 171 |
os.environ.setdefault('HYDRA_MUON_COMPILE', '0' if os.environ.get('FEATHER_CLEAN_REBUILD', '0') == '1' else '1')
|
|
|
|
| 172 |
os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
|
| 173 |
os.environ.setdefault('FEATHER_GPU_PROFILE', 'a10g-large')
|
| 174 |
os.environ.setdefault('FEATHER_HF_OWNER', HF_OWNER)
|
|
@@ -179,6 +180,7 @@ def run_job_mode() -> int:
|
|
| 179 |
print(f"[job] gpu_profile={os.environ['FEATHER_GPU_PROFILE']} htm_cuda_arch={os.environ['HTM_CUDA_ARCH']} torch_cuda_arch={os.environ['TORCH_CUDA_ARCH_LIST']}", flush=True)
|
| 180 |
print(f"[job] profile_steps={os.environ['HYDRA_PROFILE_STEPS']} mid_val_interval={os.environ['HYDRA_MID_VAL_INTERVAL']}", flush=True)
|
| 181 |
print(f"[job] clean_rebuild={os.environ.get('FEATHER_CLEAN_REBUILD', '0')} muon_compile={os.environ.get('HYDRA_MUON_COMPILE')}", flush=True)
|
|
|
|
| 182 |
|
| 183 |
# CUDA readiness was kicked at module import via _early_cuda_kick. Keep
|
| 184 |
# the wait as a second safety net — no-op if CUDA already ready.
|
|
|
|
| 169 |
os.environ.setdefault('HYDRA_PROFILE_STEPS', '3')
|
| 170 |
os.environ.setdefault('HYDRA_MID_VAL_INTERVAL', '100')
|
| 171 |
os.environ.setdefault('HYDRA_MUON_COMPILE', '0' if os.environ.get('FEATHER_CLEAN_REBUILD', '0') == '1' else '1')
|
| 172 |
+
os.environ.setdefault('HYDRA_HYENA_LAYERS', ','.join(str(i) for i in range(int(os.environ.get('HYDRA_N_LAYER', '4')))))
|
| 173 |
os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
|
| 174 |
os.environ.setdefault('FEATHER_GPU_PROFILE', 'a10g-large')
|
| 175 |
os.environ.setdefault('FEATHER_HF_OWNER', HF_OWNER)
|
|
|
|
| 180 |
print(f"[job] gpu_profile={os.environ['FEATHER_GPU_PROFILE']} htm_cuda_arch={os.environ['HTM_CUDA_ARCH']} torch_cuda_arch={os.environ['TORCH_CUDA_ARCH_LIST']}", flush=True)
|
| 181 |
print(f"[job] profile_steps={os.environ['HYDRA_PROFILE_STEPS']} mid_val_interval={os.environ['HYDRA_MID_VAL_INTERVAL']}", flush=True)
|
| 182 |
print(f"[job] clean_rebuild={os.environ.get('FEATHER_CLEAN_REBUILD', '0')} muon_compile={os.environ.get('HYDRA_MUON_COMPILE')}", flush=True)
|
| 183 |
+
print(f"[job] hyena_layers={os.environ.get('HYDRA_HYENA_LAYERS')}", flush=True)
|
| 184 |
|
| 185 |
# CUDA readiness was kicked at module import via _early_cuda_kick. Keep
|
| 186 |
# the wait as a second safety net — no-op if CUDA already ready.
|
overlay/scripts/__pycache__/launch_feather_hf_job.cpython-312.pyc
CHANGED
|
Binary files a/overlay/scripts/__pycache__/launch_feather_hf_job.cpython-312.pyc and b/overlay/scripts/__pycache__/launch_feather_hf_job.cpython-312.pyc differ
|
|
|
overlay/scripts/launch_feather_hf_job.py
CHANGED
|
@@ -13,6 +13,7 @@ GPU_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-large')
|
|
| 13 |
GPU_PROFILE = os.environ.get('FEATHER_GPU_PROFILE', GPU_FLAVOR)
|
| 14 |
HF_OWNER = os.environ.get('FEATHER_HF_OWNER', os.environ.get('HF_OWNER', 'GAInTech'))
|
| 15 |
JOB_NAMESPACE = os.environ.get('FEATHER_HF_JOB_NAMESPACE', HF_OWNER)
|
|
|
|
| 16 |
GPU_ARCH_BY_FLAVOR = {
|
| 17 |
'a10g-small': ('sm_86', '8.6'),
|
| 18 |
'a10g-large': ('sm_86', '8.6'),
|
|
@@ -112,6 +113,7 @@ def main() -> int:
|
|
| 112 |
print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
|
| 113 |
print(f'[launch] profile_steps={PROFILE_STEPS} mid_val_interval={MID_VAL_INTERVAL}', flush=True)
|
| 114 |
print(f'[launch] clean_rebuild={FORCE_CLEAN_REBUILD}', flush=True)
|
|
|
|
| 115 |
print(f'[launch] flavor={GPU_FLAVOR} profile={GPU_PROFILE} htm_cuda_arch={HTM_CUDA_ARCH} torch_cuda_arch={TORCH_CUDA_ARCH}', flush=True)
|
| 116 |
print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
|
| 117 |
if not USE_SPACE_IMAGE:
|
|
@@ -152,6 +154,7 @@ def main() -> int:
|
|
| 152 |
'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
|
| 153 |
'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
|
| 154 |
'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
|
|
|
|
| 155 |
'PYTHONUNBUFFERED': '1',
|
| 156 |
'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
|
| 157 |
'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
|
|
|
|
| 13 |
GPU_PROFILE = os.environ.get('FEATHER_GPU_PROFILE', GPU_FLAVOR)
|
| 14 |
HF_OWNER = os.environ.get('FEATHER_HF_OWNER', os.environ.get('HF_OWNER', 'GAInTech'))
|
| 15 |
JOB_NAMESPACE = os.environ.get('FEATHER_HF_JOB_NAMESPACE', HF_OWNER)
|
| 16 |
+
DEFAULT_HYENA_LAYERS = ','.join(str(i) for i in range(int(os.environ.get('HYDRA_N_LAYER', '4'))))
|
| 17 |
GPU_ARCH_BY_FLAVOR = {
|
| 18 |
'a10g-small': ('sm_86', '8.6'),
|
| 19 |
'a10g-large': ('sm_86', '8.6'),
|
|
|
|
| 113 |
print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
|
| 114 |
print(f'[launch] profile_steps={PROFILE_STEPS} mid_val_interval={MID_VAL_INTERVAL}', flush=True)
|
| 115 |
print(f'[launch] clean_rebuild={FORCE_CLEAN_REBUILD}', flush=True)
|
| 116 |
+
print(f'[launch] hyena_layers={os.environ.get("HYDRA_HYENA_LAYERS", DEFAULT_HYENA_LAYERS)}', flush=True)
|
| 117 |
print(f'[launch] flavor={GPU_FLAVOR} profile={GPU_PROFILE} htm_cuda_arch={HTM_CUDA_ARCH} torch_cuda_arch={TORCH_CUDA_ARCH}', flush=True)
|
| 118 |
print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
|
| 119 |
if not USE_SPACE_IMAGE:
|
|
|
|
| 154 |
'HYDRA_MID_VAL_INTERVAL': MID_VAL_INTERVAL,
|
| 155 |
'HYDRA_MUON_COMPILE': '0' if FORCE_CLEAN_REBUILD else os.environ.get('HYDRA_MUON_COMPILE', '1'),
|
| 156 |
'HYDRA_FORCE_HTM_CPU': os.environ.get('HYDRA_FORCE_HTM_CPU', '1'),
|
| 157 |
+
'HYDRA_HYENA_LAYERS': os.environ.get('HYDRA_HYENA_LAYERS', DEFAULT_HYENA_LAYERS),
|
| 158 |
'PYTHONUNBUFFERED': '1',
|
| 159 |
'FEATHER_CLEAN_REBUILD': '1' if FORCE_CLEAN_REBUILD else '0',
|
| 160 |
'FEATHER_DISABLE_TRITON_CACHE': '1' if FORCE_CLEAN_REBUILD else os.environ.get('FEATHER_DISABLE_TRITON_CACHE', '0'),
|