Spaces:
Running on Zero
refactor: replace _cpu_ctx with thread-local storage, deduplicate xregen wrappers, parallel downloads, quiet=True
- Replace fragile function-attribute CPU→GPU context passing (_fn._cpu_ctx = {})
with thread-local storage (_tl.<name>_ctx) for thread safety under ZeroGPU
multi-user concurrency — 6 sites updated across generate_* and regen_* paths
- Add _xregen_dispatch() generator helper to deduplicate the pending-yield /
infer / splice-yield skeleton shared by xregen_taro, xregen_mmaudio,
xregen_hunyuan (~40 lines removed)
- Parallelize all 7 startup downloads with ThreadPoolExecutor (I/O-bound network
calls run concurrently, cutting Space cold-start time ~proportionally)
- Consolidate per-model scalar constants into MODEL_CONFIGS as single source of
truth; add _clamp_duration() / _estimate_gpu_duration() / _estimate_regen_duration()
helpers to eliminate repeated duration-clamping boilerplate
- Restore quiet=True in mux_video_audio (was temporarily quiet=False for debugging)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
|
@@ -17,6 +17,7 @@ import tempfile
|
|
| 17 |
import random
|
| 18 |
import threading
|
| 19 |
import time
|
|
|
|
| 20 |
from pathlib import Path
|
| 21 |
|
| 22 |
import torch
|
|
@@ -35,69 +36,102 @@ CKPT_REPO_ID = "JackIsNotInTheBox/Generate_Audio_for_Video_Checkpoints"
|
|
| 35 |
CACHE_DIR = "/tmp/model_ckpts"
|
| 36 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 37 |
|
| 38 |
-
# ----
|
| 39 |
-
print("Downloading TARO checkpoints…")
|
| 40 |
-
cavp_ckpt_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="TARO/cavp_epoch66.ckpt", cache_dir=CACHE_DIR)
|
| 41 |
-
onset_ckpt_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="TARO/onset_model.ckpt", cache_dir=CACHE_DIR)
|
| 42 |
-
taro_ckpt_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="TARO/taro_ckpt.pt", cache_dir=CACHE_DIR)
|
| 43 |
-
print("TARO checkpoints downloaded.")
|
| 44 |
-
|
| 45 |
-
# ---- MMAudio checkpoints (in MMAudio/ subfolder) ----
|
| 46 |
-
# MMAudio normally auto-downloads from its own HF repo, but we
|
| 47 |
-
# override the paths so it pulls from our consolidated repo instead.
|
| 48 |
MMAUDIO_WEIGHTS_DIR = Path(CACHE_DIR) / "MMAudio" / "weights"
|
| 49 |
MMAUDIO_EXT_DIR = Path(CACHE_DIR) / "MMAudio" / "ext_weights"
|
|
|
|
| 50 |
MMAUDIO_WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 51 |
MMAUDIO_EXT_DIR.mkdir(parents=True, exist_ok=True)
|
| 52 |
-
|
| 53 |
-
print("Downloading MMAudio checkpoints…")
|
| 54 |
-
mmaudio_model_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="MMAudio/mmaudio_large_44k_v2.pth", cache_dir=CACHE_DIR, local_dir=str(MMAUDIO_WEIGHTS_DIR), local_dir_use_symlinks=False)
|
| 55 |
-
mmaudio_vae_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="MMAudio/v1-44.pth", cache_dir=CACHE_DIR, local_dir=str(MMAUDIO_EXT_DIR), local_dir_use_symlinks=False)
|
| 56 |
-
mmaudio_synchformer_path = hf_hub_download(repo_id=CKPT_REPO_ID, filename="MMAudio/synchformer_state_dict.pth", cache_dir=CACHE_DIR, local_dir=str(MMAUDIO_EXT_DIR), local_dir_use_symlinks=False)
|
| 57 |
-
print("MMAudio checkpoints downloaded.")
|
| 58 |
-
|
| 59 |
-
# ---- HunyuanVideoFoley checkpoints (in HunyuanFoley/ subfolder) ----
|
| 60 |
-
HUNYUAN_MODEL_DIR = Path(CACHE_DIR) / "HunyuanFoley"
|
| 61 |
HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
print("
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
# ================================================================== #
|
| 98 |
# SHARED CONSTANTS / HELPERS #
|
| 99 |
# ================================================================== #
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
MAX_SLOTS = 8 # max parallel generation slots shown in UI
|
| 102 |
MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
|
| 103 |
|
|
@@ -351,7 +385,7 @@ def mux_video_audio(silent_video: str, audio_path: str, output_path: str,
|
|
| 351 |
pix_fmt="yuv420p",
|
| 352 |
acodec="aac", audio_bitrate="128k",
|
| 353 |
movflags="+faststart",
|
| 354 |
-
).run(overwrite_output=True, quiet=
|
| 355 |
|
| 356 |
|
| 357 |
# ------------------------------------------------------------------ #
|
|
@@ -417,65 +451,76 @@ def _cf_join(a: np.ndarray, b: np.ndarray,
|
|
| 417 |
# latents_scale: [0.18215]*8 — AudioLDM2 VAE scale factor
|
| 418 |
# ================================================================== #
|
| 419 |
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
TARO_TRUNCATE_ONSET = 120
|
| 425 |
-
TARO_MODEL_DUR
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
TARO_LOAD_OVERHEAD = 15 # seconds: model load + CAVP feature extraction
|
| 429 |
-
MMAUDIO_WINDOW = 8.0 # seconds — MMAudio's fixed generation window
|
| 430 |
-
MMAUDIO_SECS_PER_STEP = 0.25 # measured 0.230s/step on H200 (8.3s video, 2 segs × 25 steps = 11.5s wall)
|
| 431 |
-
MMAUDIO_LOAD_OVERHEAD = 30 # 15s warm + 15s model init; open_clip pre-downloaded at startup
|
| 432 |
-
HUNYUAN_MAX_DUR = 15.0 # seconds — HunyuanFoley max video duration
|
| 433 |
-
HUNYUAN_SECS_PER_STEP = 0.35 # measured 0.328s/step on H200 (8.3s video, 1 seg × 50 steps = 16.4s wall)
|
| 434 |
-
HUNYUAN_LOAD_OVERHEAD = 55 # ~55s to load the 10GB XXL model weights into GPU
|
| 435 |
-
GPU_DURATION_CAP = 300 # hard cap per call — never reserve more than this
|
| 436 |
|
| 437 |
-
# ------------------------------------------------------------------ #
|
| 438 |
-
# Model configuration registry — single source of truth for per-model #
|
| 439 |
-
# constants used by duration estimation, segmentation, and UI. #
|
| 440 |
-
# ------------------------------------------------------------------ #
|
| 441 |
MODEL_CONFIGS = {
|
| 442 |
"taro": {
|
| 443 |
-
"window_s": TARO_MODEL_DUR,
|
| 444 |
-
"sr": TARO_SR,
|
| 445 |
-
"secs_per_step":
|
| 446 |
-
"load_overhead":
|
| 447 |
"tab_prefix": "taro",
|
| 448 |
-
"regen_fn": None, # set after function definitions (avoids forward-ref)
|
| 449 |
"label": "TARO",
|
|
|
|
| 450 |
},
|
| 451 |
"mmaudio": {
|
| 452 |
-
"window_s":
|
| 453 |
-
"sr": 48000,
|
| 454 |
-
"secs_per_step":
|
| 455 |
-
"load_overhead":
|
| 456 |
"tab_prefix": "mma",
|
| 457 |
-
"regen_fn": None,
|
| 458 |
"label": "MMAudio",
|
|
|
|
| 459 |
},
|
| 460 |
"hunyuan": {
|
| 461 |
-
"window_s":
|
| 462 |
"sr": 48000,
|
| 463 |
-
"secs_per_step":
|
| 464 |
-
"load_overhead":
|
| 465 |
"tab_prefix": "hf",
|
| 466 |
-
"regen_fn": None,
|
| 467 |
"label": "HunyuanFoley",
|
|
|
|
| 468 |
},
|
| 469 |
}
|
| 470 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
|
| 472 |
def _estimate_gpu_duration(model_key: str, num_samples: int, num_steps: int,
|
| 473 |
total_dur_s: float = None, crossfade_s: float = 0,
|
| 474 |
video_file: str = None) -> int:
|
| 475 |
-
"""
|
| 476 |
|
| 477 |
-
|
| 478 |
-
Clamped to [60, GPU_DURATION_CAP].
|
| 479 |
"""
|
| 480 |
cfg = MODEL_CONFIGS[model_key]
|
| 481 |
try:
|
|
@@ -484,25 +529,18 @@ def _estimate_gpu_duration(model_key: str, num_samples: int, num_steps: int,
|
|
| 484 |
n_segs = len(_build_segments(total_dur_s, cfg["window_s"], float(crossfade_s)))
|
| 485 |
except Exception:
|
| 486 |
n_segs = 1
|
| 487 |
-
secs
|
| 488 |
-
result = min(GPU_DURATION_CAP, max(60, int(secs)))
|
| 489 |
print(f"[duration] {cfg['label']}: {int(num_samples)}samp × {n_segs}seg × "
|
| 490 |
-
f"{int(num_steps)}steps → {secs:.0f}s → capped
|
| 491 |
-
return
|
| 492 |
|
| 493 |
|
| 494 |
def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
|
| 495 |
-
"""
|
| 496 |
-
|
| 497 |
-
Floor is 20s — enough headroom above the 10s ZeroGPU abort threshold
|
| 498 |
-
for any model on a warm worker. Cold-start spin-up happens *before*
|
| 499 |
-
the timer starts so raising the floor does not help with cold-start aborts.
|
| 500 |
-
"""
|
| 501 |
cfg = MODEL_CONFIGS[model_key]
|
| 502 |
secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
return result
|
| 506 |
|
| 507 |
_TARO_CACHE_MAXLEN = 16 # evict oldest entries beyond this limit
|
| 508 |
_TARO_INFERENCE_CACHE: dict = {} # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
|
|
@@ -750,8 +788,8 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 750 |
from TARO.onset_util import extract_onset
|
| 751 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 752 |
|
| 753 |
-
# Use pre-computed CPU results
|
| 754 |
-
ctx =
|
| 755 |
tmp_dir = ctx["tmp_dir"]
|
| 756 |
silent_video = ctx["silent_video"]
|
| 757 |
segments = ctx["segments"]
|
|
@@ -810,9 +848,6 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 810 |
|
| 811 |
return results
|
| 812 |
|
| 813 |
-
# Attach a context slot for the CPU wrapper to pass pre-computed data
|
| 814 |
-
_taro_gpu_infer._cpu_ctx = {}
|
| 815 |
-
|
| 816 |
|
| 817 |
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 818 |
crossfade_s, crossfade_db, num_samples):
|
|
@@ -826,8 +861,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 826 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 827 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
| 828 |
|
| 829 |
-
# Pass pre-computed CPU results to the GPU function via
|
| 830 |
-
|
| 831 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
| 832 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 833 |
}
|
|
@@ -906,7 +941,7 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 906 |
|
| 907 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 908 |
|
| 909 |
-
ctx =
|
| 910 |
segments = ctx["segments"]
|
| 911 |
seg_clip_paths = ctx["seg_clip_paths"]
|
| 912 |
|
|
@@ -966,8 +1001,6 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 966 |
|
| 967 |
return results
|
| 968 |
|
| 969 |
-
_mmaudio_gpu_infer._cpu_ctx = {}
|
| 970 |
-
|
| 971 |
|
| 972 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 973 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
|
|
@@ -987,7 +1020,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 987 |
for i, (s, e) in enumerate(segments)
|
| 988 |
]
|
| 989 |
|
| 990 |
-
|
| 991 |
"segments": segments, "seg_clip_paths": seg_clip_paths,
|
| 992 |
}
|
| 993 |
|
|
@@ -1057,7 +1090,7 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1057 |
|
| 1058 |
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1059 |
|
| 1060 |
-
ctx =
|
| 1061 |
segments = ctx["segments"]
|
| 1062 |
total_dur_s = ctx["total_dur_s"]
|
| 1063 |
dummy_seg_path = ctx["dummy_seg_path"]
|
|
@@ -1115,8 +1148,6 @@ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 1115 |
|
| 1116 |
return results
|
| 1117 |
|
| 1118 |
-
_hunyuan_gpu_infer._cpu_ctx = {}
|
| 1119 |
-
|
| 1120 |
|
| 1121 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1122 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
|
@@ -1143,7 +1174,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1143 |
for i, (s, e) in enumerate(segments)
|
| 1144 |
]
|
| 1145 |
|
| 1146 |
-
|
| 1147 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 1148 |
"dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
|
| 1149 |
}
|
|
@@ -1182,7 +1213,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1182 |
|
| 1183 |
def _preload_taro_regen_ctx(meta: dict) -> dict:
|
| 1184 |
"""Pre-load TARO CAVP/onset features on CPU for regen.
|
| 1185 |
-
Returns a dict
|
| 1186 |
cavp_path = meta.get("cavp_path", "")
|
| 1187 |
onset_path = meta.get("onset_path", "")
|
| 1188 |
ctx = {}
|
|
@@ -1194,7 +1225,7 @@ def _preload_taro_regen_ctx(meta: dict) -> dict:
|
|
| 1194 |
|
| 1195 |
def _preload_hunyuan_regen_ctx(meta: dict, seg_path: str) -> dict:
|
| 1196 |
"""Pre-load HunyuanFoley text features + segment path on CPU for regen.
|
| 1197 |
-
Returns a dict
|
| 1198 |
ctx = {"seg_path": seg_path}
|
| 1199 |
text_feats_path = meta.get("text_feats_path", "")
|
| 1200 |
if text_feats_path and os.path.exists(text_feats_path):
|
|
@@ -1285,7 +1316,7 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1285 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 1286 |
|
| 1287 |
# Use pre-loaded features from CPU wrapper (avoids np.load inside GPU window)
|
| 1288 |
-
ctx =
|
| 1289 |
if "cavp" in ctx and "onset" in ctx:
|
| 1290 |
print("[TARO regen] Using pre-loaded CAVP + onset features (CPU cache hit)")
|
| 1291 |
cavp_feats = ctx["cavp"]
|
|
@@ -1323,7 +1354,7 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1323 |
seg_idx = int(seg_idx)
|
| 1324 |
|
| 1325 |
# CPU: pre-load cached features so np.load doesn't happen inside GPU window
|
| 1326 |
-
|
| 1327 |
|
| 1328 |
# GPU: inference only
|
| 1329 |
new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
@@ -1365,7 +1396,7 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1365 |
sr = seq_cfg.sampling_rate
|
| 1366 |
|
| 1367 |
# Use pre-extracted segment clip from the CPU wrapper
|
| 1368 |
-
seg_path =
|
| 1369 |
assert seg_path, "[MMAudio regen] seg_path not set — wrapper must pre-extract segment clip"
|
| 1370 |
|
| 1371 |
rng = torch.Generator(device=device)
|
|
@@ -1391,8 +1422,6 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1391 |
new_wav = new_wav[:, :seg_samples]
|
| 1392 |
return new_wav, sr
|
| 1393 |
|
| 1394 |
-
_regen_mmaudio_gpu._cpu_ctx = {}
|
| 1395 |
-
|
| 1396 |
|
| 1397 |
def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
|
| 1398 |
prompt, negative_prompt, seed_val,
|
|
@@ -1409,7 +1438,7 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1409 |
meta["silent_video"], seg_start, seg_dur,
|
| 1410 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1411 |
)
|
| 1412 |
-
|
| 1413 |
|
| 1414 |
# GPU: inference only
|
| 1415 |
new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
@@ -1458,12 +1487,11 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1458 |
|
| 1459 |
set_global_seed(random.randint(0, 2**32 - 1))
|
| 1460 |
|
| 1461 |
-
# Use pre-extracted segment clip from wrapper
|
| 1462 |
-
|
|
|
|
| 1463 |
assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
|
| 1464 |
|
| 1465 |
-
# Use pre-loaded text_feats from CPU wrapper (avoids torch.load inside GPU window)
|
| 1466 |
-
ctx = _regen_hunyuan_gpu._cpu_ctx
|
| 1467 |
if "text_feats" in ctx:
|
| 1468 |
print("[HunyuanFoley regen] Using pre-loaded text features (CPU cache hit)")
|
| 1469 |
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
|
@@ -1486,8 +1514,6 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1486 |
new_wav = new_wav[:, :seg_samples]
|
| 1487 |
return new_wav, sr
|
| 1488 |
|
| 1489 |
-
_regen_hunyuan_gpu._cpu_ctx = {}
|
| 1490 |
-
|
| 1491 |
|
| 1492 |
def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
| 1493 |
prompt, negative_prompt, seed_val,
|
|
@@ -1505,7 +1531,7 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1505 |
meta["silent_video"], seg_start, seg_dur,
|
| 1506 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1507 |
)
|
| 1508 |
-
|
| 1509 |
|
| 1510 |
# GPU: inference only
|
| 1511 |
new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
@@ -1575,28 +1601,43 @@ def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
|
|
| 1575 |
return video_path, waveform_html
|
| 1576 |
|
| 1577 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1578 |
def xregen_taro(seg_idx, state_json, slot_id,
|
| 1579 |
seed_val, cfg_scale, num_steps, mode,
|
| 1580 |
crossfade_s, crossfade_db,
|
| 1581 |
request: gr.Request = None):
|
| 1582 |
"""Cross-model regen: run TARO inference and splice into *slot_id*."""
|
| 1583 |
-
meta = json.loads(state_json)
|
| 1584 |
seg_idx = int(seg_idx)
|
|
|
|
| 1585 |
|
| 1586 |
-
|
| 1587 |
-
|
| 1588 |
-
|
| 1589 |
-
|
| 1590 |
-
|
| 1591 |
-
|
| 1592 |
|
| 1593 |
-
|
| 1594 |
-
seed_val, cfg_scale, num_steps, mode,
|
| 1595 |
-
crossfade_s, crossfade_db, slot_id)
|
| 1596 |
-
# Upsample 16kHz → 48kHz (sinc, CPU)
|
| 1597 |
-
new_wav_raw = _upsample_taro(new_wav_raw)
|
| 1598 |
-
video_path, waveform_html = _xregen_splice(new_wav_raw, TARO_SR_OUT, meta, seg_idx, slot_id)
|
| 1599 |
-
yield gr.update(value=video_path), gr.update(value=waveform_html)
|
| 1600 |
|
| 1601 |
|
| 1602 |
def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
@@ -1604,26 +1645,23 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
| 1604 |
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1605 |
request: gr.Request = None):
|
| 1606 |
"""Cross-model regen: run MMAudio inference and splice into *slot_id*."""
|
| 1607 |
-
meta = json.loads(state_json)
|
| 1608 |
seg_idx = int(seg_idx)
|
|
|
|
| 1609 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1610 |
|
| 1611 |
-
|
| 1612 |
-
|
| 1613 |
-
|
| 1614 |
-
|
| 1615 |
-
|
| 1616 |
-
|
| 1617 |
-
|
| 1618 |
-
|
| 1619 |
-
|
|
|
|
|
|
|
| 1620 |
|
| 1621 |
-
|
| 1622 |
-
prompt, negative_prompt, seed_val,
|
| 1623 |
-
cfg_strength, num_steps,
|
| 1624 |
-
crossfade_s, crossfade_db, slot_id)
|
| 1625 |
-
video_path, waveform_html = _xregen_splice(new_wav_raw, src_sr, meta, seg_idx, slot_id)
|
| 1626 |
-
yield gr.update(value=video_path), gr.update(value=waveform_html)
|
| 1627 |
|
| 1628 |
|
| 1629 |
def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
@@ -1632,26 +1670,23 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
| 1632 |
crossfade_s, crossfade_db,
|
| 1633 |
request: gr.Request = None):
|
| 1634 |
"""Cross-model regen: run HunyuanFoley inference and splice into *slot_id*."""
|
| 1635 |
-
meta = json.loads(state_json)
|
| 1636 |
seg_idx = int(seg_idx)
|
|
|
|
| 1637 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1638 |
|
| 1639 |
-
|
| 1640 |
-
|
| 1641 |
-
|
| 1642 |
-
|
| 1643 |
-
|
| 1644 |
-
|
| 1645 |
-
|
| 1646 |
-
|
| 1647 |
-
|
| 1648 |
-
|
| 1649 |
-
|
| 1650 |
-
|
| 1651 |
-
|
| 1652 |
-
crossfade_s, crossfade_db, slot_id)
|
| 1653 |
-
video_path, waveform_html = _xregen_splice(new_wav_raw, src_sr, meta, seg_idx, slot_id)
|
| 1654 |
-
yield gr.update(value=video_path), gr.update(value=waveform_html)
|
| 1655 |
|
| 1656 |
|
| 1657 |
# ================================================================== #
|
|
|
|
| 17 |
import random
|
| 18 |
import threading
|
| 19 |
import time
|
| 20 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 21 |
from pathlib import Path
|
| 22 |
|
| 23 |
import torch
|
|
|
|
| 36 |
CACHE_DIR = "/tmp/model_ckpts"
|
| 37 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 38 |
|
| 39 |
+
# ---- Local directories that must exist before parallel downloads start ----
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
MMAUDIO_WEIGHTS_DIR = Path(CACHE_DIR) / "MMAudio" / "weights"
|
| 41 |
MMAUDIO_EXT_DIR = Path(CACHE_DIR) / "MMAudio" / "ext_weights"
|
| 42 |
+
HUNYUAN_MODEL_DIR = Path(CACHE_DIR) / "HunyuanFoley"
|
| 43 |
MMAUDIO_WEIGHTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 44 |
MMAUDIO_EXT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
HUNYUAN_MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
| 46 |
|
| 47 |
+
# ------------------------------------------------------------------ #
|
| 48 |
+
# Parallel checkpoint + model downloads #
|
| 49 |
+
# All downloads are I/O-bound (network), so running them in threads #
|
| 50 |
+
# cuts Space cold-start time roughly proportional to the number of #
|
| 51 |
+
# independent groups (previously sequential, now concurrent). #
|
| 52 |
+
# hf_hub_download / snapshot_download are thread-safe. #
|
| 53 |
+
# ------------------------------------------------------------------ #
|
| 54 |
+
|
| 55 |
+
def _dl_taro():
    """Fetch the three TARO checkpoint files from the consolidated repo.

    Returns the local paths as a tuple: (cavp_ckpt, onset_ckpt, taro_ckpt).
    """
    paths = tuple(
        hf_hub_download(repo_id=CKPT_REPO_ID, filename=f"TARO/{fname}", cache_dir=CACHE_DIR)
        for fname in ("cavp_epoch66.ckpt", "onset_model.ckpt", "taro_ckpt.pt")
    )
    print("TARO checkpoints downloaded.")
    return paths
|
| 62 |
+
|
| 63 |
+
def _dl_mmaudio():
    """Download MMAudio .pth files and return their local paths.

    Returns:
        (model_path, vae_path, synchformer_path) — local filesystem paths.
    """
    def _fetch(filename, local_dir):
        # One place for the shared kwargs instead of repeating them three times.
        # NOTE(review): local_dir_use_symlinks is deprecated (and ignored) in
        # recent huggingface_hub releases; kept for older-version compatibility.
        return hf_hub_download(repo_id=CKPT_REPO_ID, filename=filename,
                               cache_dir=CACHE_DIR, local_dir=str(local_dir),
                               local_dir_use_symlinks=False)

    m = _fetch("MMAudio/mmaudio_large_44k_v2.pth", MMAUDIO_WEIGHTS_DIR)
    v = _fetch("MMAudio/v1-44.pth", MMAUDIO_EXT_DIR)
    s = _fetch("MMAudio/synchformer_state_dict.pth", MMAUDIO_EXT_DIR)
    print("MMAudio checkpoints downloaded.")
    return m, v, s
|
| 73 |
+
|
| 74 |
+
def _dl_hunyuan():
    """Download the HunyuanVideoFoley .pth files into HUNYUAN_MODEL_DIR.

    Paths are not returned — callers locate the files via HUNYUAN_MODEL_DIR.
    """
    for filename in (
        "HunyuanVideo-Foley/hunyuanvideo_foley.pth",
        "HunyuanVideo-Foley/vae_128d_48k.pth",
        "HunyuanVideo-Foley/synchformer_state_dict.pth",
    ):
        # NOTE(review): local_dir_use_symlinks is deprecated (and ignored) in
        # recent huggingface_hub releases; kept for older-version compatibility.
        hf_hub_download(repo_id=CKPT_REPO_ID, filename=filename,
                        cache_dir=CACHE_DIR, local_dir=str(HUNYUAN_MODEL_DIR),
                        local_dir_use_symlinks=False)
    print("HunyuanVideoFoley checkpoints downloaded.")
|
| 83 |
+
|
| 84 |
+
def _dl_clap():
    """Warm the local HF cache with CLAP so from_pretrained() inside the
    ZeroGPU worker resolves to already-downloaded files."""
    repo = "laion/larger_clap_general"
    snapshot_download(repo_id=repo)
    print("CLAP model pre-downloaded.")
|
| 88 |
+
|
| 89 |
+
def _dl_clip():
    """Fetch MMAudio's CLIP backbone (~3.95 GB) at startup so the download
    does not eat into the GPU-window time budget later."""
    repo = "apple/DFN5B-CLIP-ViT-H-14-384"
    snapshot_download(repo_id=repo)
    print("MMAudio CLIP model pre-downloaded.")
|
| 93 |
+
|
| 94 |
+
def _dl_audioldm2():
    """Prefetch the AudioLDM2 VAE/vocoder snapshot that TARO's
    from_pretrained() calls resolve against."""
    repo = "cvssp/audioldm2"
    snapshot_download(repo_id=repo)
    print("AudioLDM2 pre-downloaded.")
|
| 98 |
+
|
| 99 |
+
def _dl_bigvgan():
    """Prefetch the BigVGAN vocoder snapshot (~489 MB) used by MMAudio."""
    repo = "nvidia/bigvgan_v2_44khz_128band_512x"
    snapshot_download(repo_id=repo)
    print("BigVGAN vocoder pre-downloaded.")
|
| 103 |
+
|
| 104 |
+
print("[startup] Starting parallel checkpoint + model downloads…")
_t_dl_start = time.perf_counter()
# All seven download groups are network-bound, so run them concurrently.
_download_fns = {
    "taro": _dl_taro,
    "mmaudio": _dl_mmaudio,
    "hunyuan": _dl_hunyuan,
    "clap": _dl_clap,
    "clip": _dl_clip,
    "audioldm2": _dl_audioldm2,
    "bigvgan": _dl_bigvgan,
}
with ThreadPoolExecutor(max_workers=7) as _pool:
    _futs = {name: _pool.submit(fn) for name, fn in _download_fns.items()}
    # Surface the first download failure as soon as it happens.
    for _fut in as_completed(_futs.values()):
        _fut.result()

# Unpack the paths the rest of the module references by name.
cavp_ckpt_path, onset_ckpt_path, taro_ckpt_path = _futs["taro"].result()
mmaudio_model_path, mmaudio_vae_path, mmaudio_synchformer_path = _futs["mmaudio"].result()
print(f"[startup] All downloads done in {time.perf_counter() - _t_dl_start:.1f}s")
|
| 122 |
|
| 123 |
# ================================================================== #
|
| 124 |
# SHARED CONSTANTS / HELPERS #
|
| 125 |
# ================================================================== #
|
| 126 |
|
| 127 |
+
# Thread-local storage for CPU → GPU context passing.
|
| 128 |
+
# Replaces the fragile function-attribute pattern (_fn._cpu_ctx = {...}).
|
| 129 |
+
# Each wrapper writes its context under a unique key before calling the
|
| 130 |
+
# @spaces.GPU function; the GPU function reads it back. Using thread-local
|
| 131 |
+
# storage means concurrent requests on different threads don't clobber
|
| 132 |
+
# each other's context — the function-attribute approach was not thread-safe.
|
| 133 |
+
_tl = threading.local()
|
| 134 |
+
|
| 135 |
MAX_SLOTS = 8 # max parallel generation slots shown in UI
|
| 136 |
MAX_SEGS = 8 # max segments per slot (same as MAX_SLOTS; video ≤ ~64 s at 8 s/seg)
|
| 137 |
|
|
|
|
| 385 |
pix_fmt="yuv420p",
|
| 386 |
acodec="aac", audio_bitrate="128k",
|
| 387 |
movflags="+faststart",
|
| 388 |
+
).run(overwrite_output=True, quiet=True)
|
| 389 |
|
| 390 |
|
| 391 |
# ------------------------------------------------------------------ #
|
|
|
|
| 451 |
# latents_scale: [0.18215]*8 — AudioLDM2 VAE scale factor
|
| 452 |
# ================================================================== #
|
| 453 |
|
| 454 |
+
# ================================================================== #
|
| 455 |
+
# MODEL CONSTANTS & CONFIGURATION REGISTRY #
|
| 456 |
+
# ================================================================== #
|
| 457 |
+
# All per-model numeric constants live here — MODEL_CONFIGS is the #
|
| 458 |
+
# single source of truth consumed by duration estimation, segmentation,#
|
| 459 |
+
# and the UI. Standalone names kept only where other code references #
|
| 460 |
+
# them by name (TARO geometry, TARGET_SR, GPU_DURATION_CAP). #
|
| 461 |
+
# ================================================================== #
|
| 462 |
+
|
| 463 |
+
# TARO geometry — referenced directly in _taro_infer_segment
|
| 464 |
+
TARO_SR = 16000
|
| 465 |
+
TARO_TRUNCATE = 131072
|
| 466 |
+
TARO_FPS = 4
|
| 467 |
+
TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
|
| 468 |
TARO_TRUNCATE_ONSET = 120
|
| 469 |
+
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 470 |
+
|
| 471 |
+
GPU_DURATION_CAP = 300 # hard cap per @spaces.GPU call — never reserve more than this
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
MODEL_CONFIGS = {
|
| 474 |
"taro": {
|
| 475 |
+
"window_s": TARO_MODEL_DUR, # 8.192 s
|
| 476 |
+
"sr": TARO_SR, # 16000 (output resampled to TARGET_SR)
|
| 477 |
+
"secs_per_step": 0.025, # measured 0.023 s/step on H200
|
| 478 |
+
"load_overhead": 15, # model load + CAVP feature extraction
|
| 479 |
"tab_prefix": "taro",
|
|
|
|
| 480 |
"label": "TARO",
|
| 481 |
+
"regen_fn": None, # set after function definitions (avoids forward-ref)
|
| 482 |
},
|
| 483 |
"mmaudio": {
|
| 484 |
+
"window_s": 8.0, # MMAudio's fixed generation window
|
| 485 |
+
"sr": 48000, # resampled from 44100 in post-processing
|
| 486 |
+
"secs_per_step": 0.25, # measured 0.230 s/step on H200
|
| 487 |
+
"load_overhead": 30, # 15s warm + 15s model init
|
| 488 |
"tab_prefix": "mma",
|
|
|
|
| 489 |
"label": "MMAudio",
|
| 490 |
+
"regen_fn": None,
|
| 491 |
},
|
| 492 |
"hunyuan": {
|
| 493 |
+
"window_s": 15.0, # HunyuanFoley max video duration
|
| 494 |
"sr": 48000,
|
| 495 |
+
"secs_per_step": 0.35, # measured 0.328 s/step on H200
|
| 496 |
+
"load_overhead": 55, # ~55s to load the 10 GB XXL weights
|
| 497 |
"tab_prefix": "hf",
|
|
|
|
| 498 |
"label": "HunyuanFoley",
|
| 499 |
+
"regen_fn": None,
|
| 500 |
},
|
| 501 |
}
|
| 502 |
|
| 503 |
+
# Convenience aliases used only in the TARO inference path
|
| 504 |
+
TARO_SECS_PER_STEP = MODEL_CONFIGS["taro"]["secs_per_step"]
|
| 505 |
+
MMAUDIO_WINDOW = MODEL_CONFIGS["mmaudio"]["window_s"]
|
| 506 |
+
MMAUDIO_SECS_PER_STEP = MODEL_CONFIGS["mmaudio"]["secs_per_step"]
|
| 507 |
+
HUNYUAN_MAX_DUR = MODEL_CONFIGS["hunyuan"]["window_s"]
|
| 508 |
+
HUNYUAN_SECS_PER_STEP = MODEL_CONFIGS["hunyuan"]["secs_per_step"]
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def _clamp_duration(secs: float, label: str) -> int:
    """Clamp a raw GPU-seconds estimate to [60, GPU_DURATION_CAP] and log it."""
    clamped = int(secs)
    if clamped < 60:
        clamped = 60          # floor: never reserve less than a minute
    if clamped > GPU_DURATION_CAP:
        clamped = GPU_DURATION_CAP  # hard cap per @spaces.GPU call
    print(f"[duration] {label}: {secs:.0f}s raw → {clamped}s reserved")
    return clamped
|
| 516 |
+
|
| 517 |
|
| 518 |
def _estimate_gpu_duration(model_key: str, num_samples: int, num_steps: int,
|
| 519 |
total_dur_s: float = None, crossfade_s: float = 0,
|
| 520 |
video_file: str = None) -> int:
|
| 521 |
+
"""Estimate GPU seconds for a full generation call.
|
| 522 |
|
| 523 |
+
Formula: num_samples × n_segs × num_steps × secs_per_step + load_overhead
|
|
|
|
| 524 |
"""
|
| 525 |
cfg = MODEL_CONFIGS[model_key]
|
| 526 |
try:
|
|
|
|
| 529 |
n_segs = len(_build_segments(total_dur_s, cfg["window_s"], float(crossfade_s)))
|
| 530 |
except Exception:
|
| 531 |
n_segs = 1
|
| 532 |
+
secs = int(num_samples) * n_segs * int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
|
|
|
|
| 533 |
print(f"[duration] {cfg['label']}: {int(num_samples)}samp × {n_segs}seg × "
|
| 534 |
+
f"{int(num_steps)}steps → {secs:.0f}s → capped ", end="")
|
| 535 |
+
return _clamp_duration(secs, cfg["label"])
|
| 536 |
|
| 537 |
|
| 538 |
def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
    """Estimate GPU seconds for a single-segment regen call.

    Args:
        model_key: key into MODEL_CONFIGS ("taro" | "mmaudio" | "hunyuan").
        num_steps: sampler step count for the one regenerated segment.

    Returns:
        Reservation in whole seconds, clamped by _clamp_duration.
    """
    cfg = MODEL_CONFIGS[model_key]
    steps = int(num_steps)
    # One segment: steps × per-step cost + fixed model-load overhead.
    secs = steps * cfg["secs_per_step"] + cfg["load_overhead"]
    # Fold the step count into the label and let _clamp_duration emit the single
    # log line. The previous print(..., end="") here produced a doubled
    # "[duration]" prefix on one line and interleaved badly across threads.
    return _clamp_duration(secs, f"{cfg['label']} regen ({steps} steps)")
|
|
|
|
| 544 |
|
| 545 |
_TARO_CACHE_MAXLEN = 16 # evict oldest entries beyond this limit
|
| 546 |
_TARO_INFERENCE_CACHE: dict = {} # keyed by (video_file, seed, cfg, steps, mode, crossfade_s)
|
|
|
|
| 788 |
from TARO.onset_util import extract_onset
|
| 789 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 790 |
|
| 791 |
+
# Use pre-computed CPU results passed via thread-local storage
|
| 792 |
+
ctx = _tl.taro_gen_ctx
|
| 793 |
tmp_dir = ctx["tmp_dir"]
|
| 794 |
silent_video = ctx["silent_video"]
|
| 795 |
segments = ctx["segments"]
|
|
|
|
| 848 |
|
| 849 |
return results
|
| 850 |
|
|
|
|
|
|
|
|
|
|
| 851 |
|
| 852 |
def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
| 853 |
crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 861 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 862 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
| 863 |
|
| 864 |
+
# Pass pre-computed CPU results to the GPU function via thread-local storage
|
| 865 |
+
_tl.taro_gen_ctx = {
|
| 866 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
| 867 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 868 |
}
|
|
|
|
| 941 |
|
| 942 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 943 |
|
| 944 |
+
ctx = _tl.mmaudio_gen_ctx
|
| 945 |
segments = ctx["segments"]
|
| 946 |
seg_clip_paths = ctx["seg_clip_paths"]
|
| 947 |
|
|
|
|
| 1001 |
|
| 1002 |
return results
|
| 1003 |
|
|
|
|
|
|
|
| 1004 |
|
| 1005 |
def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
| 1006 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 1020 |
for i, (s, e) in enumerate(segments)
|
| 1021 |
]
|
| 1022 |
|
| 1023 |
+
_tl.mmaudio_gen_ctx = {
|
| 1024 |
"segments": segments, "seg_clip_paths": seg_clip_paths,
|
| 1025 |
}
|
| 1026 |
|
|
|
|
| 1090 |
|
| 1091 |
model_dict, cfg = _load_hunyuan_model(device, model_size)
|
| 1092 |
|
| 1093 |
+
ctx = _tl.hunyuan_gen_ctx
|
| 1094 |
segments = ctx["segments"]
|
| 1095 |
total_dur_s = ctx["total_dur_s"]
|
| 1096 |
dummy_seg_path = ctx["dummy_seg_path"]
|
|
|
|
| 1148 |
|
| 1149 |
return results
|
| 1150 |
|
|
|
|
|
|
|
| 1151 |
|
| 1152 |
def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
| 1153 |
guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
|
|
|
|
| 1174 |
for i, (s, e) in enumerate(segments)
|
| 1175 |
]
|
| 1176 |
|
| 1177 |
+
_tl.hunyuan_gen_ctx = {
|
| 1178 |
"segments": segments, "total_dur_s": total_dur_s,
|
| 1179 |
"dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
|
| 1180 |
}
|
|
|
|
| 1213 |
|
| 1214 |
def _preload_taro_regen_ctx(meta: dict) -> dict:
|
| 1215 |
"""Pre-load TARO CAVP/onset features on CPU for regen.
|
| 1216 |
+
Returns a dict for _tl.taro_regen_ctx (thread-local storage)."""
|
| 1217 |
cavp_path = meta.get("cavp_path", "")
|
| 1218 |
onset_path = meta.get("onset_path", "")
|
| 1219 |
ctx = {}
|
|
|
|
| 1225 |
|
| 1226 |
def _preload_hunyuan_regen_ctx(meta: dict, seg_path: str) -> dict:
|
| 1227 |
"""Pre-load HunyuanFoley text features + segment path on CPU for regen.
|
| 1228 |
+
Returns a dict for _tl.hunyuan_regen_ctx (thread-local storage)."""
|
| 1229 |
ctx = {"seg_path": seg_path}
|
| 1230 |
text_feats_path = meta.get("text_feats_path", "")
|
| 1231 |
if text_feats_path and os.path.exists(text_feats_path):
|
|
|
|
| 1316 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 1317 |
|
| 1318 |
# Use pre-loaded features from CPU wrapper (avoids np.load inside GPU window)
|
| 1319 |
+
ctx = getattr(_tl, "taro_regen_ctx", {})
|
| 1320 |
if "cavp" in ctx and "onset" in ctx:
|
| 1321 |
print("[TARO regen] Using pre-loaded CAVP + onset features (CPU cache hit)")
|
| 1322 |
cavp_feats = ctx["cavp"]
|
|
|
|
| 1354 |
seg_idx = int(seg_idx)
|
| 1355 |
|
| 1356 |
# CPU: pre-load cached features so np.load doesn't happen inside GPU window
|
| 1357 |
+
_tl.taro_regen_ctx = _preload_taro_regen_ctx(meta)
|
| 1358 |
|
| 1359 |
# GPU: inference only
|
| 1360 |
new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
|
|
| 1396 |
sr = seq_cfg.sampling_rate
|
| 1397 |
|
| 1398 |
# Use pre-extracted segment clip from the CPU wrapper
|
| 1399 |
+
seg_path = getattr(_tl, "mmaudio_regen_ctx", {}).get("seg_path")
|
| 1400 |
assert seg_path, "[MMAudio regen] seg_path not set — wrapper must pre-extract segment clip"
|
| 1401 |
|
| 1402 |
rng = torch.Generator(device=device)
|
|
|
|
| 1422 |
new_wav = new_wav[:, :seg_samples]
|
| 1423 |
return new_wav, sr
|
| 1424 |
|
|
|
|
|
|
|
| 1425 |
|
| 1426 |
def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
|
| 1427 |
prompt, negative_prompt, seed_val,
|
|
|
|
| 1438 |
meta["silent_video"], seg_start, seg_dur,
|
| 1439 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1440 |
)
|
| 1441 |
+
_tl.mmaudio_regen_ctx = {"seg_path": seg_path}
|
| 1442 |
|
| 1443 |
# GPU: inference only
|
| 1444 |
new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
|
|
| 1487 |
|
| 1488 |
set_global_seed(random.randint(0, 2**32 - 1))
|
| 1489 |
|
| 1490 |
+
# Use pre-extracted segment clip + text_feats from CPU wrapper
|
| 1491 |
+
ctx = getattr(_tl, "hunyuan_regen_ctx", {})
|
| 1492 |
+
seg_path = ctx.get("seg_path")
|
| 1493 |
assert seg_path, "[HunyuanFoley regen] seg_path not set — wrapper must pre-extract segment clip"
|
| 1494 |
|
|
|
|
|
|
|
| 1495 |
if "text_feats" in ctx:
|
| 1496 |
print("[HunyuanFoley regen] Using pre-loaded text features (CPU cache hit)")
|
| 1497 |
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
|
|
|
| 1514 |
new_wav = new_wav[:, :seg_samples]
|
| 1515 |
return new_wav, sr
|
| 1516 |
|
|
|
|
|
|
|
| 1517 |
|
| 1518 |
def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
| 1519 |
prompt, negative_prompt, seed_val,
|
|
|
|
| 1531 |
meta["silent_video"], seg_start, seg_dur,
|
| 1532 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1533 |
)
|
| 1534 |
+
_tl.hunyuan_regen_ctx = _preload_hunyuan_regen_ctx(meta, seg_path)
|
| 1535 |
|
| 1536 |
# GPU: inference only
|
| 1537 |
new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
|
|
| 1601 |
return video_path, waveform_html
|
| 1602 |
|
| 1603 |
|
| 1604 |
+
def _xregen_dispatch(state_json: str, seg_idx: int, slot_id: str, infer_fn):
    """Common generator scaffold shared by every ``xregen_*`` wrapper.

    Immediately yields a "pending" UI update, then invokes *infer_fn* — a
    no-argument callable encapsulating the model-specific CPU prep plus GPU
    inference — which must return ``(wav_array, src_sr)``.  TARO wrappers
    are expected to hand back audio already upsampled to 48 kHz and pass
    TARO_SR_OUT as the source rate.

    Yields:
        ``(gr.update(), gr.update(value=pending_html))`` first, while the
        GPU call is in flight; then
        ``(gr.update(value=video_path), gr.update(value=waveform_html))``
        once the new take has been spliced into *slot_id*.
    """
    state = json.loads(state_json)
    placeholder = _build_regen_pending_html(state["segments"], seg_idx, slot_id, "")
    # Step 1: surface the pending state so the user sees progress right away.
    yield gr.update(), gr.update(value=placeholder)

    # Step 2: run the model-specific inference, then splice the new audio in.
    wav, src_sr = infer_fn()
    video_path, waveform_html = _xregen_splice(wav, src_sr, state, seg_idx, slot_id)
    yield gr.update(value=video_path), gr.update(value=waveform_html)
|
| 1623 |
+
|
| 1624 |
+
|
| 1625 |
def xregen_taro(seg_idx, state_json, slot_id,
                seed_val, cfg_scale, num_steps, mode,
                crossfade_s, crossfade_db,
                request: gr.Request = None):
    """Cross-model regen: generate a TARO take for one segment and splice
    the result into *slot_id*."""
    seg_idx = int(seg_idx)
    parsed_meta = json.loads(state_json)

    def _infer():
        # Hand pre-computed CPU features to the GPU function through
        # thread-local storage, then run inference for this segment only.
        _tl.taro_regen_ctx = _preload_taro_regen_ctx(parsed_meta)
        raw_wav = _regen_taro_gpu(None, seg_idx, state_json,
                                  seed_val, cfg_scale, num_steps, mode,
                                  crossfade_s, crossfade_db, slot_id)
        # TARO outputs 16 kHz audio; upsample on CPU to 48 kHz for splicing.
        return _upsample_taro(raw_wav), TARO_SR_OUT

    yield from _xregen_dispatch(state_json, seg_idx, slot_id, _infer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1641 |
|
| 1642 |
|
| 1643 |
def xregen_mmaudio(seg_idx, state_json, slot_id,
|
|
|
|
| 1645 |
cfg_strength, num_steps, crossfade_s, crossfade_db,
|
| 1646 |
request: gr.Request = None):
|
| 1647 |
"""Cross-model regen: run MMAudio inference and splice into *slot_id*."""
|
|
|
|
| 1648 |
seg_idx = int(seg_idx)
|
| 1649 |
+
meta = json.loads(state_json)
|
| 1650 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1651 |
|
| 1652 |
+
def _run():
|
| 1653 |
+
seg_path = _extract_segment_clip(
|
| 1654 |
+
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1655 |
+
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1656 |
+
)
|
| 1657 |
+
_tl.mmaudio_regen_ctx = {"seg_path": seg_path}
|
| 1658 |
+
wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
|
| 1659 |
+
prompt, negative_prompt, seed_val,
|
| 1660 |
+
cfg_strength, num_steps,
|
| 1661 |
+
crossfade_s, crossfade_db, slot_id)
|
| 1662 |
+
return wav, src_sr
|
| 1663 |
|
| 1664 |
+
yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1665 |
|
| 1666 |
|
| 1667 |
def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
|
|
| 1670 |
crossfade_s, crossfade_db,
|
| 1671 |
request: gr.Request = None):
|
| 1672 |
"""Cross-model regen: run HunyuanFoley inference and splice into *slot_id*."""
|
|
|
|
| 1673 |
seg_idx = int(seg_idx)
|
| 1674 |
+
meta = json.loads(state_json)
|
| 1675 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1676 |
|
| 1677 |
+
def _run():
|
| 1678 |
+
seg_path = _extract_segment_clip(
|
| 1679 |
+
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1680 |
+
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1681 |
+
)
|
| 1682 |
+
_tl.hunyuan_regen_ctx = _preload_hunyuan_regen_ctx(meta, seg_path)
|
| 1683 |
+
wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
|
| 1684 |
+
prompt, negative_prompt, seed_val,
|
| 1685 |
+
guidance_scale, num_steps, model_size,
|
| 1686 |
+
crossfade_s, crossfade_db, slot_id)
|
| 1687 |
+
return wav, src_sr
|
| 1688 |
+
|
| 1689 |
+
yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
|
|
|
|
|
|
|
|
|
|
| 1690 |
|
| 1691 |
|
| 1692 |
# ================================================================== #
|