BoxOfColors Claude Opus 4.6 committed on
Commit
13cc4e6
·
1 Parent(s): a4226e1

perf: move CPU work outside @spaces.GPU to reduce ZeroGPU cost

Browse files

Split all generate and regen functions into CPU wrapper + GPU-only
inner function pattern. CPU pre/post-processing (ffmpeg, torchaudio,
numpy stitching, muxing) now runs outside @spaces.GPU boundary.
Saves ~5-12s of GPU reservation time per call.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +282 -127
app.py CHANGED
@@ -402,14 +402,14 @@ def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float,
402
 
403
 
404
  @spaces.GPU(duration=_taro_duration)
405
- def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
406
- crossfade_s, crossfade_db, num_samples):
407
- """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window."""
 
408
  global _TARO_INFERENCE_CACHE
409
 
410
  seed_val = int(seed_val)
411
  crossfade_s = float(crossfade_s)
412
- crossfade_db = float(crossfade_db)
413
  num_samples = int(num_samples)
414
  if seed_val < 0:
415
  seed_val = random.randint(0, 2**32 - 1)
@@ -418,8 +418,6 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
418
  device = "cuda" if torch.cuda.is_available() else "cpu"
419
  weight_dtype = torch.bfloat16
420
 
421
- # TARO modules use bare imports (e.g. `from cavp_util import ...`) that
422
- # assume the TARO directory is on sys.path. Add it before importing.
423
  _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
424
  if _taro_dir not in sys.path:
425
  sys.path.insert(0, _taro_dir)
@@ -427,21 +425,19 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
427
  from TARO.onset_util import extract_onset
428
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
429
 
 
 
 
 
 
 
 
430
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
431
  model, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
432
 
433
- # -- Prepare silent video (shared across all samples) --
434
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
435
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
436
- strip_audio_from_video(video_file, silent_video)
437
-
438
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
439
- # Use actual video duration from ffprobe — CAVP frame count can under-count
440
- # if the extractor drops the last partial window, leading to truncated audio.
441
- total_dur_s = get_video_duration(video_file)
442
- segments = _build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s)
443
 
444
- outputs = []
445
  for sample_idx in range(num_samples):
446
  sample_seed = seed_val + sample_idx
447
  cache_key = (video_file, sample_seed, float(cfg_scale), int(num_steps), mode, crossfade_s)
@@ -450,7 +446,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
450
  cached = _TARO_INFERENCE_CACHE.get(cache_key)
451
  if cached is not None:
452
  print(f"[TARO] Sample {sample_idx+1}: cache hit.")
453
- wavs = cached["wavs"]
454
  else:
455
  set_global_seed(sample_seed)
456
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
@@ -478,7 +474,42 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
478
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
479
  while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
480
  _TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
 
 
 
 
 
 
481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
483
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
484
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
@@ -489,7 +520,8 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
489
  cavp_path = os.path.join(tmp_dir, f"taro_{sample_idx}_cavp.npy")
490
  onset_path = os.path.join(tmp_dir, f"taro_{sample_idx}_onset.npy")
491
  np.save(cavp_path, cavp_feats)
492
- np.save(onset_path, onset_feats)
 
493
  seg_meta = {
494
  "segments": segments,
495
  "wav_paths": wav_paths,
@@ -539,51 +571,33 @@ def _mmaudio_duration(video_file, prompt, negative_prompt, seed_val,
539
 
540
 
541
  @spaces.GPU(duration=_mmaudio_duration)
542
- def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
543
- cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
544
- """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window."""
 
545
  _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
546
  if _mmaudio_dir not in sys.path:
547
  sys.path.insert(0, _mmaudio_dir)
548
 
549
- from mmaudio.eval_utils import generate, load_video, make_video
550
  from mmaudio.model.flow_matching import FlowMatching
551
 
552
  seed_val = int(seed_val)
553
  num_samples = int(num_samples)
554
  crossfade_s = float(crossfade_s)
555
- crossfade_db = float(crossfade_db)
556
 
557
  device = "cuda" if torch.cuda.is_available() else "cpu"
558
  dtype = torch.bfloat16
559
 
560
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
561
 
562
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
563
- outputs = []
564
-
565
- # Strip original audio so the muxed output only contains the generated track
566
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
567
- strip_audio_from_video(video_file, silent_video)
568
-
569
- # MMAudio's fixed window is 8 s. For longer videos we slide over 8 s segments
570
- # with a crossfade overlap and stitch the results into a full-length track.
571
- total_dur_s = get_video_duration(video_file)
572
- segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
573
- print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
574
 
575
  sr = seq_cfg.sampling_rate # 44100
576
 
577
- # Pre-extract all segment clips once (shared across samples, saves ffmpeg overhead)
578
- seg_clip_paths = []
579
- for seg_i, (seg_start, seg_end) in enumerate(segments):
580
- seg_dur = seg_end - seg_start
581
- seg_path = os.path.join(tmp_dir, f"mma_seg_{seg_i}.mp4")
582
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
583
- seg_path, vcodec="copy", an=None
584
- ).run(overwrite_output=True, quiet=True)
585
- seg_clip_paths.append(seg_path)
586
-
587
  for sample_idx in range(num_samples):
588
  rng = torch.Generator(device=device)
589
  if seed_val >= 0:
@@ -591,7 +605,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
591
  else:
592
  rng.seed()
593
 
594
- seg_audios = [] # list of (channels, samples) numpy arrays
595
  _t_mma_start = time.perf_counter()
596
 
597
  for seg_i, (seg_start, seg_end) in enumerate(segments):
@@ -633,8 +647,49 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
633
  print(f"[MMAudio] Inference done: {_n_segs_mma} seg(s) × {int(num_steps)} steps in "
634
  f"{_t_mma_elapsed:.1f}s wall → {_secs_per_step_mma:.3f}s/step "
635
  f"(current constant={MMAUDIO_SECS_PER_STEP})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
- # Crossfade-stitch all segments using shared equal-power helper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638
  full_wav = seg_audios[0]
639
  for nw in seg_audios[1:]:
640
  full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
@@ -642,7 +697,6 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
642
 
643
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.wav")
644
  torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
645
-
646
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
647
  mux_video_audio(silent_video, audio_path, video_path)
648
  wav_paths = _save_seg_wavs(seg_audios, tmp_dir, f"mmaudio_{sample_idx}")
@@ -694,79 +748,52 @@ def _hunyuan_duration(video_file, prompt, negative_prompt, seed_val,
694
 
695
 
696
  @spaces.GPU(duration=_hunyuan_duration)
697
- def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
698
- guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
699
- """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s."""
700
- # Ensure HunyuanVideo-Foley package is importable
701
  _hf_path = str(Path("HunyuanVideo-Foley").resolve())
702
  if _hf_path not in sys.path:
703
  sys.path.insert(0, _hf_path)
704
 
705
  from hunyuanvideo_foley.utils.model_utils import denoise_process
706
  from hunyuanvideo_foley.utils.feature_utils import feature_process
707
- from hunyuanvideo_foley.utils.media_utils import merge_audio_video
708
 
709
  seed_val = int(seed_val)
710
  num_samples = int(num_samples)
711
  crossfade_s = float(crossfade_s)
712
- crossfade_db = float(crossfade_db)
713
  if seed_val >= 0:
714
  set_global_seed(seed_val)
715
 
716
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
717
- model_size = model_size.lower() # "xl" or "xxl"
718
 
719
  model_dict, cfg = _load_hunyuan_model(device, model_size)
720
 
721
- tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
722
- outputs = []
 
 
 
723
 
724
- # Strip original audio so the muxed output only contains the generated track
725
- silent_video = os.path.join(tmp_dir, "silent_input.mp4")
726
- strip_audio_from_video(video_file, silent_video)
727
-
728
- # HunyuanFoley is limited to 15 s per pass. For longer videos we slice the
729
- # input into overlapping segments, generate audio for each, then crossfade-
730
- # stitch the results into a single full-length audio track.
731
- total_dur_s = get_video_duration(silent_video)
732
- segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
733
- print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
734
-
735
- # Pre-extract text features once (same for every segment; stream-copy, no re-encode)
736
- _dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
737
- ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
738
- _dummy_seg_path, vcodec="copy", an=None
739
- ).run(overwrite_output=True, quiet=True)
740
  _, text_feats, _ = feature_process(
741
- _dummy_seg_path,
742
  prompt if prompt else "",
743
  model_dict,
744
  cfg,
745
  neg_prompt=negative_prompt if negative_prompt else None,
746
  )
747
 
748
- # Pre-extract all segment clips once (shared across samples, saves ffmpeg overhead)
749
- hny_seg_clip_paths = []
750
- for seg_i, (seg_start, seg_end) in enumerate(segments):
751
- seg_dur = seg_end - seg_start
752
- seg_path = os.path.join(tmp_dir, f"hny_seg_{seg_i}.mp4")
753
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
754
- seg_path, vcodec="copy", an=None
755
- ).run(overwrite_output=True, quiet=True)
756
- hny_seg_clip_paths.append(seg_path)
757
-
758
- # Generate audio per segment, then stitch
759
  for sample_idx in range(num_samples):
760
  seg_wavs = []
761
- sr = 48000 # HunyuanFoley always outputs 48 kHz
762
  _t_hny_start = time.perf_counter()
763
  for seg_i, (seg_start, seg_end) in enumerate(segments):
764
  seg_dur = seg_end - seg_start
765
- seg_path = hny_seg_clip_paths[seg_i]
766
 
767
- # feature_process returns (visual_feats, text_feats, audio_len).
768
- # We discard the returned text_feats (_) and use the pre-computed
769
- # text_feats from above — text encoding runs once, not per segment.
770
  visual_feats, _, seg_audio_len = feature_process(
771
  seg_path,
772
  prompt if prompt else "",
@@ -787,9 +814,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
787
  num_inference_steps=int(num_steps),
788
  batch_size=1,
789
  )
790
- # audio_batch shape: (1, channels, samples) — take first (and only) sample
791
- wav = audio_batch[0].float().cpu().numpy() # (channels, samples)
792
- # Trim to exact segment length in samples
793
  seg_samples = int(round(seg_dur * sr))
794
  wav = wav[:, :seg_samples]
795
  seg_wavs.append(wav)
@@ -800,12 +825,66 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
800
  print(f"[HunyuanFoley] Inference done: {_n_segs_hny} seg(s) × {int(num_steps)} steps in "
801
  f"{_t_hny_elapsed:.1f}s wall → {_secs_per_step_hny:.3f}s/step "
802
  f"(current constant={HUNYUAN_SECS_PER_STEP})")
 
803
 
804
- # Crossfade-stitch all segments using shared equal-power helper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
805
  full_wav = seg_wavs[0]
806
  for nw in seg_wavs[1:]:
807
  full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
808
- # Trim to exact video duration
809
  full_wav = full_wav[:, : int(round(total_dur_s * sr))]
810
 
811
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
@@ -813,7 +892,6 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
813
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
814
  merge_audio_video(audio_path, silent_video, video_path)
815
  wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"hunyuan_{sample_idx}")
816
- # Cache text features so regen can skip text encoding (~2-3s saved)
817
  text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
818
  torch.save(text_feats, text_feats_path)
819
  seg_meta = {
@@ -922,10 +1000,10 @@ def _taro_regen_duration(video_file, seg_idx, seg_meta_json,
922
 
923
 
924
  @spaces.GPU(duration=_taro_regen_duration)
925
- def regen_taro_segment(video_file, seg_idx, seg_meta_json,
926
- seed_val, cfg_scale, num_steps, mode,
927
- crossfade_s, crossfade_db, slot_id):
928
- """Regenerate one TARO segment with a fresh random seed."""
929
  meta = json.loads(seg_meta_json)
930
  seg_idx = int(seg_idx)
931
  seg_start_s, seg_end_s = meta["segments"][seg_idx]
@@ -940,7 +1018,6 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
940
 
941
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
942
 
943
- # Load cached CAVP + onset features if available (saves ~5-7s of GPU work)
944
  cavp_path = meta.get("cavp_path")
945
  onset_path = meta.get("onset_path")
946
  if cavp_path and os.path.exists(cavp_path) and onset_path and os.path.exists(onset_path):
@@ -960,13 +1037,27 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
960
 
961
  set_global_seed(random.randint(0, 2**32 - 1))
962
 
963
- new_wav = _taro_infer_segment(
964
  model_net, vae, vocoder, cavp_feats, onset_feats,
965
  seg_start_s, seg_end_s, device, weight_dtype,
966
  float(cfg_scale), int(num_steps), mode, latents_scale,
967
  euler_sampler, euler_maruyama_sampler,
968
  )
969
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
970
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
971
  new_wav, seg_idx, meta, slot_id
972
  )
@@ -983,10 +1074,10 @@ def _mmaudio_regen_duration(video_file, seg_idx, seg_meta_json,
983
 
984
 
985
  @spaces.GPU(duration=_mmaudio_regen_duration)
986
- def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
987
- prompt, negative_prompt, seed_val,
988
- cfg_strength, num_steps, crossfade_s, crossfade_db, slot_id):
989
- """Regenerate one MMAudio segment with a fresh random seed."""
990
  meta = json.loads(seg_meta_json)
991
  seg_idx = int(seg_idx)
992
  seg_start, seg_end = meta["segments"][seg_idx]
@@ -1003,14 +1094,18 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1003
  dtype = torch.bfloat16
1004
 
1005
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
 
1006
 
1007
- sr = seq_cfg.sampling_rate
1008
- silent_video = meta["silent_video"]
1009
- tmp_dir = tempfile.mkdtemp()
1010
- seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1011
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1012
- seg_path, vcodec="copy", an=None
1013
- ).run(overwrite_output=True, quiet=True)
 
 
 
1014
 
1015
  rng = torch.Generator(device=device)
1016
  rng.manual_seed(random.randint(0, 2**32 - 1))
@@ -1033,9 +1128,37 @@ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1033
  new_wav = audios.float().cpu()[0].numpy()
1034
  seg_samples = int(round(seg_dur * sr))
1035
  new_wav = new_wav[:, :seg_samples]
 
1036
 
1037
- meta["sr"] = sr
1038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1039
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1040
  new_wav, seg_idx, meta, slot_id
1041
  )
@@ -1053,11 +1176,11 @@ def _hunyuan_regen_duration(video_file, seg_idx, seg_meta_json,
1053
 
1054
 
1055
  @spaces.GPU(duration=_hunyuan_regen_duration)
1056
- def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1057
- prompt, negative_prompt, seed_val,
1058
- guidance_scale, num_steps, model_size,
1059
- crossfade_s, crossfade_db, slot_id):
1060
- """Regenerate one HunyuanFoley segment with a fresh random seed."""
1061
  meta = json.loads(seg_meta_json)
1062
  seg_idx = int(seg_idx)
1063
  seg_start, seg_end = meta["segments"][seg_idx]
@@ -1075,14 +1198,16 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1075
 
1076
  set_global_seed(random.randint(0, 2**32 - 1))
1077
 
1078
- silent_video = meta["silent_video"]
1079
- tmp_dir = tempfile.mkdtemp()
1080
- seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1081
- ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1082
- seg_path, vcodec="copy", an=None
1083
- ).run(overwrite_output=True, quiet=True)
 
 
 
1084
 
1085
- # Load cached text features if available (saves ~2-3s text encoding)
1086
  text_feats_path = meta.get("text_feats_path")
1087
  if text_feats_path and os.path.exists(text_feats_path):
1088
  print("[HunyuanFoley regen] Loading cached text features, extracting visual only")
@@ -1104,9 +1229,39 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1104
  new_wav = audio_batch[0].float().cpu().numpy()
1105
  seg_samples = int(round(seg_dur * sr))
1106
  new_wav = new_wav[:, :seg_samples]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1107
 
1108
- meta["sr"] = sr
1109
 
 
1110
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1111
  new_wav, seg_idx, meta, slot_id
1112
  )
 
402
 
403
 
404
  @spaces.GPU(duration=_taro_duration)
405
+ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
406
+ crossfade_s, crossfade_db, num_samples):
407
+ """GPU-only TARO inference model loading + feature extraction + diffusion.
408
+ Returns list of (wavs_list, onset_feats) per sample."""
409
  global _TARO_INFERENCE_CACHE
410
 
411
  seed_val = int(seed_val)
412
  crossfade_s = float(crossfade_s)
 
413
  num_samples = int(num_samples)
414
  if seed_val < 0:
415
  seed_val = random.randint(0, 2**32 - 1)
 
418
  device = "cuda" if torch.cuda.is_available() else "cpu"
419
  weight_dtype = torch.bfloat16
420
 
 
 
421
  _taro_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "TARO")
422
  if _taro_dir not in sys.path:
423
  sys.path.insert(0, _taro_dir)
 
425
  from TARO.onset_util import extract_onset
426
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
427
 
428
+ # Use pre-computed CPU results from the wrapper
429
+ ctx = _taro_gpu_infer._cpu_ctx
430
+ tmp_dir = ctx["tmp_dir"]
431
+ silent_video = ctx["silent_video"]
432
+ segments = ctx["segments"]
433
+ total_dur_s = ctx["total_dur_s"]
434
+
435
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
436
  model, vae, vocoder, latents_scale = _load_taro_models(device, weight_dtype)
437
 
 
 
 
 
 
438
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
 
 
 
 
439
 
440
+ results = [] # list of (wavs, onset_feats) per sample
441
  for sample_idx in range(num_samples):
442
  sample_seed = seed_val + sample_idx
443
  cache_key = (video_file, sample_seed, float(cfg_scale), int(num_steps), mode, crossfade_s)
 
446
  cached = _TARO_INFERENCE_CACHE.get(cache_key)
447
  if cached is not None:
448
  print(f"[TARO] Sample {sample_idx+1}: cache hit.")
449
+ results.append((cached["wavs"], cavp_feats, None))
450
  else:
451
  set_global_seed(sample_seed)
452
  onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)
 
474
  _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
475
  while len(_TARO_INFERENCE_CACHE) > _TARO_CACHE_MAXLEN:
476
  _TARO_INFERENCE_CACHE.pop(next(iter(_TARO_INFERENCE_CACHE)))
477
+ results.append((wavs, cavp_feats, onset_feats))
478
+
479
+ return results
480
+
481
+ # Attach a context slot for the CPU wrapper to pass pre-computed data
482
+ _taro_gpu_infer._cpu_ctx = {}
483
 
484
+
485
+ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
486
+ crossfade_s, crossfade_db, num_samples):
487
+ """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
488
+ CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
489
+ crossfade_s = float(crossfade_s)
490
+ crossfade_db = float(crossfade_db)
491
+ num_samples = int(num_samples)
492
+
493
+ # ── CPU pre-processing (no GPU needed) ──
494
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
495
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
496
+ strip_audio_from_video(video_file, silent_video)
497
+ total_dur_s = get_video_duration(video_file)
498
+ segments = _build_segments(total_dur_s, TARO_MODEL_DUR, crossfade_s)
499
+
500
+ # Pass pre-computed CPU results to the GPU function via context
501
+ _taro_gpu_infer._cpu_ctx = {
502
+ "tmp_dir": tmp_dir, "silent_video": silent_video,
503
+ "segments": segments, "total_dur_s": total_dur_s,
504
+ }
505
+
506
+ # ── GPU inference only ──
507
+ results = _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
508
+ crossfade_s, crossfade_db, num_samples)
509
+
510
+ # ── CPU post-processing (no GPU needed) ──
511
+ outputs = []
512
+ for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
513
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
514
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
515
  torchaudio.save(audio_path, torch.from_numpy(np.ascontiguousarray(final_wav)).unsqueeze(0), TARO_SR)
 
520
  cavp_path = os.path.join(tmp_dir, f"taro_{sample_idx}_cavp.npy")
521
  onset_path = os.path.join(tmp_dir, f"taro_{sample_idx}_onset.npy")
522
  np.save(cavp_path, cavp_feats)
523
+ if onset_feats is not None:
524
+ np.save(onset_path, onset_feats)
525
  seg_meta = {
526
  "segments": segments,
527
  "wav_paths": wav_paths,
 
571
 
572
 
573
  @spaces.GPU(duration=_mmaudio_duration)
574
+ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
575
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
576
+ """GPU-only MMAudio inference model loading + flow-matching generation.
577
+ Returns list of (seg_audios, sr) per sample."""
578
  _mmaudio_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "MMAudio")
579
  if _mmaudio_dir not in sys.path:
580
  sys.path.insert(0, _mmaudio_dir)
581
 
582
+ from mmaudio.eval_utils import generate, load_video
583
  from mmaudio.model.flow_matching import FlowMatching
584
 
585
  seed_val = int(seed_val)
586
  num_samples = int(num_samples)
587
  crossfade_s = float(crossfade_s)
 
588
 
589
  device = "cuda" if torch.cuda.is_available() else "cpu"
590
  dtype = torch.bfloat16
591
 
592
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
593
 
594
+ ctx = _mmaudio_gpu_infer._cpu_ctx
595
+ segments = ctx["segments"]
596
+ seg_clip_paths = ctx["seg_clip_paths"]
 
 
 
 
 
 
 
 
 
597
 
598
  sr = seq_cfg.sampling_rate # 44100
599
 
600
+ results = []
 
 
 
 
 
 
 
 
 
601
  for sample_idx in range(num_samples):
602
  rng = torch.Generator(device=device)
603
  if seed_val >= 0:
 
605
  else:
606
  rng.seed()
607
 
608
+ seg_audios = []
609
  _t_mma_start = time.perf_counter()
610
 
611
  for seg_i, (seg_start, seg_end) in enumerate(segments):
 
647
  print(f"[MMAudio] Inference done: {_n_segs_mma} seg(s) × {int(num_steps)} steps in "
648
  f"{_t_mma_elapsed:.1f}s wall → {_secs_per_step_mma:.3f}s/step "
649
  f"(current constant={MMAUDIO_SECS_PER_STEP})")
650
+ results.append((seg_audios, sr))
651
+
652
+ return results
653
+
654
+ _mmaudio_gpu_infer._cpu_ctx = {}
655
+
656
+
657
+ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
658
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples):
659
+ """MMAudio: flow-matching video-to-audio, 44.1 kHz, 8 s sliding window.
660
+ CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
661
+ num_samples = int(num_samples)
662
+ crossfade_s = float(crossfade_s)
663
+ crossfade_db = float(crossfade_db)
664
+
665
+ # ── CPU pre-processing ──
666
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
667
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
668
+ strip_audio_from_video(video_file, silent_video)
669
+ total_dur_s = get_video_duration(video_file)
670
+ segments = _build_segments(total_dur_s, MMAUDIO_WINDOW, crossfade_s)
671
+ print(f"[MMAudio] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤8 s")
672
 
673
+ seg_clip_paths = []
674
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
675
+ seg_dur = seg_end - seg_start
676
+ seg_path = os.path.join(tmp_dir, f"mma_seg_{seg_i}.mp4")
677
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
678
+ seg_path, vcodec="copy", an=None
679
+ ).run(overwrite_output=True, quiet=True)
680
+ seg_clip_paths.append(seg_path)
681
+
682
+ _mmaudio_gpu_infer._cpu_ctx = {
683
+ "segments": segments, "seg_clip_paths": seg_clip_paths,
684
+ }
685
+
686
+ # ── GPU inference only ──
687
+ results = _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
688
+ cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples)
689
+
690
+ # ── CPU post-processing ──
691
+ outputs = []
692
+ for sample_idx, (seg_audios, sr) in enumerate(results):
693
  full_wav = seg_audios[0]
694
  for nw in seg_audios[1:]:
695
  full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
 
697
 
698
  audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.wav")
699
  torchaudio.save(audio_path, torch.from_numpy(full_wav), sr)
 
700
  video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
701
  mux_video_audio(silent_video, audio_path, video_path)
702
  wav_paths = _save_seg_wavs(seg_audios, tmp_dir, f"mmaudio_{sample_idx}")
 
748
 
749
 
750
  @spaces.GPU(duration=_hunyuan_duration)
751
+ def _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
752
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
753
+ """GPU-only HunyuanFoley inference model loading + feature extraction + denoising.
754
+ Returns list of (seg_wavs, sr, text_feats) per sample."""
755
  _hf_path = str(Path("HunyuanVideo-Foley").resolve())
756
  if _hf_path not in sys.path:
757
  sys.path.insert(0, _hf_path)
758
 
759
  from hunyuanvideo_foley.utils.model_utils import denoise_process
760
  from hunyuanvideo_foley.utils.feature_utils import feature_process
 
761
 
762
  seed_val = int(seed_val)
763
  num_samples = int(num_samples)
764
  crossfade_s = float(crossfade_s)
 
765
  if seed_val >= 0:
766
  set_global_seed(seed_val)
767
 
768
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
769
+ model_size = model_size.lower()
770
 
771
  model_dict, cfg = _load_hunyuan_model(device, model_size)
772
 
773
+ ctx = _hunyuan_gpu_infer._cpu_ctx
774
+ segments = ctx["segments"]
775
+ total_dur_s = ctx["total_dur_s"]
776
+ dummy_seg_path = ctx["dummy_seg_path"]
777
+ seg_clip_paths = ctx["seg_clip_paths"]
778
 
779
+ # Text feature extraction (GPU runs once for all segments)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
780
  _, text_feats, _ = feature_process(
781
+ dummy_seg_path,
782
  prompt if prompt else "",
783
  model_dict,
784
  cfg,
785
  neg_prompt=negative_prompt if negative_prompt else None,
786
  )
787
 
788
+ results = []
 
 
 
 
 
 
 
 
 
 
789
  for sample_idx in range(num_samples):
790
  seg_wavs = []
791
+ sr = 48000
792
  _t_hny_start = time.perf_counter()
793
  for seg_i, (seg_start, seg_end) in enumerate(segments):
794
  seg_dur = seg_end - seg_start
795
+ seg_path = seg_clip_paths[seg_i]
796
 
 
 
 
797
  visual_feats, _, seg_audio_len = feature_process(
798
  seg_path,
799
  prompt if prompt else "",
 
814
  num_inference_steps=int(num_steps),
815
  batch_size=1,
816
  )
817
+ wav = audio_batch[0].float().cpu().numpy()
 
 
818
  seg_samples = int(round(seg_dur * sr))
819
  wav = wav[:, :seg_samples]
820
  seg_wavs.append(wav)
 
825
  print(f"[HunyuanFoley] Inference done: {_n_segs_hny} seg(s) × {int(num_steps)} steps in "
826
  f"{_t_hny_elapsed:.1f}s wall → {_secs_per_step_hny:.3f}s/step "
827
  f"(current constant={HUNYUAN_SECS_PER_STEP})")
828
+ results.append((seg_wavs, sr, text_feats))
829
 
830
+ return results
831
+
832
+ _hunyuan_gpu_infer._cpu_ctx = {}
833
+
834
+
835
+ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
836
+ guidance_scale, num_steps, model_size, crossfade_s, crossfade_db, num_samples):
837
+ """HunyuanVideoFoley: text-guided foley, 48 kHz, up to 15 s.
838
+ CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
839
+ num_samples = int(num_samples)
840
+ crossfade_s = float(crossfade_s)
841
+ crossfade_db = float(crossfade_db)
842
+
843
+ # ── CPU pre-processing (no GPU needed) ──
844
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
845
+ silent_video = os.path.join(tmp_dir, "silent_input.mp4")
846
+ strip_audio_from_video(video_file, silent_video)
847
+ total_dur_s = get_video_duration(silent_video)
848
+ segments = _build_segments(total_dur_s, HUNYUAN_MAX_DUR, crossfade_s)
849
+ print(f"[HunyuanFoley] Video={total_dur_s:.2f}s | {len(segments)} segment(s) × ≤15 s")
850
+
851
+ # Pre-extract dummy segment for text feature extraction (ffmpeg, CPU)
852
+ dummy_seg_path = os.path.join(tmp_dir, "_seg_dummy.mp4")
853
+ ffmpeg.input(silent_video, ss=0, t=min(total_dur_s, HUNYUAN_MAX_DUR)).output(
854
+ dummy_seg_path, vcodec="copy", an=None
855
+ ).run(overwrite_output=True, quiet=True)
856
+
857
+ # Pre-extract all segment clips (ffmpeg, CPU)
858
+ seg_clip_paths = []
859
+ for seg_i, (seg_start, seg_end) in enumerate(segments):
860
+ seg_dur = seg_end - seg_start
861
+ seg_path = os.path.join(tmp_dir, f"hny_seg_{seg_i}.mp4")
862
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
863
+ seg_path, vcodec="copy", an=None
864
+ ).run(overwrite_output=True, quiet=True)
865
+ seg_clip_paths.append(seg_path)
866
+
867
+ _hunyuan_gpu_infer._cpu_ctx = {
868
+ "segments": segments, "total_dur_s": total_dur_s,
869
+ "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
870
+ }
871
+
872
+ # ── GPU inference only ──
873
+ results = _hunyuan_gpu_infer(video_file, prompt, negative_prompt, seed_val,
874
+ guidance_scale, num_steps, model_size,
875
+ crossfade_s, crossfade_db, num_samples)
876
+
877
+ # ── CPU post-processing (no GPU needed) ──
878
+ _hf_path = str(Path("HunyuanVideo-Foley").resolve())
879
+ if _hf_path not in sys.path:
880
+ sys.path.insert(0, _hf_path)
881
+ from hunyuanvideo_foley.utils.media_utils import merge_audio_video
882
+
883
+ outputs = []
884
+ for sample_idx, (seg_wavs, sr, text_feats) in enumerate(results):
885
  full_wav = seg_wavs[0]
886
  for nw in seg_wavs[1:]:
887
  full_wav = _cf_join(full_wav, nw, crossfade_s, crossfade_db, sr)
 
888
  full_wav = full_wav[:, : int(round(total_dur_s * sr))]
889
 
890
  audio_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.wav")
 
892
  video_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}.mp4")
893
  merge_audio_video(audio_path, silent_video, video_path)
894
  wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"hunyuan_{sample_idx}")
 
895
  text_feats_path = os.path.join(tmp_dir, f"hunyuan_{sample_idx}_text_feats.pt")
896
  torch.save(text_feats, text_feats_path)
897
  seg_meta = {
 
1000
 
1001
 
1002
  @spaces.GPU(duration=_taro_regen_duration)
1003
+ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1004
+ seed_val, cfg_scale, num_steps, mode,
1005
+ crossfade_s, crossfade_db, slot_id=None):
1006
+ """GPU-only TARO regen returns new_wav for a single segment."""
1007
  meta = json.loads(seg_meta_json)
1008
  seg_idx = int(seg_idx)
1009
  seg_start_s, seg_end_s = meta["segments"][seg_idx]
 
1018
 
1019
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
1020
 
 
1021
  cavp_path = meta.get("cavp_path")
1022
  onset_path = meta.get("onset_path")
1023
  if cavp_path and os.path.exists(cavp_path) and onset_path and os.path.exists(onset_path):
 
1037
 
1038
  set_global_seed(random.randint(0, 2**32 - 1))
1039
 
1040
+ return _taro_infer_segment(
1041
  model_net, vae, vocoder, cavp_feats, onset_feats,
1042
  seg_start_s, seg_end_s, device, weight_dtype,
1043
  float(cfg_scale), int(num_steps), mode, latents_scale,
1044
  euler_sampler, euler_maruyama_sampler,
1045
  )
1046
 
1047
+
1048
+ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
1049
+ seed_val, cfg_scale, num_steps, mode,
1050
+ crossfade_s, crossfade_db, slot_id):
1051
+ """Regenerate one TARO segment. GPU inference + CPU splice/save."""
1052
+ meta = json.loads(seg_meta_json)
1053
+ seg_idx = int(seg_idx)
1054
+
1055
+ # GPU: inference only
1056
+ new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
1057
+ seed_val, cfg_scale, num_steps, mode,
1058
+ crossfade_s, crossfade_db, slot_id)
1059
+
1060
+ # CPU: splice, stitch, mux, save
1061
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1062
  new_wav, seg_idx, meta, slot_id
1063
  )
 
1074
 
1075
 
1076
  @spaces.GPU(duration=_mmaudio_regen_duration)
1077
+ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1078
+ prompt, negative_prompt, seed_val,
1079
+ cfg_strength, num_steps, crossfade_s, crossfade_db, slot_id=None):
1080
+ """GPU-only MMAudio regen returns (new_wav, sr) for a single segment."""
1081
  meta = json.loads(seg_meta_json)
1082
  seg_idx = int(seg_idx)
1083
  seg_start, seg_end = meta["segments"][seg_idx]
 
1094
  dtype = torch.bfloat16
1095
 
1096
  net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
1097
+ sr = seq_cfg.sampling_rate
1098
 
1099
+ # Use pre-extracted segment clip from the wrapper
1100
+ seg_path = _regen_mmaudio_gpu._cpu_ctx.get("seg_path")
1101
+ if not seg_path:
1102
+ # Fallback: extract inside GPU (shouldn't happen)
1103
+ silent_video = meta["silent_video"]
1104
+ tmp_dir = tempfile.mkdtemp()
1105
+ seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1106
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1107
+ seg_path, vcodec="copy", an=None
1108
+ ).run(overwrite_output=True, quiet=True)
1109
 
1110
  rng = torch.Generator(device=device)
1111
  rng.manual_seed(random.randint(0, 2**32 - 1))
 
1128
  new_wav = audios.float().cpu()[0].numpy()
1129
  seg_samples = int(round(seg_dur * sr))
1130
  new_wav = new_wav[:, :seg_samples]
1131
+ return new_wav, sr
1132
 
1133
+ _regen_mmaudio_gpu._cpu_ctx = {}
1134
 
1135
+
1136
+ def regen_mmaudio_segment(video_file, seg_idx, seg_meta_json,
1137
+ prompt, negative_prompt, seed_val,
1138
+ cfg_strength, num_steps, crossfade_s, crossfade_db, slot_id):
1139
+ """Regenerate one MMAudio segment. GPU inference + CPU splice/save."""
1140
+ meta = json.loads(seg_meta_json)
1141
+ seg_idx = int(seg_idx)
1142
+ seg_start, seg_end = meta["segments"][seg_idx]
1143
+ seg_dur = seg_end - seg_start
1144
+
1145
+ # CPU: pre-extract segment clip
1146
+ silent_video = meta["silent_video"]
1147
+ tmp_dir = tempfile.mkdtemp()
1148
+ seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1149
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1150
+ seg_path, vcodec="copy", an=None
1151
+ ).run(overwrite_output=True, quiet=True)
1152
+ _regen_mmaudio_gpu._cpu_ctx = {"seg_path": seg_path}
1153
+
1154
+ # GPU: inference only
1155
+ new_wav, sr = _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
1156
+ prompt, negative_prompt, seed_val,
1157
+ cfg_strength, num_steps, crossfade_s, crossfade_db, slot_id)
1158
+
1159
+ meta["sr"] = sr
1160
+
1161
+ # CPU: splice, stitch, mux, save
1162
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1163
  new_wav, seg_idx, meta, slot_id
1164
  )
 
1176
 
1177
 
1178
  @spaces.GPU(duration=_hunyuan_regen_duration)
1179
+ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1180
+ prompt, negative_prompt, seed_val,
1181
+ guidance_scale, num_steps, model_size,
1182
+ crossfade_s, crossfade_db, slot_id=None):
1183
+ """GPU-only HunyuanFoley regen returns (new_wav, sr) for a single segment."""
1184
  meta = json.loads(seg_meta_json)
1185
  seg_idx = int(seg_idx)
1186
  seg_start, seg_end = meta["segments"][seg_idx]
 
1198
 
1199
  set_global_seed(random.randint(0, 2**32 - 1))
1200
 
1201
+ # Use pre-extracted segment clip from wrapper
1202
+ seg_path = _regen_hunyuan_gpu._cpu_ctx.get("seg_path")
1203
+ if not seg_path:
1204
+ silent_video = meta["silent_video"]
1205
+ tmp_dir = tempfile.mkdtemp()
1206
+ seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1207
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1208
+ seg_path, vcodec="copy", an=None
1209
+ ).run(overwrite_output=True, quiet=True)
1210
 
 
1211
  text_feats_path = meta.get("text_feats_path")
1212
  if text_feats_path and os.path.exists(text_feats_path):
1213
  print("[HunyuanFoley regen] Loading cached text features, extracting visual only")
 
1229
  new_wav = audio_batch[0].float().cpu().numpy()
1230
  seg_samples = int(round(seg_dur * sr))
1231
  new_wav = new_wav[:, :seg_samples]
1232
+ return new_wav, sr
1233
+
1234
+ _regen_hunyuan_gpu._cpu_ctx = {}
1235
+
1236
+
1237
+ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
1238
+ prompt, negative_prompt, seed_val,
1239
+ guidance_scale, num_steps, model_size,
1240
+ crossfade_s, crossfade_db, slot_id):
1241
+ """Regenerate one HunyuanFoley segment. GPU inference + CPU splice/save."""
1242
+ meta = json.loads(seg_meta_json)
1243
+ seg_idx = int(seg_idx)
1244
+ seg_start, seg_end = meta["segments"][seg_idx]
1245
+ seg_dur = seg_end - seg_start
1246
+
1247
+ # CPU: pre-extract segment clip
1248
+ silent_video = meta["silent_video"]
1249
+ tmp_dir = tempfile.mkdtemp()
1250
+ seg_path = os.path.join(tmp_dir, "regen_seg.mp4")
1251
+ ffmpeg.input(silent_video, ss=seg_start, t=seg_dur).output(
1252
+ seg_path, vcodec="copy", an=None
1253
+ ).run(overwrite_output=True, quiet=True)
1254
+ _regen_hunyuan_gpu._cpu_ctx = {"seg_path": seg_path}
1255
+
1256
+ # GPU: inference only
1257
+ new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
1258
+ prompt, negative_prompt, seed_val,
1259
+ guidance_scale, num_steps, model_size,
1260
+ crossfade_s, crossfade_db, slot_id)
1261
 
1262
+ meta["sr"] = sr
1263
 
1264
+ # CPU: splice, stitch, mux, save
1265
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1266
  new_wav, seg_idx, meta, slot_id
1267
  )