Spaces:
Running on Zero
ZeroGPU optimizations, refactoring, and regen UX improvements
ZeroGPU optimizations:
- Pre-load TARO CAVP/onset features on CPU via _preload_taro_regen_ctx()
- Pre-load HunyuanFoley text_feats on CPU via _preload_hunyuan_regen_ctx()
- Move FlowMatching creation outside per-segment loop in MMAudio
- Replace fallback video extraction in regen GPU fns with asserts
- Tighten TARO_SECS_PER_STEP 0.05 → 0.025 (measured 0.023s/step)
- Drop regen duration floor 60s → 20s (cold-start spin-up not in timer)
Refactoring:
- Extract _post_process_samples() shared post-processing for all 3 models
- Unify mux_video_audio() to handle HunyuanFoley internally via model= param
- Extract _preload_taro_regen_ctx / _preload_hunyuan_regen_ctx helpers
- Make _resample_to_target() accept dst_sr; _resample_to_slot_sr delegates to it
Regen UX:
- Flash red border on waveform for 3s when regen aborts
- Show 'GPU cold-start — segment unchanged, try again' instead of misleading
'Quota exceeded' for GPU task aborted errors
- Status bar and seg label both auto-clear after 8s
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
|
@@ -261,14 +261,24 @@ def _load_hunyuan_model(device, model_size):
|
|
| 261 |
enable_offload=False, model_size=model_size)
|
| 262 |
|
| 263 |
|
| 264 |
-
def mux_video_audio(silent_video: str, audio_path: str, output_path: str
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
|
| 274 |
# ------------------------------------------------------------------ #
|
|
@@ -340,7 +350,7 @@ TARO_FPS = 4
|
|
| 340 |
TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
|
| 341 |
TARO_TRUNCATE_ONSET = 120
|
| 342 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 343 |
-
TARO_SECS_PER_STEP = 0.
|
| 344 |
|
| 345 |
TARO_LOAD_OVERHEAD = 15 # seconds: model load + CAVP feature extraction
|
| 346 |
MMAUDIO_WINDOW = 8.0 # seconds β MMAudio's fixed generation window
|
|
@@ -359,7 +369,7 @@ MODEL_CONFIGS = {
|
|
| 359 |
"taro": {
|
| 360 |
"window_s": TARO_MODEL_DUR, # 8.192 s
|
| 361 |
"sr": TARO_SR, # 16000
|
| 362 |
-
"secs_per_step": TARO_SECS_PER_STEP, # 0.
|
| 363 |
"load_overhead": TARO_LOAD_OVERHEAD, # 15
|
| 364 |
"tab_prefix": "taro",
|
| 365 |
"regen_fn": None, # set after function definitions (avoids forward-ref)
|
|
@@ -410,11 +420,14 @@ def _estimate_gpu_duration(model_key: str, num_samples: int, num_steps: int,
|
|
| 410 |
|
| 411 |
def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
|
| 412 |
"""Generic GPU duration estimator for single-segment regen.
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
| 415 |
cfg = MODEL_CONFIGS[model_key]
|
| 416 |
secs = int(num_steps) * cfg["secs_per_step"] + cfg["load_overhead"]
|
| 417 |
-
result = min(GPU_DURATION_CAP, max(
|
| 418 |
print(f"[duration] {cfg['label']} regen: 1 seg Γ {int(num_steps)} steps β {secs:.0f}s β capped {result}s")
|
| 419 |
return result
|
| 420 |
|
|
@@ -509,19 +522,22 @@ TARGET_SR = 48000 # unified output sample rate for all three models
|
|
| 509 |
TARO_SR_OUT = TARGET_SR
|
| 510 |
|
| 511 |
|
| 512 |
-
def _resample_to_target(wav: np.ndarray, src_sr: int
|
| 513 |
-
|
|
|
|
| 514 |
|
| 515 |
-
No-op if src_sr
|
| 516 |
-
sinc resampling β CPU-only, ZeroGPU-safe.
|
| 517 |
"""
|
| 518 |
-
if
|
|
|
|
|
|
|
| 519 |
return wav
|
| 520 |
stereo = wav.ndim == 2
|
| 521 |
t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
|
| 522 |
if not stereo:
|
| 523 |
t = t.unsqueeze(0) # [1, T]
|
| 524 |
-
t = torchaudio.functional.resample(t, src_sr,
|
| 525 |
if not stereo:
|
| 526 |
t = t.squeeze(0) # [T]
|
| 527 |
return t.numpy()
|
|
@@ -592,6 +608,45 @@ def _build_seg_meta(*, segments, wav_paths, audio_path, video_path,
|
|
| 592 |
return meta
|
| 593 |
|
| 594 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
def _cpu_preprocess(video_file: str, model_dur: float,
|
| 596 |
crossfade_s: float) -> tuple:
|
| 597 |
"""Shared CPU pre-processing for all generate_* wrappers.
|
|
@@ -709,34 +764,34 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 709 |
crossfade_s, crossfade_db, num_samples)
|
| 710 |
|
| 711 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 712 |
-
#
|
| 713 |
cavp_path = os.path.join(tmp_dir, "taro_cavp.npy")
|
| 714 |
onset_path = os.path.join(tmp_dir, "taro_onset.npy")
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
|
|
|
| 719 |
wavs = [_upsample_taro(w) for w in wavs]
|
| 720 |
-
|
| 721 |
-
audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
|
| 722 |
-
_save_wav(audio_path, final_wav, TARO_SR_OUT)
|
| 723 |
-
video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
|
| 724 |
-
mux_video_audio(silent_video, audio_path, video_path)
|
| 725 |
-
wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
|
| 726 |
-
# Save shared features once (not per-sample β they're identical)
|
| 727 |
-
if not first_cavp_saved:
|
| 728 |
np.save(cavp_path, cavp_feats)
|
| 729 |
if onset_feats is not None:
|
| 730 |
np.save(onset_path, onset_feats)
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
model="taro", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 736 |
-
total_dur_s=total_dur_s, cavp_path=cavp_path, onset_path=onset_path,
|
| 737 |
-
)
|
| 738 |
-
outputs.append((video_path, audio_path, seg_meta))
|
| 739 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
return _pad_outputs(outputs)
|
| 741 |
|
| 742 |
|
|
@@ -794,12 +849,12 @@ def _mmaudio_gpu_infer(video_file, prompt, negative_prompt, seed_val,
|
|
| 794 |
|
| 795 |
seg_audios = []
|
| 796 |
_t_mma_start = time.perf_counter()
|
|
|
|
| 797 |
|
| 798 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 799 |
seg_dur = seg_end - seg_start
|
| 800 |
seg_path = seg_clip_paths[seg_i]
|
| 801 |
|
| 802 |
-
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 803 |
video_info = load_video(seg_path, seg_dur)
|
| 804 |
clip_frames = video_info.clip_frames.unsqueeze(0)
|
| 805 |
sync_frames = video_info.sync_frames.unsqueeze(0)
|
|
@@ -868,29 +923,21 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
|
|
| 868 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples)
|
| 869 |
|
| 870 |
# ββ CPU post-processing ββ
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
if sr != TARGET_SR:
|
| 875 |
print(f"[MMAudio upsample] resampling {sr}Hz β {TARGET_SR}Hz (sinc, CPU) β¦")
|
| 876 |
seg_audios = [_resample_to_target(w, sr) for w in seg_audios]
|
| 877 |
print(f"[MMAudio upsample] done β {len(seg_audios)} seg(s) @ {TARGET_SR}Hz")
|
| 878 |
-
|
| 879 |
-
full_wav = _stitch_wavs(seg_audios, crossfade_s, crossfade_db, total_dur_s, sr)
|
| 880 |
-
|
| 881 |
-
audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.wav")
|
| 882 |
-
_save_wav(audio_path, full_wav, sr)
|
| 883 |
-
video_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.mp4")
|
| 884 |
-
mux_video_audio(silent_video, audio_path, video_path)
|
| 885 |
-
wav_paths = _save_seg_wavs(seg_audios, tmp_dir, f"mmaudio_{sample_idx}")
|
| 886 |
-
seg_meta = _build_seg_meta(
|
| 887 |
-
segments=segments, wav_paths=wav_paths, audio_path=audio_path,
|
| 888 |
-
video_path=video_path, silent_video=silent_video, sr=sr,
|
| 889 |
-
model="mmaudio", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 890 |
-
total_dur_s=total_dur_s,
|
| 891 |
-
)
|
| 892 |
-
outputs.append((video_path, audio_path, seg_meta))
|
| 893 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
return _pad_outputs(outputs)
|
| 895 |
|
| 896 |
|
|
@@ -1034,28 +1081,19 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1034 |
crossfade_s, crossfade_db, num_samples)
|
| 1035 |
|
| 1036 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
-
|
| 1045 |
-
|
| 1046 |
-
|
| 1047 |
-
|
| 1048 |
-
|
| 1049 |
-
|
| 1050 |
-
torch.save(text_feats, text_feats_path)
|
| 1051 |
-
seg_meta = _build_seg_meta(
|
| 1052 |
-
segments=segments, wav_paths=wav_paths, audio_path=audio_path,
|
| 1053 |
-
video_path=video_path, silent_video=silent_video, sr=sr,
|
| 1054 |
-
model="hunyuan", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1055 |
-
total_dur_s=total_dur_s, text_feats_path=text_feats_path,
|
| 1056 |
-
)
|
| 1057 |
-
outputs.append((video_path, audio_path, seg_meta))
|
| 1058 |
-
|
| 1059 |
return _pad_outputs(outputs)
|
| 1060 |
|
| 1061 |
|
|
@@ -1069,6 +1107,28 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1069 |
# 4. Returns (new_video_path, new_audio_path, updated_seg_meta, new_waveform_html)
|
| 1070 |
# ================================================================== #
|
| 1071 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1072 |
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
| 1073 |
"""Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
|
| 1074 |
Returns (video_path, audio_path, updated_meta, waveform_html).
|
|
@@ -1099,13 +1159,7 @@ def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
|
| 1099 |
_vid_base = os.path.splitext(os.path.basename(meta["video_path"]))[0]
|
| 1100 |
_vid_base_clean = _vid_base.rsplit("_regen_", 1)[0]
|
| 1101 |
video_path = os.path.join(tmp_dir, f"{_vid_base_clean}_regen_{_ts}.mp4")
|
| 1102 |
-
|
| 1103 |
-
# HunyuanFoley uses its own merge_audio_video
|
| 1104 |
-
_ensure_syspath("HunyuanVideo-Foley")
|
| 1105 |
-
from hunyuanvideo_foley.utils.media_utils import merge_audio_video
|
| 1106 |
-
merge_audio_video(audio_path, silent_video, video_path)
|
| 1107 |
-
else:
|
| 1108 |
-
mux_video_audio(silent_video, audio_path, video_path)
|
| 1109 |
|
| 1110 |
# Save updated segment wavs to .npy files
|
| 1111 |
updated_wav_paths = _save_seg_wavs(wavs, tmp_dir, os.path.splitext(_base_clean)[0])
|
|
@@ -1157,12 +1211,12 @@ def _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1157 |
_ensure_syspath("TARO")
|
| 1158 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 1159 |
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
-
if
|
| 1163 |
-
print("[TARO regen]
|
| 1164 |
-
cavp_feats =
|
| 1165 |
-
onset_feats =
|
| 1166 |
else:
|
| 1167 |
print("[TARO regen] Cache miss β re-extracting CAVP + onset features")
|
| 1168 |
from TARO.onset_util import extract_onset
|
|
@@ -1195,6 +1249,9 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1195 |
meta = json.loads(seg_meta_json)
|
| 1196 |
seg_idx = int(seg_idx)
|
| 1197 |
|
|
|
|
|
|
|
|
|
|
| 1198 |
# GPU: inference only
|
| 1199 |
new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
| 1200 |
seed_val, cfg_scale, num_steps, mode,
|
|
@@ -1234,14 +1291,9 @@ def _regen_mmaudio_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1234 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 1235 |
sr = seq_cfg.sampling_rate
|
| 1236 |
|
| 1237 |
-
# Use pre-extracted segment clip from the wrapper
|
| 1238 |
seg_path = _regen_mmaudio_gpu._cpu_ctx.get("seg_path")
|
| 1239 |
-
|
| 1240 |
-
# Fallback: extract inside GPU (shouldn't happen)
|
| 1241 |
-
seg_path = _extract_segment_clip(
|
| 1242 |
-
meta["silent_video"], seg_start, seg_dur,
|
| 1243 |
-
os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
|
| 1244 |
-
)
|
| 1245 |
|
| 1246 |
rng = torch.Generator(device=device)
|
| 1247 |
rng.manual_seed(random.randint(0, 2**32 - 1))
|
|
@@ -1335,18 +1387,15 @@ def _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
| 1335 |
|
| 1336 |
# Use pre-extracted segment clip from wrapper
|
| 1337 |
seg_path = _regen_hunyuan_gpu._cpu_ctx.get("seg_path")
|
| 1338 |
-
|
| 1339 |
-
seg_path = _extract_segment_clip(
|
| 1340 |
-
meta["silent_video"], seg_start, seg_dur,
|
| 1341 |
-
os.path.join(tempfile.mkdtemp(), "regen_seg.mp4"),
|
| 1342 |
-
)
|
| 1343 |
|
| 1344 |
-
|
| 1345 |
-
|
| 1346 |
-
|
|
|
|
| 1347 |
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
| 1348 |
visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
|
| 1349 |
-
text_feats =
|
| 1350 |
else:
|
| 1351 |
print("[HunyuanFoley regen] Cache miss β extracting text + visual features")
|
| 1352 |
visual_feats, text_feats, seg_audio_len = feature_process(
|
|
@@ -1377,13 +1426,13 @@ def regen_hunyuan_segment(video_file, seg_idx, seg_meta_json,
|
|
| 1377 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1378 |
seg_dur = seg_end - seg_start
|
| 1379 |
|
| 1380 |
-
# CPU: pre-extract segment clip
|
| 1381 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1382 |
seg_path = _extract_segment_clip(
|
| 1383 |
meta["silent_video"], seg_start, seg_dur,
|
| 1384 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1385 |
)
|
| 1386 |
-
_regen_hunyuan_gpu._cpu_ctx =
|
| 1387 |
|
| 1388 |
# GPU: inference only
|
| 1389 |
new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
@@ -1419,26 +1468,17 @@ MODEL_CONFIGS["hunyuan"]["regen_fn"] = regen_hunyuan_segment
|
|
| 1419 |
|
| 1420 |
def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
|
| 1421 |
slot_wav_ref: np.ndarray = None) -> np.ndarray:
|
| 1422 |
-
"""Resample *wav* from src_sr to dst_sr
|
| 1423 |
-
|
| 1424 |
|
| 1425 |
TARO is mono (T,), MMAudio/Hunyuan are stereo (C, T). Mixing them
|
| 1426 |
without normalisation causes a shape mismatch in _cf_join. Rules:
|
| 1427 |
-
|
| 1428 |
-
|
| 1429 |
"""
|
| 1430 |
-
|
| 1431 |
-
|
| 1432 |
-
|
| 1433 |
-
t = torch.from_numpy(np.ascontiguousarray(wav))
|
| 1434 |
-
if not stereo_in:
|
| 1435 |
-
t = t.unsqueeze(0)
|
| 1436 |
-
t = torchaudio.functional.resample(t.float(), src_sr, dst_sr)
|
| 1437 |
-
if not stereo_in:
|
| 1438 |
-
t = t.squeeze(0)
|
| 1439 |
-
wav = t.numpy()
|
| 1440 |
-
|
| 1441 |
-
# 2. Match channel layout to the slot's existing segments
|
| 1442 |
if slot_wav_ref is not None:
|
| 1443 |
slot_stereo = slot_wav_ref.ndim == 2
|
| 1444 |
wav_stereo = wav.ndim == 2
|
|
@@ -1474,6 +1514,9 @@ def xregen_taro(seg_idx, state_json, slot_id,
|
|
| 1474 |
pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
|
| 1475 |
yield gr.update(), gr.update(value=pending_html)
|
| 1476 |
|
|
|
|
|
|
|
|
|
|
| 1477 |
new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
|
| 1478 |
seed_val, cfg_scale, num_steps, mode,
|
| 1479 |
crossfade_s, crossfade_db, slot_id)
|
|
@@ -1528,7 +1571,7 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
|
|
| 1528 |
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1529 |
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1530 |
)
|
| 1531 |
-
_regen_hunyuan_gpu._cpu_ctx =
|
| 1532 |
|
| 1533 |
new_wav_raw, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
|
| 1534 |
prompt, negative_prompt, seed_val,
|
|
@@ -2364,6 +2407,7 @@ _GLOBAL_JS = """
|
|
| 2364 |
var lbl = document.getElementById('wf_seglabel_' + slot_id);
|
| 2365 |
if (hadError) {
|
| 2366 |
var toastMsg = typeof errMsg === 'string' ? errMsg : JSON.stringify(errMsg);
|
|
|
|
| 2367 |
if (preRegenWaveHtml !== null) {
|
| 2368 |
var waveEl2 = document.getElementById('slot_wave_' + slot_id);
|
| 2369 |
if (waveEl2) waveEl2.innerHTML = preRegenWaveHtml;
|
|
@@ -2372,13 +2416,35 @@ _GLOBAL_JS = """
|
|
| 2372 |
var vidElR = document.getElementById('slot_vid_' + slot_id);
|
| 2373 |
if (vidElR) { var vR = vidElR.querySelector('video'); if (vR) { vR.setAttribute('src', preRegenVideoSrc); vR.src = preRegenVideoSrc; vR.load(); } }
|
| 2374 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2375 |
var statusBar = document.getElementById('wf_statusbar_' + slot_id);
|
| 2376 |
if (statusBar) {
|
| 2377 |
statusBar.style.color = '#e05252';
|
| 2378 |
-
statusBar.textContent =
|
| 2379 |
-
setTimeout(function() { statusBar.style.color = '#888'; statusBar.textContent = 'Click a segment to regenerate \u00a0|\u00a0 Playhead syncs to video'; },
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2380 |
}
|
| 2381 |
-
if (lbl) lbl.textContent = 'Quota exceeded β try again later';
|
| 2382 |
} else {
|
| 2383 |
if (lbl) lbl.textContent = 'Done';
|
| 2384 |
var src = _pendingVideoSrc;
|
|
|
|
| 261 |
enable_offload=False, model_size=model_size)
|
| 262 |
|
| 263 |
|
| 264 |
+
def mux_video_audio(silent_video: str, audio_path: str, output_path: str,
                    model: str = None) -> None:
    """Combine *silent_video* and *audio_path* into one file at *output_path*.

    HunyuanFoley (model="hunyuan") ships its own merge_audio_video helper that
    copes with that model's ffmpeg quirks, so we delegate to it; every other
    model gets a plain mux (video stream-copied, audio re-encoded to AAC).
    """
    if model == "hunyuan":
        # HunyuanFoley's bundled muxer knows its own ffmpeg peculiarities.
        _ensure_syspath("HunyuanVideo-Foley")
        from hunyuanvideo_foley.utils.media_utils import merge_audio_video
        merge_audio_video(audio_path, silent_video, output_path)
        return
    video_in = ffmpeg.input(silent_video)
    audio_in = ffmpeg.input(audio_path)
    stream = ffmpeg.output(
        video_in, audio_in, output_path,
        vcodec="copy", acodec="aac", strict="experimental",
    )
    stream.run(overwrite_output=True, quiet=True)
|
| 282 |
|
| 283 |
|
| 284 |
# ------------------------------------------------------------------ #
|
|
|
|
| 350 |
TARO_TRUNCATE_FRAME = int(TARO_FPS * TARO_TRUNCATE / TARO_SR) # 32
|
| 351 |
TARO_TRUNCATE_ONSET = 120
|
| 352 |
TARO_MODEL_DUR = TARO_TRUNCATE / TARO_SR # 8.192 s
|
| 353 |
+
TARO_SECS_PER_STEP = 0.025 # measured 0.023s/step on H200; was 0.05, tightened to halve GPU allocation
|
| 354 |
|
| 355 |
TARO_LOAD_OVERHEAD = 15 # seconds: model load + CAVP feature extraction
|
| 356 |
MMAUDIO_WINDOW = 8.0 # seconds β MMAudio's fixed generation window
|
|
|
|
| 369 |
"taro": {
|
| 370 |
"window_s": TARO_MODEL_DUR, # 8.192 s
|
| 371 |
"sr": TARO_SR, # 16000
|
| 372 |
+
"secs_per_step": TARO_SECS_PER_STEP, # 0.025
|
| 373 |
"load_overhead": TARO_LOAD_OVERHEAD, # 15
|
| 374 |
"tab_prefix": "taro",
|
| 375 |
"regen_fn": None, # set after function definitions (avoids forward-ref)
|
|
|
|
| 420 |
|
| 421 |
def _estimate_regen_duration(model_key: str, num_steps: int) -> int:
    """Generic GPU duration estimator for single-segment regen.

    Estimated seconds = num_steps * per-step cost + fixed load overhead,
    clamped to [20, GPU_DURATION_CAP].

    Floor is 20s — enough headroom above the 10s ZeroGPU abort threshold
    for any model on a warm worker. Cold-start spin-up happens *before*
    the timer starts, so raising the floor does not help with cold-start aborts.
    """
    cfg = MODEL_CONFIGS[model_key]
    steps = int(num_steps)  # hoisted: used in both the estimate and the log line
    secs = steps * cfg["secs_per_step"] + cfg["load_overhead"]
    result = min(GPU_DURATION_CAP, max(20, int(secs)))
    # NOTE: log glyphs restored to ×/→ — the previous string carried mojibake
    # (Γ/β) from UTF-8 bytes being decoded as ISO-8859-7.
    print(f"[duration] {cfg['label']} regen: 1 seg × {steps} steps → {secs:.0f}s → capped {result}s")
    return result
|
| 433 |
|
|
|
|
| 522 |
TARO_SR_OUT = TARGET_SR
|
| 523 |
|
| 524 |
|
| 525 |
+
def _resample_to_target(wav: np.ndarray, src_sr: int,
|
| 526 |
+
dst_sr: int = None) -> np.ndarray:
|
| 527 |
+
"""Resample *wav* (mono or stereo numpy float32) from *src_sr* to *dst_sr*.
|
| 528 |
|
| 529 |
+
*dst_sr* defaults to TARGET_SR (48 kHz). No-op if src_sr == dst_sr.
|
| 530 |
+
Uses torchaudio Kaiser-windowed sinc resampling β CPU-only, ZeroGPU-safe.
|
| 531 |
"""
|
| 532 |
+
if dst_sr is None:
|
| 533 |
+
dst_sr = TARGET_SR
|
| 534 |
+
if src_sr == dst_sr:
|
| 535 |
return wav
|
| 536 |
stereo = wav.ndim == 2
|
| 537 |
t = torch.from_numpy(np.ascontiguousarray(wav.astype(np.float32)))
|
| 538 |
if not stereo:
|
| 539 |
t = t.unsqueeze(0) # [1, T]
|
| 540 |
+
t = torchaudio.functional.resample(t, src_sr, dst_sr)
|
| 541 |
if not stereo:
|
| 542 |
t = t.squeeze(0) # [T]
|
| 543 |
return t.numpy()
|
|
|
|
| 608 |
return meta
|
| 609 |
|
| 610 |
|
| 611 |
+
def _post_process_samples(results: list, *, model: str, tmp_dir: str,
                          silent_video: str, segments: list,
                          crossfade_s: float, crossfade_db: float,
                          total_dur_s: float, sr: int,
                          extra_meta_fn=None) -> list:
    """Shared CPU post-processing for all three generate_* wrappers.

    Each entry in *results* is a tuple whose first element is the list of
    per-segment wav arrays; trailing elements are model-specific (TARO adds
    features, HunyuanFoley adds text_feats).

    *extra_meta_fn(sample_idx, result_tuple, tmp_dir) -> dict* optionally
    supplies model-specific keys to merge into seg_meta (e.g. cavp_path,
    onset_path, text_feats_path).

    Returns a list of (video_path, audio_path, seg_meta) tuples.
    """
    outputs = []
    for idx, res in enumerate(results):
        wavs = res[0]
        stem = f"{model}_{idx}"

        # Stitch segments into one track and write wav + muxed mp4.
        stitched = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, sr)
        audio_path = os.path.join(tmp_dir, stem + ".wav")
        _save_wav(audio_path, stitched, sr)
        video_path = os.path.join(tmp_dir, stem + ".mp4")
        mux_video_audio(silent_video, audio_path, video_path, model=model)
        wav_paths = _save_seg_wavs(wavs, tmp_dir, stem)

        # Model-specific metadata (feature-cache paths etc.) via callback.
        extras = {} if extra_meta_fn is None else extra_meta_fn(idx, res, tmp_dir)
        seg_meta = _build_seg_meta(
            segments=segments, wav_paths=wav_paths, audio_path=audio_path,
            video_path=video_path, silent_video=silent_video, sr=sr,
            model=model, crossfade_s=crossfade_s, crossfade_db=crossfade_db,
            total_dur_s=total_dur_s, **extras,
        )
        outputs.append((video_path, audio_path, seg_meta))
    return outputs
|
| 648 |
+
|
| 649 |
+
|
| 650 |
def _cpu_preprocess(video_file: str, model_dur: float,
|
| 651 |
crossfade_s: float) -> tuple:
|
| 652 |
"""Shared CPU pre-processing for all generate_* wrappers.
|
|
|
|
| 764 |
crossfade_s, crossfade_db, num_samples)
|
| 765 |
|
| 766 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 767 |
+
# Upsample 16kHz β 48kHz and normalise result tuples to (seg_wavs, ...)
|
| 768 |
cavp_path = os.path.join(tmp_dir, "taro_cavp.npy")
|
| 769 |
onset_path = os.path.join(tmp_dir, "taro_onset.npy")
|
| 770 |
+
_feats_saved = False
|
| 771 |
+
|
| 772 |
+
def _upsample_and_save_feats(result):
|
| 773 |
+
nonlocal _feats_saved
|
| 774 |
+
wavs, cavp_feats, onset_feats = result
|
| 775 |
wavs = [_upsample_taro(w) for w in wavs]
|
| 776 |
+
if not _feats_saved:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
np.save(cavp_path, cavp_feats)
|
| 778 |
if onset_feats is not None:
|
| 779 |
np.save(onset_path, onset_feats)
|
| 780 |
+
_feats_saved = True
|
| 781 |
+
return (wavs, cavp_feats, onset_feats)
|
| 782 |
+
|
| 783 |
+
results = [_upsample_and_save_feats(r) for r in results]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
|
| 785 |
+
def _taro_extras(sample_idx, result, td):
|
| 786 |
+
return {"cavp_path": cavp_path, "onset_path": onset_path}
|
| 787 |
+
|
| 788 |
+
outputs = _post_process_samples(
|
| 789 |
+
results, model="taro", tmp_dir=tmp_dir,
|
| 790 |
+
silent_video=silent_video, segments=segments,
|
| 791 |
+
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 792 |
+
total_dur_s=total_dur_s, sr=TARO_SR_OUT,
|
| 793 |
+
extra_meta_fn=_taro_extras,
|
| 794 |
+
)
|
| 795 |
return _pad_outputs(outputs)
|
| 796 |
|
| 797 |
|
|
|
|
| 849 |
|
| 850 |
seg_audios = []
|
| 851 |
_t_mma_start = time.perf_counter()
|
| 852 |
+
fm = FlowMatching(min_sigma=0, inference_mode="euler", num_steps=num_steps)
|
| 853 |
|
| 854 |
for seg_i, (seg_start, seg_end) in enumerate(segments):
|
| 855 |
seg_dur = seg_end - seg_start
|
| 856 |
seg_path = seg_clip_paths[seg_i]
|
| 857 |
|
|
|
|
| 858 |
video_info = load_video(seg_path, seg_dur)
|
| 859 |
clip_frames = video_info.clip_frames.unsqueeze(0)
|
| 860 |
sync_frames = video_info.sync_frames.unsqueeze(0)
|
|
|
|
| 923 |
cfg_strength, num_steps, crossfade_s, crossfade_db, num_samples)
|
| 924 |
|
| 925 |
# ββ CPU post-processing ββ
|
| 926 |
+
# Resample 44100 β 48000 and normalise tuples to (seg_wavs, ...)
|
| 927 |
+
resampled = []
|
| 928 |
+
for seg_audios, sr in results:
|
| 929 |
if sr != TARGET_SR:
|
| 930 |
print(f"[MMAudio upsample] resampling {sr}Hz β {TARGET_SR}Hz (sinc, CPU) β¦")
|
| 931 |
seg_audios = [_resample_to_target(w, sr) for w in seg_audios]
|
| 932 |
print(f"[MMAudio upsample] done β {len(seg_audios)} seg(s) @ {TARGET_SR}Hz")
|
| 933 |
+
resampled.append((seg_audios,))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
|
| 935 |
+
outputs = _post_process_samples(
|
| 936 |
+
resampled, model="mmaudio", tmp_dir=tmp_dir,
|
| 937 |
+
silent_video=silent_video, segments=segments,
|
| 938 |
+
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 939 |
+
total_dur_s=total_dur_s, sr=TARGET_SR,
|
| 940 |
+
)
|
| 941 |
return _pad_outputs(outputs)
|
| 942 |
|
| 943 |
|
|
|
|
| 1081 |
crossfade_s, crossfade_db, num_samples)
|
| 1082 |
|
| 1083 |
# ββ CPU post-processing (no GPU needed) ββ
|
| 1084 |
+
def _hunyuan_extras(sample_idx, result, td):
|
| 1085 |
+
_, _sr, text_feats = result
|
| 1086 |
+
path = os.path.join(td, f"hunyuan_{sample_idx}_text_feats.pt")
|
| 1087 |
+
torch.save(text_feats, path)
|
| 1088 |
+
return {"text_feats_path": path}
|
| 1089 |
+
|
| 1090 |
+
outputs = _post_process_samples(
|
| 1091 |
+
results, model="hunyuan", tmp_dir=tmp_dir,
|
| 1092 |
+
silent_video=silent_video, segments=segments,
|
| 1093 |
+
crossfade_s=crossfade_s, crossfade_db=crossfade_db,
|
| 1094 |
+
total_dur_s=total_dur_s, sr=48000,
|
| 1095 |
+
extra_meta_fn=_hunyuan_extras,
|
| 1096 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1097 |
return _pad_outputs(outputs)
|
| 1098 |
|
| 1099 |
|
|
|
|
| 1107 |
# 4. Returns (new_video_path, new_audio_path, updated_seg_meta, new_waveform_html)
|
| 1108 |
# ================================================================== #
|
| 1109 |
|
| 1110 |
+
def _preload_taro_regen_ctx(meta: dict) -> dict:
|
| 1111 |
+
"""Pre-load TARO CAVP/onset features on CPU for regen.
|
| 1112 |
+
Returns a dict suitable for _regen_taro_gpu._cpu_ctx."""
|
| 1113 |
+
cavp_path = meta.get("cavp_path", "")
|
| 1114 |
+
onset_path = meta.get("onset_path", "")
|
| 1115 |
+
ctx = {}
|
| 1116 |
+
if cavp_path and os.path.exists(cavp_path) and onset_path and os.path.exists(onset_path):
|
| 1117 |
+
ctx["cavp"] = np.load(cavp_path)
|
| 1118 |
+
ctx["onset"] = np.load(onset_path)
|
| 1119 |
+
return ctx
|
| 1120 |
+
|
| 1121 |
+
|
| 1122 |
+
def _preload_hunyuan_regen_ctx(meta: dict, seg_path: str) -> dict:
|
| 1123 |
+
"""Pre-load HunyuanFoley text features + segment path on CPU for regen.
|
| 1124 |
+
Returns a dict suitable for _regen_hunyuan_gpu._cpu_ctx."""
|
| 1125 |
+
ctx = {"seg_path": seg_path}
|
| 1126 |
+
text_feats_path = meta.get("text_feats_path", "")
|
| 1127 |
+
if text_feats_path and os.path.exists(text_feats_path):
|
| 1128 |
+
ctx["text_feats"] = torch.load(text_feats_path, map_location="cpu", weights_only=False)
|
| 1129 |
+
return ctx
|
| 1130 |
+
|
| 1131 |
+
|
| 1132 |
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
| 1133 |
"""Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
|
| 1134 |
Returns (video_path, audio_path, updated_meta, waveform_html).
|
|
|
|
| 1159 |
_vid_base = os.path.splitext(os.path.basename(meta["video_path"]))[0]
|
| 1160 |
_vid_base_clean = _vid_base.rsplit("_regen_", 1)[0]
|
| 1161 |
video_path = os.path.join(tmp_dir, f"{_vid_base_clean}_regen_{_ts}.mp4")
|
| 1162 |
+
mux_video_audio(silent_video, audio_path, video_path, model=model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1163 |
|
| 1164 |
# Save updated segment wavs to .npy files
|
| 1165 |
updated_wav_paths = _save_seg_wavs(wavs, tmp_dir, os.path.splitext(_base_clean)[0])
|
|
|
|
| 1211 |
_ensure_syspath("TARO")
|
| 1212 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 1213 |
|
| 1214 |
+
# Use pre-loaded features from CPU wrapper (avoids np.load inside GPU window)
|
| 1215 |
+
ctx = _regen_taro_gpu._cpu_ctx
|
| 1216 |
+
if "cavp" in ctx and "onset" in ctx:
|
| 1217 |
+
print("[TARO regen] Using pre-loaded CAVP + onset features (CPU cache hit)")
|
| 1218 |
+
cavp_feats = ctx["cavp"]
|
| 1219 |
+
onset_feats = ctx["onset"]
|
| 1220 |
else:
|
| 1221 |
print("[TARO regen] Cache miss β re-extracting CAVP + onset features")
|
| 1222 |
from TARO.onset_util import extract_onset
|
|
|
|
| 1249 |
meta = json.loads(seg_meta_json)
|
| 1250 |
seg_idx = int(seg_idx)
|
| 1251 |
|
| 1252 |
+
# CPU: pre-load cached features so np.load doesn't happen inside GPU window
|
| 1253 |
+
_regen_taro_gpu._cpu_ctx = _preload_taro_regen_ctx(meta)
|
| 1254 |
+
|
| 1255 |
# GPU: inference only
|
| 1256 |
new_wav = _regen_taro_gpu(video_file, seg_idx, seg_meta_json,
|
| 1257 |
seed_val, cfg_scale, num_steps, mode,
|
|
|
|
| 1291 |
net, feature_utils, model_cfg, seq_cfg = _load_mmaudio_models(device, dtype)
|
| 1292 |
sr = seq_cfg.sampling_rate
|
| 1293 |
|
| 1294 |
+
# Use pre-extracted segment clip from the CPU wrapper
|
| 1295 |
seg_path = _regen_mmaudio_gpu._cpu_ctx.get("seg_path")
|
| 1296 |
+
assert seg_path, "[MMAudio regen] seg_path not set β wrapper must pre-extract segment clip"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1297 |
|
| 1298 |
rng = torch.Generator(device=device)
|
| 1299 |
rng.manual_seed(random.randint(0, 2**32 - 1))
|
|
|
|
| 1387 |
|
| 1388 |
# Use pre-extracted segment clip from wrapper
|
| 1389 |
seg_path = _regen_hunyuan_gpu._cpu_ctx.get("seg_path")
|
| 1390 |
+
assert seg_path, "[HunyuanFoley regen] seg_path not set β wrapper must pre-extract segment clip"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1391 |
|
| 1392 |
+
# Use pre-loaded text_feats from CPU wrapper (avoids torch.load inside GPU window)
|
| 1393 |
+
ctx = _regen_hunyuan_gpu._cpu_ctx
|
| 1394 |
+
if "text_feats" in ctx:
|
| 1395 |
+
print("[HunyuanFoley regen] Using pre-loaded text features (CPU cache hit)")
|
| 1396 |
from hunyuanvideo_foley.utils.feature_utils import encode_video_features
|
| 1397 |
visual_feats, seg_audio_len = encode_video_features(seg_path, model_dict)
|
| 1398 |
+
text_feats = ctx["text_feats"].to(device)
|
| 1399 |
else:
|
| 1400 |
print("[HunyuanFoley regen] Cache miss β extracting text + visual features")
|
| 1401 |
visual_feats, text_feats, seg_audio_len = feature_process(
|
|
|
|
| 1426 |
seg_start, seg_end = meta["segments"][seg_idx]
|
| 1427 |
seg_dur = seg_end - seg_start
|
| 1428 |
|
| 1429 |
+
# CPU: pre-extract segment clip + pre-load cached text features
|
| 1430 |
tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
|
| 1431 |
seg_path = _extract_segment_clip(
|
| 1432 |
meta["silent_video"], seg_start, seg_dur,
|
| 1433 |
os.path.join(tmp_dir, "regen_seg.mp4"),
|
| 1434 |
)
|
| 1435 |
+
_regen_hunyuan_gpu._cpu_ctx = _preload_hunyuan_regen_ctx(meta, seg_path)
|
| 1436 |
|
| 1437 |
# GPU: inference only
|
| 1438 |
new_wav, sr = _regen_hunyuan_gpu(video_file, seg_idx, seg_meta_json,
|
|
|
|
| 1468 |
|
| 1469 |
def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
|
| 1470 |
slot_wav_ref: np.ndarray = None) -> np.ndarray:
|
| 1471 |
+
"""Resample *wav* from src_sr to dst_sr, then match channel layout to
|
| 1472 |
+
*slot_wav_ref* (the first existing segment in the slot).
|
| 1473 |
|
| 1474 |
TARO is mono (T,), MMAudio/Hunyuan are stereo (C, T). Mixing them
|
| 1475 |
without normalisation causes a shape mismatch in _cf_join. Rules:
|
| 1476 |
+
- stereo β mono : average channels
|
| 1477 |
+
- mono β stereo: duplicate the single channel
|
| 1478 |
"""
|
| 1479 |
+
wav = _resample_to_target(wav, src_sr, dst_sr)
|
| 1480 |
+
|
| 1481 |
+
# Match channel layout to the slot's existing segments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1482 |
if slot_wav_ref is not None:
|
| 1483 |
slot_stereo = slot_wav_ref.ndim == 2
|
| 1484 |
wav_stereo = wav.ndim == 2
|
|
|
|
| 1514 |
pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
|
| 1515 |
yield gr.update(), gr.update(value=pending_html)
|
| 1516 |
|
| 1517 |
+
# CPU: pre-load cached features so np.load doesn't happen inside GPU window
|
| 1518 |
+
_regen_taro_gpu._cpu_ctx = _preload_taro_regen_ctx(meta)
|
| 1519 |
+
|
| 1520 |
new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
|
| 1521 |
seed_val, cfg_scale, num_steps, mode,
|
| 1522 |
crossfade_s, crossfade_db, slot_id)
|
|
|
|
| 1571 |
meta["silent_video"], seg_start, seg_end - seg_start,
|
| 1572 |
os.path.join(tempfile.mkdtemp(), "xregen_seg.mp4"),
|
| 1573 |
)
|
| 1574 |
+
_regen_hunyuan_gpu._cpu_ctx = _preload_hunyuan_regen_ctx(meta, seg_path)
|
| 1575 |
|
| 1576 |
new_wav_raw, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
|
| 1577 |
prompt, negative_prompt, seed_val,
|
|
|
|
| 2407 |
var lbl = document.getElementById('wf_seglabel_' + slot_id);
|
| 2408 |
if (hadError) {
|
| 2409 |
var toastMsg = typeof errMsg === 'string' ? errMsg : JSON.stringify(errMsg);
|
| 2410 |
+
// Restore previous waveform HTML and video src
|
| 2411 |
if (preRegenWaveHtml !== null) {
|
| 2412 |
var waveEl2 = document.getElementById('slot_wave_' + slot_id);
|
| 2413 |
if (waveEl2) waveEl2.innerHTML = preRegenWaveHtml;
|
|
|
|
| 2416 |
var vidElR = document.getElementById('slot_vid_' + slot_id);
|
| 2417 |
if (vidElR) { var vR = vidElR.querySelector('video'); if (vR) { vR.setAttribute('src', preRegenVideoSrc); vR.src = preRegenVideoSrc; vR.load(); } }
|
| 2418 |
}
|
| 2419 |
+
// Flash the waveform iframe border red so it's obvious the segment didn't change
|
| 2420 |
+
var iframeEl = document.getElementById('wf_iframe_' + slot_id);
|
| 2421 |
+
if (!iframeEl) {
|
| 2422 |
+
// waveform may have been restored into preRegenWaveHtml β find via slot_wave wrapper
|
| 2423 |
+
var waveWrap = document.getElementById('slot_wave_' + slot_id);
|
| 2424 |
+
if (waveWrap) iframeEl = waveWrap.querySelector('iframe[id^="wf_iframe_"]');
|
| 2425 |
+
}
|
| 2426 |
+
if (iframeEl) {
|
| 2427 |
+
iframeEl.style.transition = 'box-shadow 0.15s';
|
| 2428 |
+
iframeEl.style.boxShadow = '0 0 0 2px #e05252';
|
| 2429 |
+
setTimeout(function() { iframeEl.style.boxShadow = 'none'; }, 3000);
|
| 2430 |
+
}
|
| 2431 |
+
// Pick a human-readable message based on the error text
|
| 2432 |
+
var isAbort = toastMsg.toLowerCase().indexOf('aborted') !== -1;
|
| 2433 |
+
var isTimeout = toastMsg.toLowerCase().indexOf('timeout') !== -1;
|
| 2434 |
+
var userMsg = isAbort || isTimeout
|
| 2435 |
+
? '\u26a0\ufe0f GPU cold-start β segment unchanged, try again'
|
| 2436 |
+
: '\u26a0\ufe0f Regen failed β segment unchanged';
|
| 2437 |
var statusBar = document.getElementById('wf_statusbar_' + slot_id);
|
| 2438 |
if (statusBar) {
|
| 2439 |
statusBar.style.color = '#e05252';
|
| 2440 |
+
statusBar.textContent = userMsg;
|
| 2441 |
+
setTimeout(function() { statusBar.style.color = '#888'; statusBar.textContent = 'Click a segment to regenerate \u00a0|\u00a0 Playhead syncs to video'; }, 8000);
|
| 2442 |
+
}
|
| 2443 |
+
if (lbl) {
|
| 2444 |
+
lbl.style.color = '#e05252';
|
| 2445 |
+
lbl.textContent = isAbort || isTimeout ? 'Cold-start abort β segment unchanged, try again' : 'Regen failed β segment unchanged';
|
| 2446 |
+
setTimeout(function() { lbl.style.color = '#aaa'; lbl.textContent = ''; }, 8000);
|
| 2447 |
}
|
|
|
|
| 2448 |
} else {
|
| 2449 |
if (lbl) lbl.textContent = 'Done';
|
| 2450 |
var src = _pendingVideoSrc;
|