BoxOfColors committed on
Commit
47fd0ad
·
1 Parent(s): 166fb8e

xregen: use target model's optimal window centered on segment midpoint

Browse files

Each xregen_* now:
1. Computes optimal clip window centered on segment midpoint (clamped to video)
2. Runs _build_segments on that clip with target model's window size
3. Calls full generation GPU pipeline (same path as initial generation)
4. Stitches sub-segments with _stitch_wavs + contact-edge trimming
5. Returns (wav, sr, clip_start_s) so _xregen_splice can align to original grid

Handles all cases: target window > span (single inference), target window <
span (multiple sub-segments), video shorter than target window (clamped).

Files changed (1) hide show
  1. app.py +141 -26
app.py CHANGED
@@ -1617,13 +1617,64 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
1617
  return wav
1618
 
1619
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1620
  def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
1621
- meta: dict, seg_idx: int, slot_id: str) -> tuple:
 
1622
  """Shared epilogue for all xregen_* functions: resample → splice → save.
1623
- Returns (video_path, waveform_html)."""
 
 
 
 
 
 
 
1624
  slot_sr = int(meta["sr"])
1625
  slot_wavs = _load_seg_wavs(meta["wav_paths"])
1626
  new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1627
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1628
  new_wav, seg_idx, meta, slot_id
1629
  )
@@ -1635,8 +1686,8 @@ def _xregen_dispatch(state_json: str, seg_idx: int, slot_id: str, infer_fn):
1635
 
1636
  Yields pending HTML immediately, then calls *infer_fn()* — a zero-argument
1637
  callable that runs model-specific CPU prep + GPU inference and returns
1638
- (wav_array, src_sr). For TARO, *infer_fn* should return the wav already
1639
- upsampled to 48 kHz; pass TARO_SR_OUT as src_sr.
1640
 
1641
  Yields:
1642
  First: (gr.update(), gr.update(value=pending_html)) — shown while GPU runs
@@ -1646,8 +1697,8 @@ def _xregen_dispatch(state_json: str, seg_idx: int, slot_id: str, infer_fn):
1646
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1647
  yield gr.update(), gr.update(value=pending_html)
1648
 
1649
- new_wav_raw, src_sr = infer_fn()
1650
- video_path, waveform_html = _xregen_splice(new_wav_raw, src_sr, meta, seg_idx, slot_id)
1651
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1652
 
1653
 
@@ -1655,16 +1706,35 @@ def xregen_taro(seg_idx, state_json, slot_id,
1655
  seed_val, cfg_scale, num_steps, mode,
1656
  crossfade_s, crossfade_db,
1657
  request: gr.Request = None):
1658
- """Cross-model regen: run TARO inference and splice into *slot_id*."""
1659
  seg_idx = int(seg_idx)
1660
  meta = json.loads(state_json)
1661
 
1662
  def _run():
1663
- # CAVP/onset features are loaded from disk paths inside the GPU fn
1664
- wav = _regen_taro_gpu(None, seg_idx, state_json,
1665
- seed_val, cfg_scale, num_steps, mode,
1666
- crossfade_s, crossfade_db, slot_id)
1667
- return _upsample_taro(wav), TARO_SR_OUT # 16 kHz → 48 kHz (CPU)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1668
 
1669
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
1670
 
@@ -1673,16 +1743,37 @@ def xregen_mmaudio(seg_idx, state_json, slot_id,
1673
  prompt, negative_prompt, seed_val,
1674
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1675
  request: gr.Request = None):
1676
- """Cross-model regen: run MMAudio inference and splice into *slot_id*."""
1677
  seg_idx = int(seg_idx)
 
1678
 
1679
  def _run():
1680
- # Segment clip extraction happens inside _regen_mmaudio_gpu
1681
- wav, src_sr = _regen_mmaudio_gpu(None, seg_idx, state_json,
1682
- prompt, negative_prompt, seed_val,
1683
- cfg_strength, num_steps,
1684
- crossfade_s, crossfade_db, slot_id)
1685
- return wav, src_sr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1686
 
1687
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
1688
 
@@ -1692,16 +1783,40 @@ def xregen_hunyuan(seg_idx, state_json, slot_id,
1692
  guidance_scale, num_steps, model_size,
1693
  crossfade_s, crossfade_db,
1694
  request: gr.Request = None):
1695
- """Cross-model regen: run HunyuanFoley inference and splice into *slot_id*."""
1696
  seg_idx = int(seg_idx)
 
1697
 
1698
  def _run():
1699
- # Segment clip extraction happens inside _regen_hunyuan_gpu
1700
- wav, src_sr = _regen_hunyuan_gpu(None, seg_idx, state_json,
1701
- prompt, negative_prompt, seed_val,
1702
- guidance_scale, num_steps, model_size,
1703
- crossfade_s, crossfade_db, slot_id)
1704
- return wav, src_sr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1705
 
1706
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
1707
 
 
1617
  return wav
1618
 
1619
 
1620
+ def _xregen_clip_window(meta: dict, seg_idx: int, target_window_s: float) -> tuple:
1621
+ """Compute the video clip window for a cross-model regen.
1622
+
1623
+ Centers *target_window_s* on the original segment's midpoint, clamped to
1624
+ [0, total_dur_s]. Returns (clip_start, clip_end, clip_dur).
1625
+
1626
+ If the video is shorter than *target_window_s*, the full video is used
1627
+ (suboptimal but never breaks). If the segment span exceeds
1628
+ *target_window_s*, the returned window is still only *target_window_s*
1629
+ wide, centered on the segment midpoint; callers that must cover the
1630
+ full span should run _build_segments on it and generate sub-segments.
1631
+ """
1632
+ total_dur_s = float(meta["total_dur_s"])
1633
+ seg_start, seg_end = meta["segments"][seg_idx]
1634
+ seg_mid = (seg_start + seg_end) / 2.0
1635
+ half_win = target_window_s / 2.0
1636
+
1637
+ clip_start = max(0.0, seg_mid - half_win)
1638
+ clip_end = min(total_dur_s, seg_mid + half_win)
1639
+ # If clamped at one end, extend the other to preserve full window if possible
1640
+ if clip_start == 0.0:
1641
+ clip_end = min(total_dur_s, target_window_s)
1642
+ elif clip_end == total_dur_s:
1643
+ clip_start = max(0.0, total_dur_s - target_window_s)
1644
+ clip_dur = clip_end - clip_start
1645
+ return clip_start, clip_end, clip_dur
1646
+
1647
+
1648
  def _xregen_splice(new_wav_raw: np.ndarray, src_sr: int,
1649
+ meta: dict, seg_idx: int, slot_id: str,
1650
+ clip_start_s: float = None) -> tuple:
1651
  """Shared epilogue for all xregen_* functions: resample → splice → save.
1652
+ Returns (video_path, waveform_html).
1653
+
1654
+ *clip_start_s* is the absolute video time where new_wav_raw starts.
1655
+ When the clip was centered on the segment midpoint (not at seg_start),
1656
+ we need to shift the wav so _stitch_wavs can trim it correctly relative
1657
+ to the original segment's start. We do this by prepending silence so
1658
+ the wav's time origin aligns with the original segment's start.
1659
+ """
1660
  slot_sr = int(meta["sr"])
1661
  slot_wavs = _load_seg_wavs(meta["wav_paths"])
1662
  new_wav = _resample_to_slot_sr(new_wav_raw, src_sr, slot_sr, slot_wavs[0])
1663
+
1664
+ # If the clip started before the original segment start, prepend silence
1665
+ # so that sample index 0 of new_wav corresponds to seg_start in video time.
1666
+ if clip_start_s is not None:
1667
+ seg_start = meta["segments"][seg_idx][0]
1668
+ offset_s = seg_start - clip_start_s # positive = seg starts after clip start
1669
+ if offset_s < 0:
1670
+ # clip started after seg_start — prepend silence to align
1671
+ pad_samples = int(round(abs(offset_s) * slot_sr))
1672
+ silence = np.zeros(
1673
+ (new_wav.shape[0], pad_samples) if new_wav.ndim == 2 else pad_samples,
1674
+ dtype=new_wav.dtype,
1675
+ )
1676
+ new_wav = np.concatenate([silence, new_wav], axis=1 if new_wav.ndim == 2 else 0)
1677
+
1678
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1679
  new_wav, seg_idx, meta, slot_id
1680
  )
 
1686
 
1687
  Yields pending HTML immediately, then calls *infer_fn()* — a zero-argument
1688
  callable that runs model-specific CPU prep + GPU inference and returns
1689
+ (wav_array, src_sr, clip_start_s). For TARO, *infer_fn* should return
1690
+ the wav already upsampled to 48 kHz; pass TARO_SR_OUT as src_sr.
1691
 
1692
  Yields:
1693
  First: (gr.update(), gr.update(value=pending_html)) — shown while GPU runs
 
1697
  pending_html = _build_regen_pending_html(meta["segments"], seg_idx, slot_id, "")
1698
  yield gr.update(), gr.update(value=pending_html)
1699
 
1700
+ new_wav_raw, src_sr, clip_start_s = infer_fn()
1701
+ video_path, waveform_html = _xregen_splice(new_wav_raw, src_sr, meta, seg_idx, slot_id, clip_start_s)
1702
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1703
 
1704
 
 
1706
  seed_val, cfg_scale, num_steps, mode,
1707
  crossfade_s, crossfade_db,
1708
  request: gr.Request = None):
1709
+ """Cross-model regen: run TARO on its optimal window, splice into *slot_id*."""
1710
  seg_idx = int(seg_idx)
1711
  meta = json.loads(state_json)
1712
 
1713
  def _run():
1714
+ clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, TARO_MODEL_DUR)
1715
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1716
+ clip_path = _extract_segment_clip(
1717
+ meta["silent_video"], clip_start, clip_dur,
1718
+ os.path.join(tmp_dir, "xregen_taro_clip.mp4"),
1719
+ )
1720
+ # Build a minimal fake-video meta so generate_taro can run on clip_path
1721
+ sub_segs = _build_segments(clip_dur, TARO_MODEL_DUR, float(crossfade_s))
1722
+ sub_meta_json = json.dumps({
1723
+ "segments": sub_segs, "silent_video": clip_path,
1724
+ "total_dur_s": clip_dur,
1725
+ })
1726
+ # Run full TARO generation pipeline on the clip
1727
+ _ctx_store("taro_gpu_infer", {
1728
+ "tmp_dir": tmp_dir, "silent_video": clip_path,
1729
+ "segments": sub_segs, "total_dur_s": clip_dur,
1730
+ })
1731
+ results = _taro_gpu_infer(clip_path, seed_val, cfg_scale, num_steps, mode,
1732
+ crossfade_s, crossfade_db, 1)
1733
+ wavs, _, _ = results[0]
1734
+ wavs = [_upsample_taro(w) for w in wavs]
1735
+ wav = _stitch_wavs(wavs, float(crossfade_s), float(crossfade_db),
1736
+ clip_dur, TARO_SR_OUT, sub_segs)
1737
+ return wav, TARO_SR_OUT, clip_start
1738
 
1739
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
1740
 
 
1743
  prompt, negative_prompt, seed_val,
1744
  cfg_strength, num_steps, crossfade_s, crossfade_db,
1745
  request: gr.Request = None):
1746
+ """Cross-model regen: run MMAudio on its optimal window, splice into *slot_id*."""
1747
  seg_idx = int(seg_idx)
1748
+ meta = json.loads(state_json)
1749
 
1750
  def _run():
1751
+ clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, MMAUDIO_WINDOW)
1752
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1753
+ clip_path = _extract_segment_clip(
1754
+ meta["silent_video"], clip_start, clip_dur,
1755
+ os.path.join(tmp_dir, "xregen_mmaudio_clip.mp4"),
1756
+ )
1757
+ sub_segs = _build_segments(clip_dur, MMAUDIO_WINDOW, float(crossfade_s))
1758
+ seg_clip_paths = [
1759
+ _extract_segment_clip(
1760
+ clip_path, s, e - s,
1761
+ os.path.join(tmp_dir, f"xregen_mma_sub_{i}.mp4"),
1762
+ )
1763
+ for i, (s, e) in enumerate(sub_segs)
1764
+ ]
1765
+ _ctx_store("mmaudio_gpu_infer", {
1766
+ "segments": sub_segs, "seg_clip_paths": seg_clip_paths,
1767
+ })
1768
+ results = _mmaudio_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1769
+ cfg_strength, num_steps, crossfade_s, crossfade_db, 1)
1770
+ seg_wavs, sr = results[0]
1771
+ wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1772
+ clip_dur, sr, sub_segs)
1773
+ if sr != TARGET_SR:
1774
+ wav = _resample_to_target(wav, sr)
1775
+ sr = TARGET_SR
1776
+ return wav, sr, clip_start
1777
 
1778
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
1779
 
 
1783
  guidance_scale, num_steps, model_size,
1784
  crossfade_s, crossfade_db,
1785
  request: gr.Request = None):
1786
+ """Cross-model regen: run HunyuanFoley on its optimal window, splice into *slot_id*."""
1787
  seg_idx = int(seg_idx)
1788
+ meta = json.loads(state_json)
1789
 
1790
  def _run():
1791
+ clip_start, clip_end, clip_dur = _xregen_clip_window(meta, seg_idx, HUNYUAN_MAX_DUR)
1792
+ tmp_dir = _register_tmp_dir(tempfile.mkdtemp())
1793
+ clip_path = _extract_segment_clip(
1794
+ meta["silent_video"], clip_start, clip_dur,
1795
+ os.path.join(tmp_dir, "xregen_hunyuan_clip.mp4"),
1796
+ )
1797
+ sub_segs = _build_segments(clip_dur, HUNYUAN_MAX_DUR, float(crossfade_s))
1798
+ seg_clip_paths = [
1799
+ _extract_segment_clip(
1800
+ clip_path, s, e - s,
1801
+ os.path.join(tmp_dir, f"xregen_hny_sub_{i}.mp4"),
1802
+ )
1803
+ for i, (s, e) in enumerate(sub_segs)
1804
+ ]
1805
+ dummy_seg_path = _extract_segment_clip(
1806
+ clip_path, 0, min(clip_dur, HUNYUAN_MAX_DUR),
1807
+ os.path.join(tmp_dir, "xregen_hny_dummy.mp4"),
1808
+ )
1809
+ _ctx_store("hunyuan_gpu_infer", {
1810
+ "segments": sub_segs, "total_dur_s": clip_dur,
1811
+ "dummy_seg_path": dummy_seg_path, "seg_clip_paths": seg_clip_paths,
1812
+ })
1813
+ results = _hunyuan_gpu_infer(clip_path, prompt, negative_prompt, seed_val,
1814
+ guidance_scale, num_steps, model_size,
1815
+ crossfade_s, crossfade_db, 1)
1816
+ seg_wavs, sr, _ = results[0]
1817
+ wav = _stitch_wavs(seg_wavs, float(crossfade_s), float(crossfade_db),
1818
+ clip_dur, sr, sub_segs)
1819
+ return wav, sr, clip_start
1820
 
1821
  yield from _xregen_dispatch(state_json, seg_idx, slot_id, _run)
1822