Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Sleeping

BoxOfColors Claude Sonnet 4.6 committed on 23 days ago

Commit

894b188

1 Parent(s): 8b87263

Remove channel-layout safety nets; assert stereo (2,T) everywhere

All seg wavs are now guaranteed (2,T) by _save_seg_wavs/_to_stereo.
Remove the silent fallbacks that were masking shape bugs:

- _load_seg_wavs: assert (2,T) instead of silently squeezing (1,T)
- _splice_and_save: assert new_wav is (2,T) on entry; remove
_normalize_channel_layout call (function deleted)
- _resample_to_slot_sr: always call _to_stereo(); drop slot_wav_ref
channel-matching logic and defensive squeezes

Any shape violation now raises AssertionError immediately with the
offending shape, instead of silently producing wrong-shaped output.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +11 -48

app.py CHANGED Viewed

@@ -339,16 +339,14 @@ def _save_seg_wavs(wavs: list[np.ndarray], tmp_dir: str, prefix: str) -> list[st
 def _load_seg_wavs(paths: list[str]) -> list[np.ndarray]:
     """Load segment wav arrays from .npy file paths.
-    Normalises (1, T) arrays → (T,) mono so that single-channel output from
-    models like HunyuanFoley (DAC decoder emits shape (1, T)) never causes a
-    shape mismatch in _cf_join when mixed with true stereo (2, T) arrays.
     """
     wavs = []
     for p in paths:
         w = np.load(p)
-        if w.ndim == 2 and w.shape[0] == 1:
-            w = w.squeeze(0)   # (1, T) → (T,)  mono
         wavs.append(w)
     return wavs
@@ -1395,30 +1393,15 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
 # ================================================================== #
-def _normalize_channel_layout(wavs: list[np.ndarray]) -> list[np.ndarray]:
-    """Ensure all wavs in *wavs* share the same channel layout.
-    Rule: stereo wins.  If ANY segment is stereo (2, T), all mono (T,)
-    segments are duplicated to (2, T).  This preserves MMAudio's genuine
-    stereo output even when the slot also contains TARO or HunyuanFoley
-    mono segments.  (1, T) arrays should already be squeezed by
-    _load_seg_wavs, but we handle them defensively here too.)
-    """
-    # Squeeze any residual (1, T) to (T,) first
-    wavs = [w.squeeze(0) if (w.ndim == 2 and w.shape[0] == 1) else w for w in wavs]
-    has_stereo = any(w.ndim == 2 and w.shape[0] == 2 for w in wavs)
-    if not has_stereo:
-        return wavs
-    return [np.stack([w, w], axis=0) if w.ndim == 1 else w for w in wavs]
 def _splice_and_save(new_wav, seg_idx, meta, slot_id):
     """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
     Returns (video_path, audio_path, updated_meta, waveform_html).
     """
     wavs         = _load_seg_wavs(meta["wav_paths"])
     wavs[seg_idx]= new_wav
-    wavs         = _normalize_channel_layout(wavs)
     crossfade_s  = float(meta["crossfade_s"])
     crossfade_db = float(meta["crossfade_db"])
     sr           = int(meta["sr"])
@@ -1727,31 +1710,11 @@ MODEL_CONFIGS["hunyuan"]["regen_fn"] = regen_hunyuan_segment
 def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
                          slot_wav_ref: np.ndarray = None) -> np.ndarray:
-    """Resample *wav* from src_sr to dst_sr, then match channel layout to
-    *slot_wav_ref* (the first existing segment in the slot).
-    Stereo wins: if either the new wav or the slot reference is stereo,
-    the mono side is duplicated to (2, T).  This preserves MMAudio's
-    genuine stereo rather than averaging it down to mono.
-    (1, T) pseudo-stereo from HunyuanFoley's DAC is squeezed to mono first.
-    """
     wav = _resample_to_target(wav, src_sr, dst_sr)
-    # Squeeze (1, T) → (T,) before channel decision
-    if wav.ndim == 2 and wav.shape[0] == 1:
-        wav = wav.squeeze(0)
-    if slot_wav_ref is not None:
-        # Squeeze slot ref too, defensively
-        ref = slot_wav_ref.squeeze(0) if (slot_wav_ref.ndim == 2 and slot_wav_ref.shape[0] == 1) else slot_wav_ref
-        slot_stereo = ref.ndim == 2 and ref.shape[0] == 2
-        wav_stereo  = wav.ndim == 2 and wav.shape[0] == 2
-        if slot_stereo and not wav_stereo:
-            wav = np.stack([wav, wav], axis=0)   # mono → stereo (C, T)
-        elif wav_stereo and not slot_stereo:
-            pass  # keep new wav stereo; _normalize_channel_layout in
-                  # _splice_and_save will upcast the existing mono segs
-    return wav
 def _resolve_silent_video(meta: dict) -> str:

 def _load_seg_wavs(paths: list[str]) -> list[np.ndarray]:
     """Load segment wav arrays from .npy file paths.
+    All files on disk are expected to be stereo (2, T) — _save_seg_wavs
+    guarantees this.  Raises AssertionError if any array has unexpected shape.
     """
     wavs = []
     for p in paths:
         w = np.load(p)
+        assert w.ndim == 2 and w.shape[0] == 2, \
+            f"Expected stereo (2, T) in {p}, got shape {w.shape}"
         wavs.append(w)
     return wavs
 # ================================================================== #
 def _splice_and_save(new_wav, seg_idx, meta, slot_id):
     """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
     Returns (video_path, audio_path, updated_meta, waveform_html).
+    All wavs (loaded from disk and new_wav) must be stereo (2, T).
     """
+    assert new_wav.ndim == 2 and new_wav.shape[0] == 2, \
+        f"new_wav must be stereo (2, T), got shape {new_wav.shape}"
     wavs         = _load_seg_wavs(meta["wav_paths"])
     wavs[seg_idx]= new_wav
     crossfade_s  = float(meta["crossfade_s"])
     crossfade_db = float(meta["crossfade_db"])
     sr           = int(meta["sr"])
 def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
                          slot_wav_ref: np.ndarray = None) -> np.ndarray:
     """Return *wav* resampled from src_sr to dst_sr, converted to stereo.

     Resampling is delegated to _resample_to_target and the result is then
     passed through _to_stereo (presumably yielding a (2, T) array).
     slot_wav_ref is accepted only so existing call sites keep working; it
     is never read here.
     """
     resampled = _resample_to_target(wav, src_sr, dst_sr)
     stereo = _to_stereo(resampled)
     return stereo
 def _resolve_silent_video(meta: dict) -> str: