Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors Claude Sonnet 4.6 committed on 23 days ago

Commit

6ef8b2e

1 Parent(s): 359f37a

Preserve MMAudio stereo; stereo-wins channel normalization

- _normalize_channel_layout(): new helper called in _splice_and_save that
upcasts all mono (T,) segments to stereo (2,T) when any segment in the
slot is stereo. Previously mono-wins: a TARO regen on an MMAudio slot
would average MMAudio's stereo segments down to mono.

- _resample_to_slot_sr(): stereo-wins instead of slot-wins. If the new
wav is stereo and the slot ref is mono, keep the new wav stereo and let
_normalize_channel_layout upcast the existing segments. Defensively
squeezes (1,T) to (T,) before any channel decision.

- _post_process_samples / initial generation: seg_wavs are saved as-is
per model (TARO mono, MMAudio stereo, HunyuanFoley mono after squeeze).
Cross-model regens now use stereo-wins so MMAudio stereo is never lost.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

app.py +33 -9

app.py CHANGED Viewed

@@ -1385,12 +1385,30 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
 # ================================================================== #
 def _splice_and_save(new_wav, seg_idx, meta, slot_id):
     """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
     Returns (video_path, audio_path, updated_meta, waveform_html).
     """
     wavs         = _load_seg_wavs(meta["wav_paths"])
     wavs[seg_idx]= new_wav
     crossfade_s  = float(meta["crossfade_s"])
     crossfade_db = float(meta["crossfade_db"])
     sr           = int(meta["sr"])
@@ -1702,21 +1720,27 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
     """Resample *wav* from src_sr to dst_sr, then match channel layout to
     *slot_wav_ref* (the first existing segment in the slot).
-    TARO is mono (T,), MMAudio/Hunyuan are stereo (C, T).  Mixing them
-    without normalisation causes a shape mismatch in _cf_join.  Rules:
-      - stereo → mono : average channels
-      - mono   → stereo: duplicate the single channel
     """
     wav = _resample_to_target(wav, src_sr, dst_sr)
-    # Match channel layout to the slot's existing segments
     if slot_wav_ref is not None:
-        slot_stereo = slot_wav_ref.ndim == 2
-        wav_stereo  = wav.ndim == 2
         if slot_stereo and not wav_stereo:
             wav = np.stack([wav, wav], axis=0)   # mono → stereo (C, T)
-        elif not slot_stereo and wav_stereo:
-            wav = wav.mean(axis=0)               # stereo → mono  (T,)
     return wav

 # ================================================================== #
+def _normalize_channel_layout(wavs: list[np.ndarray]) -> list[np.ndarray]:
+    """Ensure all wavs in *wavs* share the same channel layout.
+    Rule: stereo wins.  If ANY segment is stereo (2, T), all mono (T,)
+    segments are duplicated to (2, T).  This preserves MMAudio's genuine
+    stereo output even when the slot also contains TARO or HunyuanFoley
+    mono segments.  (1, T) arrays should already be squeezed by
+    _load_seg_wavs, but we handle them defensively here too.
+
+    Returns a new list; the input list itself is not modified.  If no
+    segment is stereo, the (squeezed) segments are returned without any
+    upcasting.  Arrays that are neither 1-D nor (1, T) pass through as-is.
+    """
+    # Squeeze any residual (1, T) to (T,) first so it is treated as mono below
+    wavs = [w.squeeze(0) if (w.ndim == 2 and w.shape[0] == 1) else w for w in wavs]
+    has_stereo = any(w.ndim == 2 and w.shape[0] == 2 for w in wavs)
+    if not has_stereo:
+        return wavs
+    return [np.stack([w, w], axis=0) if w.ndim == 1 else w for w in wavs]
 def _splice_and_save(new_wav, seg_idx, meta, slot_id):
     """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
     Returns (video_path, audio_path, updated_meta, waveform_html).
     """
     wavs         = _load_seg_wavs(meta["wav_paths"])
     wavs[seg_idx]= new_wav
+    wavs         = _normalize_channel_layout(wavs)
     crossfade_s  = float(meta["crossfade_s"])
     crossfade_db = float(meta["crossfade_db"])
     sr           = int(meta["sr"])
     """Resample *wav* from src_sr to dst_sr, then match channel layout to
     *slot_wav_ref* (the first existing segment in the slot).
+    Stereo wins: if either the new wav or the slot reference is stereo,
+    the mono side is duplicated to (2, T).  This preserves MMAudio's
+    genuine stereo rather than averaging it down to mono.
+    (1, T) pseudo-stereo from HunyuanFoley's DAC is squeezed to mono first.
     """
     wav = _resample_to_target(wav, src_sr, dst_sr)
+    # Squeeze (1, T) → (T,) before channel decision
+    if wav.ndim == 2 and wav.shape[0] == 1:
+        wav = wav.squeeze(0)
     if slot_wav_ref is not None:
+        # Squeeze slot ref too, defensively
+        ref = slot_wav_ref.squeeze(0) if (slot_wav_ref.ndim == 2 and slot_wav_ref.shape[0] == 1) else slot_wav_ref
+        slot_stereo = ref.ndim == 2 and ref.shape[0] == 2
+        wav_stereo  = wav.ndim == 2 and wav.shape[0] == 2
         if slot_stereo and not wav_stereo:
             wav = np.stack([wav, wav], axis=0)   # mono → stereo (C, T)
+        elif wav_stereo and not slot_stereo:
+            pass  # keep new wav stereo; _normalize_channel_layout in
+                  # _splice_and_save will upcast the existing mono segs
     return wav