Spaces:
Running on Zero
Preserve MMAudio stereo; stereo-wins channel normalization
Browse files
- _normalize_channel_layout(): new helper, called in _splice_and_save, that
upcasts all mono (T,) segments to stereo (2,T) when any segment in the
slot is stereo. Previously the rule was mono-wins: a TARO regen on an
MMAudio slot would average MMAudio's stereo segments down to mono.
- _resample_to_slot_sr(): stereo-wins instead of slot-wins. If the new
wav is stereo and the slot ref is mono, keep the new wav stereo and let
_normalize_channel_layout upcast the existing segments. Defensively
squeezes (1,T) to (T,) before any channel decision.
- _post_process_samples / initial generation: seg_wavs are saved as-is
per model (TARO mono, MMAudio stereo, HunyuanFoley mono after squeeze).
Cross-model regens now use stereo-wins so MMAudio stereo is never lost.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
|
@@ -1385,12 +1385,30 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
|
|
| 1385 |
# ================================================================== #
|
| 1386 |
|
| 1387 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1388 |
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
| 1389 |
"""Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
|
| 1390 |
Returns (video_path, audio_path, updated_meta, waveform_html).
|
| 1391 |
"""
|
| 1392 |
wavs = _load_seg_wavs(meta["wav_paths"])
|
| 1393 |
wavs[seg_idx]= new_wav
|
|
|
|
| 1394 |
crossfade_s = float(meta["crossfade_s"])
|
| 1395 |
crossfade_db = float(meta["crossfade_db"])
|
| 1396 |
sr = int(meta["sr"])
|
|
@@ -1702,21 +1720,27 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
|
|
| 1702 |
"""Resample *wav* from src_sr to dst_sr, then match channel layout to
|
| 1703 |
*slot_wav_ref* (the first existing segment in the slot).
|
| 1704 |
|
| 1705 |
-
|
| 1706 |
-
|
| 1707 |
-
|
| 1708 |
-
|
| 1709 |
"""
|
| 1710 |
wav = _resample_to_target(wav, src_sr, dst_sr)
|
| 1711 |
|
| 1712 |
-
#
|
|
|
|
|
|
|
|
|
|
| 1713 |
if slot_wav_ref is not None:
|
| 1714 |
-
|
| 1715 |
-
|
|
|
|
|
|
|
| 1716 |
if slot_stereo and not wav_stereo:
|
| 1717 |
wav = np.stack([wav, wav], axis=0) # mono → stereo (C, T)
|
| 1718 |
-
elif
|
| 1719 |
-
|
|
|
|
| 1720 |
return wav
|
| 1721 |
|
| 1722 |
|
|
|
|
| 1385 |
# ================================================================== #
|
| 1386 |
|
| 1387 |
|
| 1388 |
+
def _normalize_channel_layout(wavs: list[np.ndarray]) -> list[np.ndarray]:
    """Return the segments in *wavs* with one shared channel layout.

    Rule: stereo wins.  If ANY segment is stereo ``(2, T)``, every mono
    ``(T,)`` segment is duplicated into both channels to give ``(2, T)``.
    This preserves MMAudio's genuine stereo output even when the slot also
    contains TARO or HunyuanFoley mono segments.

    ``(1, T)`` arrays should already be squeezed by ``_load_seg_wavs``, but
    they are handled defensively here too.

    Args:
        wavs: Audio segments, each shaped ``(T,)``, ``(1, T)`` or ``(2, T)``.

    Returns:
        A new list of the same length.  When no segment is stereo, the
        segments are returned with only the ``(1, T)`` → ``(T,)`` squeeze
        applied; otherwise every 1-D segment is stacked to ``(2, T)`` and
        all other segments are passed through unchanged.
    """
    # Squeeze any residual (1, T) to (T,) first so it is treated as mono
    # by the checks below.
    squeezed = [
        w.squeeze(0) if (w.ndim == 2 and w.shape[0] == 1) else w
        for w in wavs
    ]

    has_stereo = any(w.ndim == 2 and w.shape[0] == 2 for w in squeezed)
    if not has_stereo:
        # Nothing to upcast: an all-mono (or empty) slot stays mono.
        return squeezed

    # Duplicate each mono segment into left/right channels: (T,) → (2, T).
    return [np.stack([w, w], axis=0) if w.ndim == 1 else w for w in squeezed]
|
| 1403 |
+
|
| 1404 |
+
|
| 1405 |
def _splice_and_save(new_wav, seg_idx, meta, slot_id):
|
| 1406 |
"""Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
|
| 1407 |
Returns (video_path, audio_path, updated_meta, waveform_html).
|
| 1408 |
"""
|
| 1409 |
wavs = _load_seg_wavs(meta["wav_paths"])
|
| 1410 |
wavs[seg_idx]= new_wav
|
| 1411 |
+
wavs = _normalize_channel_layout(wavs)
|
| 1412 |
crossfade_s = float(meta["crossfade_s"])
|
| 1413 |
crossfade_db = float(meta["crossfade_db"])
|
| 1414 |
sr = int(meta["sr"])
|
|
|
|
| 1720 |
"""Resample *wav* from src_sr to dst_sr, then match channel layout to
|
| 1721 |
*slot_wav_ref* (the first existing segment in the slot).
|
| 1722 |
|
| 1723 |
+
Stereo wins: if either the new wav or the slot reference is stereo,
|
| 1724 |
+
the mono side is duplicated to (2, T). This preserves MMAudio's
|
| 1725 |
+
genuine stereo rather than averaging it down to mono.
|
| 1726 |
+
(1, T) pseudo-stereo from HunyuanFoley's DAC is squeezed to mono first.
|
| 1727 |
"""
|
| 1728 |
wav = _resample_to_target(wav, src_sr, dst_sr)
|
| 1729 |
|
| 1730 |
+
# Squeeze (1, T) → (T,) before channel decision
|
| 1731 |
+
if wav.ndim == 2 and wav.shape[0] == 1:
|
| 1732 |
+
wav = wav.squeeze(0)
|
| 1733 |
+
|
| 1734 |
if slot_wav_ref is not None:
|
| 1735 |
+
# Squeeze slot ref too, defensively
|
| 1736 |
+
ref = slot_wav_ref.squeeze(0) if (slot_wav_ref.ndim == 2 and slot_wav_ref.shape[0] == 1) else slot_wav_ref
|
| 1737 |
+
slot_stereo = ref.ndim == 2 and ref.shape[0] == 2
|
| 1738 |
+
wav_stereo = wav.ndim == 2 and wav.shape[0] == 2
|
| 1739 |
if slot_stereo and not wav_stereo:
|
| 1740 |
wav = np.stack([wav, wav], axis=0) # mono → stereo (C, T)
|
| 1741 |
+
elif wav_stereo and not slot_stereo:
|
| 1742 |
+
pass # keep new wav stereo; _normalize_channel_layout in
|
| 1743 |
+
# _splice_and_save will upcast the existing mono segs
|
| 1744 |
return wav
|
| 1745 |
|
| 1746 |
|