BoxOfColors Claude Sonnet 4.6 committed on
Commit
6ef8b2e
·
1 Parent(s): 359f37a

Preserve MMAudio stereo; stereo-wins channel normalization

Browse files

- _normalize_channel_layout(): new helper called in _splice_and_save that
upcasts all mono (T,) segments to stereo (2,T) when any segment in the
slot is stereo. Previously mono-wins: a TARO regen on an MMAudio slot
would average MMAudio's stereo segments down to mono.

- _resample_to_slot_sr(): stereo-wins instead of slot-wins. If the new
wav is stereo and the slot ref is mono, keep the new wav stereo and let
_normalize_channel_layout upcast the existing segments. Defensively
squeezes (1,T) to (T,) before any channel decision.

- _post_process_samples / initial generation: seg_wavs are saved as-is
per model (TARO mono, MMAudio stereo, HunyuanFoley mono after squeeze).
Cross-model regens now use stereo-wins so MMAudio stereo is never lost.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +33 -9
app.py CHANGED
@@ -1385,12 +1385,30 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
1385
  # ================================================================== #
1386
 
1387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1388
  def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1389
  """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
1390
  Returns (video_path, audio_path, updated_meta, waveform_html).
1391
  """
1392
  wavs = _load_seg_wavs(meta["wav_paths"])
1393
  wavs[seg_idx]= new_wav
 
1394
  crossfade_s = float(meta["crossfade_s"])
1395
  crossfade_db = float(meta["crossfade_db"])
1396
  sr = int(meta["sr"])
@@ -1702,21 +1720,27 @@ def _resample_to_slot_sr(wav: np.ndarray, src_sr: int, dst_sr: int,
1702
  """Resample *wav* from src_sr to dst_sr, then match channel layout to
1703
  *slot_wav_ref* (the first existing segment in the slot).
1704
 
1705
- TARO is mono (T,), MMAudio/Hunyuan are stereo (C, T). Mixing them
1706
- without normalisation causes a shape mismatch in _cf_join. Rules:
1707
- - stereo → mono : average channels
1708
- - mono → stereo: duplicate the single channel
1709
  """
1710
  wav = _resample_to_target(wav, src_sr, dst_sr)
1711
 
1712
- # Match channel layout to the slot's existing segments
 
 
 
1713
  if slot_wav_ref is not None:
1714
- slot_stereo = slot_wav_ref.ndim == 2
1715
- wav_stereo = wav.ndim == 2
 
 
1716
  if slot_stereo and not wav_stereo:
1717
  wav = np.stack([wav, wav], axis=0) # mono → stereo (C, T)
1718
- elif not slot_stereo and wav_stereo:
1719
- wav = wav.mean(axis=0) # stereo → mono (T,)
 
1720
  return wav
1721
 
1722
 
 
1385
  # ================================================================== #
1386
 
1387
 
1388
def _normalize_channel_layout(wavs: list[np.ndarray]) -> list[np.ndarray]:
    """Return the segments of *wavs* with one consistent channel layout.

    Stereo wins: as soon as one segment has shape (2, T), every 1-D (T,)
    segment is duplicated into (2, T); segments that are already 2-D are
    passed through untouched. When no (2, T) segment is present, nothing
    is upcast.

    A leading singleton channel axis, i.e. shape (1, T), is always dropped
    first so that such segments are treated as mono.

    The input list is not modified; a new list is returned.
    """

    def _is_stereo(seg: np.ndarray) -> bool:
        return seg.ndim == 2 and seg.shape[0] == 2

    # Collapse any (1, T) segment to plain mono (T,) before deciding.
    squeezed = []
    for seg in wavs:
        if seg.ndim == 2 and seg.shape[0] == 1:
            seg = seg.squeeze(0)
        squeezed.append(seg)

    # No stereo segment anywhere: keep everything as it is.
    if not any(map(_is_stereo, squeezed)):
        return squeezed

    # At least one stereo segment: duplicate every mono segment to (2, T).
    upcast = []
    for seg in squeezed:
        if seg.ndim == 1:
            seg = np.stack([seg, seg], axis=0)
        upcast.append(seg)
    return upcast
1403
+
1404
+
1405
  def _splice_and_save(new_wav, seg_idx, meta, slot_id):
1406
  """Replace wavs[seg_idx] with new_wav, re-stitch, re-save, re-mux.
1407
  Returns (video_path, audio_path, updated_meta, waveform_html).
1408
  """
1409
  wavs = _load_seg_wavs(meta["wav_paths"])
1410
  wavs[seg_idx]= new_wav
1411
+ wavs = _normalize_channel_layout(wavs)
1412
  crossfade_s = float(meta["crossfade_s"])
1413
  crossfade_db = float(meta["crossfade_db"])
1414
  sr = int(meta["sr"])
 
1720
  """Resample *wav* from src_sr to dst_sr, then match channel layout to
1721
  *slot_wav_ref* (the first existing segment in the slot).
1722
 
1723
+ Stereo wins: if either the new wav or the slot reference is stereo,
1724
+ the mono side is duplicated to (2, T). This preserves MMAudio's
1725
+ genuine stereo rather than averaging it down to mono.
1726
+ (1, T) pseudo-stereo from HunyuanFoley's DAC is squeezed to mono first.
1727
  """
1728
  wav = _resample_to_target(wav, src_sr, dst_sr)
1729
 
1730
+ # Squeeze (1, T) → (T,) before channel decision
1731
+ if wav.ndim == 2 and wav.shape[0] == 1:
1732
+ wav = wav.squeeze(0)
1733
+
1734
  if slot_wav_ref is not None:
1735
+ # Squeeze slot ref too, defensively
1736
+ ref = slot_wav_ref.squeeze(0) if (slot_wav_ref.ndim == 2 and slot_wav_ref.shape[0] == 1) else slot_wav_ref
1737
+ slot_stereo = ref.ndim == 2 and ref.shape[0] == 2
1738
+ wav_stereo = wav.ndim == 2 and wav.shape[0] == 2
1739
  if slot_stereo and not wav_stereo:
1740
  wav = np.stack([wav, wav], axis=0) # mono → stereo (C, T)
1741
+ elif wav_stereo and not slot_stereo:
1742
+ pass # keep new wav stereo; _normalize_channel_layout in
1743
+ # _splice_and_save will upcast the existing mono segs
1744
  return wav
1745
 
1746