Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

BoxOfColors committed 5 days ago

Commit

585a112

1 Parent(s): 72afd74

Unify mono and stereo crossfade into a single _cf_join function

Replace separate _crossfade_join (mono/TARO) and _cf_join_stereo (stereo/
MMAudio+HunyuanFoley) with one _cf_join that handles both shapes via a.ndim
check: stereo (C,T) uses axis=1 slicing, mono (T,) uses 1D indexing.
Update _stitch_wavs to accept an sr parameter and call _cf_join, and update
its call site in generate_taro to pass TARO_SR explicitly.

Files changed (1) hide show

app.py +24 -31

app.py CHANGED Viewed

@@ -127,19 +127,26 @@ def _build_segments(total_dur_s: float, window_s: float, crossfade_s: float) ->
     return segments
-def _cf_join_stereo(a: np.ndarray, b: np.ndarray,
-                    crossfade_s: float, db_boost: float, sr: int) -> np.ndarray:
-    """Equal-power crossfade join for stereo (C, T) numpy arrays."""
-    cf = int(round(crossfade_s * sr))
-    cf = min(cf, a.shape[1], b.shape[1])
     if cf <= 0:
-        return np.concatenate([a, b], axis=1)
-    gain = 10 ** (db_boost / 20.0)
-    t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
     fade_out = np.cos(t * np.pi / 2)   # 1 → 0
     fade_in  = np.sin(t * np.pi / 2)   # 0 → 1
-    overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
-    return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
 # ================================================================== #
@@ -237,26 +244,12 @@ def _taro_infer_segment(
     return wav[:seg_samples]
-def _crossfade_join(wav_a: np.ndarray, wav_b: np.ndarray,
-                    crossfade_s: float, db_boost: float) -> np.ndarray:
-    cf = int(round(crossfade_s * TARO_SR))
-    cf = min(cf, len(wav_a), len(wav_b))
-    if cf <= 0:
-        return np.concatenate([wav_a, wav_b])
-    gain = 10 ** (db_boost / 20.0)
-    # Equal-power fade: fade-out a, fade-in b over the overlap region
-    t = np.linspace(0.0, 1.0, cf, dtype=np.float32)
-    fade_out = np.cos(t * np.pi / 2)   # 1 → 0
-    fade_in  = np.sin(t * np.pi / 2)   # 0 → 1
-    overlap  = wav_a[-cf:] * fade_out * gain + wav_b[:cf] * fade_in * gain
-    return np.concatenate([wav_a[:-cf], overlap, wav_b[cf:]])
-def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float, total_dur_s: float) -> np.ndarray:
     out = wavs[0]
     for nw in wavs[1:]:
-        out = _crossfade_join(out, nw, crossfade_s, db_boost)
-    return out[:int(round(total_dur_s * TARO_SR))]
 @spaces.GPU(duration=600)
@@ -360,7 +353,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
                 wavs.append(wav)
             _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
-        final_wav  = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s)
         audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
         sf.write(audio_path, final_wav, TARO_SR)
         video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
@@ -495,7 +488,7 @@ def generate_mmaudio(video_file, prompt, negative_prompt, seed_val,
         # Crossfade-stitch all segments using shared equal-power helper
         full_wav = seg_audios[0]
         for nw in seg_audios[1:]:
-            full_wav = _cf_join_stereo(full_wav, nw, MMA_CF_S, MMA_CF_DB, sr)
         full_wav = full_wav[:, : int(round(total_dur_s * sr))]
         audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
@@ -631,7 +624,7 @@ def generate_hunyuan(video_file, prompt, negative_prompt, seed_val,
         # Crossfade-stitch all segments using shared equal-power helper
         full_wav = seg_wavs[0]
         for nw in seg_wavs[1:]:
-            full_wav = _cf_join_stereo(full_wav, nw, CF_S, CF_DB, sr)
         # Trim to exact video duration
         full_wav = full_wav[:, : int(round(total_dur_s * sr))]

     return segments
+def _cf_join(a: np.ndarray, b: np.ndarray,
+             crossfade_s: float, db_boost: float, sr: int) -> np.ndarray:
+    """Equal-power crossfade join.  Works for both mono (T,) and stereo (C, T) arrays.
+    Stereo arrays are expected in (channels, samples) layout."""
+    stereo = a.ndim == 2
+    n_a = a.shape[1] if stereo else len(a)
+    n_b = b.shape[1] if stereo else len(b)
+    cf  = min(int(round(crossfade_s * sr)), n_a, n_b)
     if cf <= 0:
+        return np.concatenate([a, b], axis=1 if stereo else 0)
+    gain     = 10 ** (db_boost / 20.0)
+    t        = np.linspace(0.0, 1.0, cf, dtype=np.float32)
     fade_out = np.cos(t * np.pi / 2)   # 1 → 0
     fade_in  = np.sin(t * np.pi / 2)   # 0 → 1
+    if stereo:
+        overlap = a[:, -cf:] * fade_out * gain + b[:, :cf] * fade_in * gain
+        return np.concatenate([a[:, :-cf], overlap, b[:, cf:]], axis=1)
+    else:
+        overlap = a[-cf:] * fade_out * gain + b[:cf] * fade_in * gain
+        return np.concatenate([a[:-cf], overlap, b[cf:]])
 # ================================================================== #
     return wav[:seg_samples]
+def _stitch_wavs(wavs: list, crossfade_s: float, db_boost: float,
+                 total_dur_s: float, sr: int) -> np.ndarray:
     out = wavs[0]
     for nw in wavs[1:]:
+        out = _cf_join(out, nw, crossfade_s, db_boost, sr)
+    return out[:int(round(total_dur_s * sr))]
 @spaces.GPU(duration=600)
                 wavs.append(wav)
             _TARO_INFERENCE_CACHE[cache_key] = {"wavs": wavs}
+        final_wav  = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
         audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
         sf.write(audio_path, final_wav, TARO_SR)
         video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
         # Crossfade-stitch all segments using shared equal-power helper
         full_wav = seg_audios[0]
         for nw in seg_audios[1:]:
+            full_wav = _cf_join(full_wav, nw, MMA_CF_S, MMA_CF_DB, sr)
         full_wav = full_wav[:, : int(round(total_dur_s * sr))]
         audio_path = os.path.join(tmp_dir, f"mmaudio_{sample_idx}.flac")
         # Crossfade-stitch all segments using shared equal-power helper
         full_wav = seg_wavs[0]
         for nw in seg_wavs[1:]:
+            full_wav = _cf_join(full_wav, nw, CF_S, CF_DB, sr)
         # Trim to exact video duration
         full_wav = full_wav[:, : int(round(total_dur_s * sr))]