BoxOfColors committed on
Commit
efe424b
·
1 Parent(s): 4d46101

feat: add FlashSR post-processing to upsample TARO 16kHz → 48kHz

Browse files

All three models now output at 48kHz (TARO via FlashSR, MMAudio at
44.1kHz natively resampled, HunyuanFoley at 48kHz natively).
FlashSR is applied after generation and after each regen/xregen on
TARO outputs. Console logs confirm each upsampling step with duration
and sample rate. Falls back to sinc resampling if FlashSR errors.

Files changed (2) hide show
  1. app.py +90 -5
  2. requirements.txt +1 -0
app.py CHANGED
@@ -498,6 +498,73 @@ def _taro_infer_segment(
498
  return wav[:seg_samples]
499
 
500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
502
  total_dur_s: float, sr: int) -> np.ndarray:
503
  """Crossfade-join a list of wav arrays and trim to *total_dur_s*.
@@ -672,8 +739,15 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
672
  outputs = []
673
  for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
674
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
 
 
 
 
 
 
 
675
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
676
- _save_wav(audio_path, final_wav, TARO_SR)
677
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
678
  mux_video_audio(silent_video, audio_path, video_path)
679
  wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
@@ -685,7 +759,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
685
  first_cavp_saved = True
686
  seg_meta = _build_seg_meta(
687
  segments=segments, wav_paths=wav_paths, audio_path=audio_path,
688
- video_path=video_path, silent_video=silent_video, sr=TARO_SR,
689
  model="taro", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
690
  total_dur_s=total_dur_s, cavp_path=cavp_path, onset_path=onset_path,
691
  )
@@ -1135,9 +1209,16 @@ def regen_taro_segment(video_file, seg_idx, seg_meta_json,
1135
  seed_val, cfg_scale, num_steps, mode,
1136
  crossfade_s, crossfade_db, slot_id)
1137
 
1138
- # CPU: splice, stitch, mux, save
 
 
 
 
 
 
 
1139
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1140
- new_wav, seg_idx, meta, slot_id
1141
  )
1142
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
1143
 
@@ -1405,7 +1486,11 @@ def xregen_taro(seg_idx, state_json, slot_id,
1405
  new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
1406
  seed_val, cfg_scale, num_steps, mode,
1407
  crossfade_s, crossfade_db, slot_id)
1408
- video_path, waveform_html = _xregen_splice(new_wav_raw, TARO_SR, meta, seg_idx, slot_id)
 
 
 
 
1409
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1410
 
1411
 
 
498
  return wav[:seg_samples]
499
 
500
 
501
+ # ================================================================== #
502
+ # FlashSR (16 → 48 kHz) #
503
+ # ================================================================== #
504
+ # FlashSR is used as a post-processing step on TARO outputs only.
505
+ # TARO generates at 16 kHz; FlashSR upsamples to 48 kHz so all three
506
+ # models produce output at the same sample rate.
507
+ # Model weights are downloaded once from HF Hub and cached on disk.
508
+
509
+ _FLASHSR_MODEL = None # module-level cache — loaded once per process
510
+ _FLASHSR_LOCK = threading.Lock()
511
+
512
+ FLASHSR_SR_IN = 16000
513
+ FLASHSR_SR_OUT = 48000
514
+
515
+
516
+ def _load_flashsr():
517
+ """Load FlashSR model (cached after first call). Returns FASR instance."""
518
+ global _FLASHSR_MODEL
519
+ with _FLASHSR_LOCK:
520
+ if _FLASHSR_MODEL is not None:
521
+ return _FLASHSR_MODEL
522
+ print("[FlashSR] Loading model weights from HF Hub …")
523
+ from huggingface_hub import hf_hub_download
524
+ from FastAudioSR import FASR
525
+ ckpt_path = hf_hub_download(
526
+ repo_id="YatharthS/FlashSR",
527
+ filename="upsampler.pth",
528
+ local_dir=os.path.join(os.path.dirname(os.path.abspath(__file__)), ".flashsr_cache"),
529
+ )
530
+ model = FASR(ckpt_path)
531
+ if torch.cuda.is_available():
532
+ model.model.half().cuda()
533
+ print("[FlashSR] Model loaded on GPU (fp16)")
534
+ else:
535
+ print("[FlashSR] Model loaded on CPU (fp32)")
536
+ _FLASHSR_MODEL = model
537
+ return model
538
+
539
+
540
+ def _apply_flashsr(wav_16k: np.ndarray) -> np.ndarray:
541
+ """Upsample a mono 16 kHz numpy array to 48 kHz using FlashSR.
542
+
543
+ Returns a mono float32 numpy array at 48 kHz.
544
+ Falls back to torchaudio sinc resampling if FlashSR fails.
545
+ """
546
+ try:
547
+ model = _load_flashsr()
548
+ t = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0)
549
+ if torch.cuda.is_available():
550
+ t = t.half().cuda()
551
+ print(f"[FlashSR] Upsampling {len(wav_16k)/FLASHSR_SR_IN:.2f}s @ 16kHz → 48kHz …")
552
+ with torch.no_grad():
553
+ out = model.run(t)
554
+ # out is a tensor or numpy array — normalise to numpy float32 cpu
555
+ if isinstance(out, torch.Tensor):
556
+ out = out.float().cpu().squeeze().numpy()
557
+ else:
558
+ out = np.array(out, dtype=np.float32).squeeze()
559
+ print(f"[FlashSR] Done — output shape {out.shape}, sr={FLASHSR_SR_OUT}")
560
+ return out
561
+ except Exception as e:
562
+ print(f"[FlashSR] ERROR: {e} — falling back to sinc resampling")
563
+ t = torch.from_numpy(wav_16k.astype(np.float32)).unsqueeze(0)
564
+ out = torchaudio.functional.resample(t, FLASHSR_SR_IN, FLASHSR_SR_OUT)
565
+ return out.squeeze().numpy()
566
+
567
+
568
  def _stitch_wavs(wavs: list[np.ndarray], crossfade_s: float, db_boost: float,
569
  total_dur_s: float, sr: int) -> np.ndarray:
570
  """Crossfade-join a list of wav arrays and trim to *total_dur_s*.
 
739
  outputs = []
740
  for sample_idx, (wavs, cavp_feats, onset_feats) in enumerate(results):
741
  final_wav = _stitch_wavs(wavs, crossfade_s, crossfade_db, total_dur_s, TARO_SR)
742
+
743
+ # ── FlashSR: upsample 16 kHz → 48 kHz ──
744
+ print(f"[TARO] Sample {sample_idx+1}: running FlashSR upsampler (16kHz → 48kHz) …")
745
+ final_wav = _apply_flashsr(final_wav)
746
+ out_sr = FLASHSR_SR_OUT
747
+ print(f"[TARO] Sample {sample_idx+1}: FlashSR complete — {len(final_wav)/out_sr:.2f}s @ {out_sr}Hz")
748
+
749
  audio_path = os.path.join(tmp_dir, f"taro_{sample_idx}.wav")
750
+ _save_wav(audio_path, final_wav, out_sr)
751
  video_path = os.path.join(tmp_dir, f"taro_{sample_idx}.mp4")
752
  mux_video_audio(silent_video, audio_path, video_path)
753
  wav_paths = _save_seg_wavs(wavs, tmp_dir, f"taro_{sample_idx}")
 
759
  first_cavp_saved = True
760
  seg_meta = _build_seg_meta(
761
  segments=segments, wav_paths=wav_paths, audio_path=audio_path,
762
+ video_path=video_path, silent_video=silent_video, sr=out_sr,
763
  model="taro", crossfade_s=crossfade_s, crossfade_db=crossfade_db,
764
  total_dur_s=total_dur_s, cavp_path=cavp_path, onset_path=onset_path,
765
  )
 
1209
  seed_val, cfg_scale, num_steps, mode,
1210
  crossfade_s, crossfade_db, slot_id)
1211
 
1212
+ # FlashSR: upsample 16 kHz → 48 kHz before splicing
1213
+ print(f"[TARO regen] Running FlashSR upsampler (16kHz → 48kHz) on seg {seg_idx} …")
1214
+ new_wav = _apply_flashsr(new_wav)
1215
+ print(f"[TARO regen] FlashSR complete — {len(new_wav)/FLASHSR_SR_OUT:.2f}s @ {FLASHSR_SR_OUT}Hz")
1216
+
1217
+ # CPU: splice, stitch, mux, save — meta["sr"] must reflect the upsampled rate
1218
+ meta_48k = dict(meta)
1219
+ meta_48k["sr"] = FLASHSR_SR_OUT
1220
  video_path, audio_path, updated_meta, waveform_html = _splice_and_save(
1221
+ new_wav, seg_idx, meta_48k, slot_id
1222
  )
1223
  return video_path, audio_path, json.dumps(updated_meta), waveform_html
1224
 
 
1486
  new_wav_raw = _regen_taro_gpu(None, seg_idx, state_json,
1487
  seed_val, cfg_scale, num_steps, mode,
1488
  crossfade_s, crossfade_db, slot_id)
1489
+ # FlashSR: upsample 16 kHz → 48 kHz before splicing into slot
1490
+ print(f"[xregen TARO] Running FlashSR upsampler (16kHz → 48kHz) on seg {seg_idx} …")
1491
+ new_wav_raw = _apply_flashsr(new_wav_raw)
1492
+ print(f"[xregen TARO] FlashSR complete — {len(new_wav_raw)/FLASHSR_SR_OUT:.2f}s @ {FLASHSR_SR_OUT}Hz")
1493
+ video_path, waveform_html = _xregen_splice(new_wav_raw, FLASHSR_SR_OUT, meta, seg_idx, slot_id)
1494
  yield gr.update(value=video_path), gr.update(value=waveform_html)
1495
 
1496
 
requirements.txt CHANGED
@@ -21,6 +21,7 @@ loguru
21
  torchdiffeq
22
  open_clip_torch
23
  git+https://github.com/descriptinc/audiotools
 
24
  --extra-index-url https://download.pytorch.org/whl/cu124
25
  torchaudio==2.5.1+cu124
26
  --find-links https://download.openmmlab.com/mmcv/dist/cu121/torch2.4.0/index.html
 
21
  torchdiffeq
22
  open_clip_torch
23
  git+https://github.com/descriptinc/audiotools
24
+ git+https://github.com/ysharma3501/FlashSR.git
25
  --extra-index-url https://download.pytorch.org/whl/cu124
26
  torchaudio==2.5.1+cu124
27
  --find-links https://download.openmmlab.com/mmcv/dist/cu121/torch2.4.0/index.html