Spaces:

WeReCooking
/

ACE-Step-CPU

Running

Nekochu committed 9 days ago

Commit

17d39ba

1 Parent(s): 6b3e616

fix: use librosa instead of torchaudio for VAD (torchcodec is not installed); replace the deprecated `audios` processor argument with `audio`

Files changed (1) hide show

caption_fast.py CHANGED Viewed

@@ -118,7 +118,7 @@ def tag_audio(audio_path: str, top_n: int = 10) -> List[str]:
     inputs = processor(
         text=TAGS,
-        audios=[audio],
         sampling_rate=48000,
         return_tensors="pt",
         padding=True,
@@ -137,14 +137,11 @@ def detect_speech(audio_path: str, threshold: float = 5.0) -> bool:
     Returns True if speech detected for more than `threshold` seconds.
     """
     import torch
-    import torchaudio
     vad = _load_vad()
-    wav, sr = torchaudio.load(audio_path)
-    if wav.shape[0] > 1:
-        wav = wav.mean(dim=0, keepdim=True)
-    if sr != 16000:
-        wav = torchaudio.functional.resample(wav, sr, 16000)
     speech_timestamps = []
     window_size = 512

     inputs = processor(
         text=TAGS,
+        audio=[audio],
         sampling_rate=48000,
         return_tensors="pt",
         padding=True,
     Returns True if speech detected for more than `threshold` seconds.
     """
     import torch
+    import librosa
     vad = _load_vad()
+    y, sr = librosa.load(audio_path, sr=16000, mono=True)
+    wav = torch.from_numpy(y).unsqueeze(0)
     speech_timestamps = []
     window_size = 512