Spaces:
Running
Running
fix: use librosa instead of torchaudio for VAD (torchcodec not installed); replace the deprecated `audios` processor argument with `audio`
Browse files
- caption_fast.py +4 -7
caption_fast.py
CHANGED
|
@@ -118,7 +118,7 @@ def tag_audio(audio_path: str, top_n: int = 10) -> List[str]:
|
|
| 118 |
|
| 119 |
inputs = processor(
|
| 120 |
text=TAGS,
|
| 121 |
-
audios=[audio],
|
| 122 |
sampling_rate=48000,
|
| 123 |
return_tensors="pt",
|
| 124 |
padding=True,
|
|
@@ -137,14 +137,11 @@ def detect_speech(audio_path: str, threshold: float = 5.0) -> bool:
|
|
| 137 |
Returns True if speech detected for more than `threshold` seconds.
|
| 138 |
"""
|
| 139 |
import torch
|
| 140 |
-
import torchaudio
|
| 141 |
|
| 142 |
vad = _load_vad()
|
| 143 |
-
wav, sr = torchaudio.load(audio_path)
|
| 144 |
-
|
| 145 |
-
wav = wav.mean(dim=0, keepdim=True)
|
| 146 |
-
if sr != 16000:
|
| 147 |
-
wav = torchaudio.functional.resample(wav, sr, 16000)
|
| 148 |
|
| 149 |
speech_timestamps = []
|
| 150 |
window_size = 512
|
|
|
|
| 118 |
|
| 119 |
inputs = processor(
|
| 120 |
text=TAGS,
|
| 121 |
+
audio=[audio],
|
| 122 |
sampling_rate=48000,
|
| 123 |
return_tensors="pt",
|
| 124 |
padding=True,
|
|
|
|
| 137 |
Returns True if speech detected for more than `threshold` seconds.
|
| 138 |
"""
|
| 139 |
import torch
|
| 140 |
+
import librosa
|
| 141 |
|
| 142 |
vad = _load_vad()
|
| 143 |
+
y, sr = librosa.load(audio_path, sr=16000, mono=True)
|
| 144 |
+
wav = torch.from_numpy(y).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
speech_timestamps = []
|
| 147 |
window_size = 512
|