Nekochu committed on
Commit
17d39ba
·
1 Parent(s): 6b3e616

fix: use librosa instead of torchaudio to load audio for VAD (torchcodec is not installed); replace the deprecated `audios=` processor argument with `audio=`

Browse files
Files changed (1) hide show
  1. caption_fast.py +4 -7
caption_fast.py CHANGED
@@ -118,7 +118,7 @@ def tag_audio(audio_path: str, top_n: int = 10) -> List[str]:
118
 
119
  inputs = processor(
120
  text=TAGS,
121
- audios=[audio],
122
  sampling_rate=48000,
123
  return_tensors="pt",
124
  padding=True,
@@ -137,14 +137,11 @@ def detect_speech(audio_path: str, threshold: float = 5.0) -> bool:
137
  Returns True if speech detected for more than `threshold` seconds.
138
  """
139
  import torch
140
- import torchaudio
141
 
142
  vad = _load_vad()
143
- wav, sr = torchaudio.load(audio_path)
144
- if wav.shape[0] > 1:
145
- wav = wav.mean(dim=0, keepdim=True)
146
- if sr != 16000:
147
- wav = torchaudio.functional.resample(wav, sr, 16000)
148
 
149
  speech_timestamps = []
150
  window_size = 512
 
118
 
119
  inputs = processor(
120
  text=TAGS,
121
+ audio=[audio],
122
  sampling_rate=48000,
123
  return_tensors="pt",
124
  padding=True,
 
137
  Returns True if speech detected for more than `threshold` seconds.
138
  """
139
  import torch
140
+ import librosa
141
 
142
  vad = _load_vad()
143
+ y, sr = librosa.load(audio_path, sr=16000, mono=True)
144
+ wav = torch.from_numpy(y).unsqueeze(0)
 
 
 
145
 
146
  speech_timestamps = []
147
  window_size = 512