jhj0517 committed
Commit 6f9cdbb · unverified · 2 Parent(s): e1a6c10 07b6329

Merge pull request #213 from jhj0517/fix/vad

Add option to silence non-speech segments in VAD instead of cutting them off
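
In short: the old VAD path always concatenated only the detected speech chunks, which shortens the audio; the new silence_non_speech option instead keeps the original length and zeroes out everything outside the speech chunks. A minimal Python sketch of the difference, with made-up chunk boundaries for illustration:

import numpy as np

sampling_rate = 16000
audio = np.random.randn(sampling_rate).astype(np.float32)  # 1 second of fake audio
chunks = [{"start": 4000, "end": 12000}]                   # one detected speech chunk (sample indices)

# Old behavior (silence_non_speech=False): non-speech is cut out, audio shrinks to 0.5 s
removed = np.concatenate([audio[c["start"]:c["end"]] for c in chunks])
assert removed.shape[0] == 8000

# New behavior (silence_non_speech=True): length is preserved, non-speech is zeroed
silenced = np.zeros_like(audio)
for c in chunks:
    silenced[c["start"]:c["end"]] = audio[c["start"]:c["end"]]
assert silenced.shape[0] == audio.shape[0]

# Either way the reported non-speech duration is the same: 0.5 s
non_speech_duration = (audio.shape[0] - 8000) / sampling_rate
assert non_speech_duration == 0.5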

modules/vad/silero_vad.py CHANGED

@@ -1,6 +1,6 @@
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional
+from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import faster_whisper
 import gradio as gr
@@ -15,6 +15,7 @@ class SileroVAD:
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
+            silence_non_speech: bool = True,
             progress: gr.Progress = gr.Progress()):
         """
         Run VAD
@@ -25,6 +26,8 @@ class SileroVAD:
             Audio path or file binary or Audio numpy array
         vad_parameters:
             Options for VAD processing.
+        silence_non_speech: bool
+            If True, non-speech parts will be silenced instead of being removed.
         progress: gr.Progress
             Indicator to show progress directly in gradio.

@@ -40,19 +43,32 @@ class SileroVAD:
         audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

         duration = audio.shape[0] / sampling_rate
-        duration_after_vad = duration

         if vad_parameters is None:
             vad_parameters = VadOptions()
         elif isinstance(vad_parameters, dict):
             vad_parameters = VadOptions(**vad_parameters)
+
         speech_chunks = self.get_speech_timestamps(
             audio=audio,
             vad_options=vad_parameters,
             progress=progress
         )
-        audio = self.collect_chunks(audio, speech_chunks)
-        duration_after_vad = audio.shape[0] / sampling_rate
+
+        audio, duration_diff = self.collect_chunks(
+            audio=audio,
+            chunks=speech_chunks,
+            silence_non_speech=silence_non_speech
+        )
+
+        if silence_non_speech:
+            print(
+                f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.",
+            )
+        else:
+            print(
+                f"VAD filter removed {self.format_timestamp(duration_diff)} of audio.",
+            )

         return audio

@@ -208,13 +224,41 @@ class SileroVAD:
     def update_model(self):
         self.model = get_vad_model()

-    @staticmethod
-    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
-        """Collects and concatenates audio chunks."""
+    def collect_chunks(
+            self,
+            audio: np.ndarray,
+            chunks: List[dict],
+            silence_non_speech: bool = True,
+    ) -> Tuple[np.ndarray, float]:
+        """Collects and concatenates audio chunks.
+
+        Args:
+            audio: One-dimensional float array.
+            chunks: List of dictionaries containing start and end samples of speech chunks.
+            silence_non_speech: If True, non-speech parts will be silenced instead of being removed.
+
+        Returns:
+            Tuple containing:
+                - Processed audio as a numpy array
+                - Duration of non-speech (silenced or removed) audio in seconds
+        """
         if not chunks:
-            return np.array([], dtype=np.float32)
+            return np.array([], dtype=np.float32), 0.0
+
+        total_samples = audio.shape[0]
+        speech_samples_count = sum(chunk["end"] - chunk["start"] for chunk in chunks)
+        non_speech_samples_count = total_samples - speech_samples_count
+        non_speech_duration = non_speech_samples_count / self.sampling_rate

-        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
+        if not silence_non_speech:
+            processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
+        else:
+            processed_audio = np.zeros_like(audio)
+            for chunk in chunks:
+                start, end = chunk["start"], chunk["end"]
+                processed_audio[start:end] = audio[start:end]
+
+        return processed_audio, non_speech_duration

     @staticmethod
     def format_timestamp(
@@ -238,4 +282,3 @@ class SileroVAD:
         return (
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
-
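
For reference, a minimal sketch of calling the changed API; the constructor call, file name, and VadOptions values are assumptions for illustration, not taken from this diff:

from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD

vad = SileroVAD()                               # assumes a no-argument constructor
processed = vad.run(
    audio="sample.wav",                         # illustrative path
    vad_parameters=VadOptions(threshold=0.5),   # illustrative options
    silence_non_speech=True,                    # zero out non-speech instead of removing it
)
# run() logs how much audio was silenced/removed; the non-speech duration
# returned by collect_chunks() is consumed internally for that log line.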
 
modules/whisper/whisper_base.py CHANGED

@@ -96,6 +96,7 @@ class WhisperBase(ABC):
         audio = self.vad.run(
             audio=audio,
             vad_parameters=vad_options,
+            silence_non_speech=True,
             progress=progress
         )
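
Note that silence_non_speech=True is hardcoded at this call site, presumably because keeping the audio at its original length means the segment timestamps Whisper produces afterwards still line up with the source file; passing False would restore the old remove-and-concatenate behavior.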