jhj0517 committed
Commit f7c5695 · Parent: 0c00704

Revert "add `silence_non_speech` parameter"

This reverts commit b678293544dbce3ad7b234752336c86154dfb05a.

modules/vad/silero_vad.py CHANGED
@@ -1,6 +1,6 @@
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional, Tuple
+from typing import BinaryIO, Union, List, Optional
 import warnings
 import faster_whisper
 import gradio as gr
@@ -15,7 +15,6 @@ class SileroVAD:
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
-            silence_non_speech: bool = True,
             progress: gr.Progress = gr.Progress()):
         """
         Run VAD
@@ -26,8 +25,6 @@ class SileroVAD:
             Audio path or file binary or Audio numpy array
         vad_parameters:
             Options for VAD processing.
-        silence_non_speech: bool
-            If True, non-speech parts will be silenced instead of being removed.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
 
@@ -43,32 +40,19 @@ class SileroVAD:
             audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)
 
         duration = audio.shape[0] / sampling_rate
+        duration_after_vad = duration
 
         if vad_parameters is None:
             vad_parameters = VadOptions()
         elif isinstance(vad_parameters, dict):
             vad_parameters = VadOptions(**vad_parameters)
-
         speech_chunks = self.get_speech_timestamps(
             audio=audio,
             vad_options=vad_parameters,
             progress=progress
         )
-
-        audio, duration_diff = self.collect_chunks(
-            audio=audio,
-            chunks=speech_chunks,
-            silence_non_speech=silence_non_speech
-        )
-
-        if silence_non_speech:
-            print(
-                f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.",
-            )
-        else:
-            print(
-                f"VAD filter removed {self.format_timestamp(duration_diff)} of audio",
-            )
+        audio = self.collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate
 
         return audio
 
@@ -224,41 +208,13 @@ class SileroVAD:
     def update_model(self):
         self.model = get_vad_model()
 
-    def collect_chunks(
-            self,
-            audio: np.ndarray,
-            chunks: List[dict],
-            silence_non_speech: bool = True,
-    ) -> Tuple[np.ndarray, float]:
-        """Collects and concatenate audio chunks.
-
-        Args:
-            audio: One dimensional float array.
-            chunks: List of dictionaries containing start and end samples of speech chunks
-            silence_non_speech: If True, non-speech parts will be silenced instead of being removed.
-
-        Returns:
-            Tuple containing:
-            - Processed audio as a numpy array
-            - Duration of changed (silenced or removed) audio in seconds
-        """
+    @staticmethod
+    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
+        """Collects and concatenates audio chunks."""
         if not chunks:
-            return np.array([], dtype=np.float32), 0.0
-
-        total_samples = audio.shape[0]
-        speech_samples = sum(chunk["end"] - chunk["start"] for chunk in chunks)
-        changed_samples = total_samples - speech_samples
-        duration_difference = changed_samples / self.sampling_rate
+            return np.array([], dtype=np.float32)
 
-        if not silence_non_speech:
-            processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
-        else:
-            processed_audio = np.zeros_like(audio)
-            for chunk in chunks:
-                start, end = chunk['start'], chunk['end']
-                processed_audio[start:end] = audio[start:end]
-
-        return processed_audio, duration_difference
+        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
 
     @staticmethod
     def format_timestamp(
@@ -282,3 +238,4 @@ class SileroVAD:
         return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
+
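To make the effect of the revert concrete, here is a minimal standalone sketch of the two collect_chunks behaviors on either side of it. The helper names collect_chunks_removed and collect_chunks_silenced are hypothetical, introduced only for this comparison; in the repository both versions lived on SileroVAD. The restored version concatenates only the speech chunks, so the audio gets shorter; the reverted version kept the original length and zeroed the non-speech spans instead.

import numpy as np

def collect_chunks_removed(audio: np.ndarray, chunks: list) -> np.ndarray:
    # Restored behavior: keep only the speech samples (the audio shrinks).
    if not chunks:
        return np.array([], dtype=np.float32)
    return np.concatenate([audio[c["start"]:c["end"]] for c in chunks])

def collect_chunks_silenced(audio: np.ndarray, chunks: list) -> np.ndarray:
    # Reverted behavior: preserve length, zero everything outside the chunks.
    out = np.zeros_like(audio)
    for c in chunks:
        out[c["start"]:c["end"]] = audio[c["start"]:c["end"]]
    return out

audio = np.ones(16000, dtype=np.float32)                # 1 s at 16 kHz
chunks = [{"start": 2000, "end": 6000}, {"start": 9000, "end": 12000}]
print(collect_chunks_removed(audio, chunks).shape[0])   # 7000: non-speech dropped
print(collect_chunks_silenced(audio, chunks).shape[0])  # 16000: timestamps preserved

The length-preserving variant kept downstream timestamps aligned with the source audio, which is why the reverted run() logged that audio was "silenced" rather than "removed".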
modules/whisper/whisper_base.py CHANGED
@@ -96,7 +96,6 @@ class WhisperBase(ABC):
             audio = self.vad.run(
                 audio=audio,
                 vad_parameters=vad_options,
-                silence_non_speech=True,
                 progress=progress
             )
 
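And a hypothetical usage sketch of the call site after the revert, assuming SileroVAD can be constructed with no arguments (not shown in this diff) and that the import path matches the file header above:

import numpy as np
from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD  # path taken from this diff's file header

vad = SileroVAD()  # assumption: no-arg constructor
audio = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz

# Post-revert call shape: silence_non_speech is no longer a keyword of run().
audio = vad.run(audio=audio, vad_parameters=VadOptions())

# The pre-revert call shape would now fail:
#   vad.run(audio=audio, vad_parameters=VadOptions(), silence_non_speech=True)
#   -> TypeError: run() got an unexpected keyword argument 'silence_non_speech'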