jhj0517 committed
Commit 7386da0 · unverified · 2 parents: 6f9cdbb 174fcfd

Merge pull request #214 from jhj0517/fix/limit-vad

app.py CHANGED
@@ -73,7 +73,7 @@ class App:
                 cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                            interactive=True)
                 with gr.Accordion("Advanced Parameters", open=False):
-                    nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True,
+                    nb_beam_size = gr.Number(label="Beam Size", value=5, precision=0, interactive=True,
                                              info="Beam size to use for decoding.")
                     nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True,
                                                       info="If the average log probability over sampled tokens is below this value, treat as failed.")
@@ -137,7 +137,7 @@ class App:
                 nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)

-                with gr.Accordion("VAD", open=False):
+                with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                     cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                     sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
                                              info="Lower it to be more sensitive to small sounds.")
modules/diarize/audio_loader.py CHANGED
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
+
 import os
 import subprocess
 from functools import lru_cache
modules/diarize/diarize_pipeline.py CHANGED
@@ -1,3 +1,5 @@
+# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
+
 import numpy as np
 import pandas as pd
 import os
modules/vad/silero_vad.py CHANGED
@@ -1,6 +1,8 @@
+# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
+
 from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
-from typing import BinaryIO, Union, List, Optional, Tuple
+from typing import BinaryIO, Union, List, Optional
 import warnings
 import faster_whisper
 import gradio as gr
@@ -15,7 +17,6 @@ class SileroVAD:
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
             vad_parameters: VadOptions,
-            silence_non_speech: bool = True,
             progress: gr.Progress = gr.Progress()):
         """
         Run VAD
@@ -26,8 +27,6 @@ class SileroVAD:
                 Audio path or file binary or Audio numpy array
             vad_parameters:
                 Options for VAD processing.
-            silence_non_speech: bool
-                If True, non-speech parts will be silenced instead of being removed.
             progress: gr.Progress
                 Indicator to show progress directly in gradio.

@@ -43,32 +42,19 @@ class SileroVAD:
             audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

         duration = audio.shape[0] / sampling_rate
+        duration_after_vad = duration

         if vad_parameters is None:
             vad_parameters = VadOptions()
         elif isinstance(vad_parameters, dict):
             vad_parameters = VadOptions(**vad_parameters)
-
         speech_chunks = self.get_speech_timestamps(
             audio=audio,
             vad_options=vad_parameters,
             progress=progress
         )
-
-        audio, duration_diff = self.collect_chunks(
-            audio=audio,
-            chunks=speech_chunks,
-            silence_non_speech=silence_non_speech
-        )
-
-        if silence_non_speech:
-            print(
-                f"VAD filter silenced {self.format_timestamp(duration_diff)} of audio.",
-            )
-        else:
-            print(
-                f"VAD filter removed {self.format_timestamp(duration_diff)} of audio",
-            )
+        audio = self.collect_chunks(audio, speech_chunks)
+        duration_after_vad = audio.shape[0] / sampling_rate

         return audio

@@ -224,41 +210,13 @@ class SileroVAD:
     def update_model(self):
         self.model = get_vad_model()

-    def collect_chunks(
-            self,
-            audio: np.ndarray,
-            chunks: List[dict],
-            silence_non_speech: bool = True,
-    ) -> Tuple[np.ndarray, float]:
-        """Collects and concatenate audio chunks.
-
-        Args:
-            audio: One dimensional float array.
-            chunks: List of dictionaries containing start and end samples of speech chunks
-            silence_non_speech: If True, non-speech parts will be silenced instead of being removed.
-
-        Returns:
-            Tuple containing:
-                - Processed audio as a numpy array
-                - Duration of non-speech (silenced or removed) audio in seconds
-        """
+    @staticmethod
+    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
+        """Collects and concatenates audio chunks."""
         if not chunks:
-            return np.array([], dtype=np.float32), 0.0
-
-        total_samples = audio.shape[0]
-        speech_samples_count = sum(chunk["end"] - chunk["start"] for chunk in chunks)
-        non_speech_samples_count = total_samples - speech_samples_count
-        non_speech_duration = non_speech_samples_count / self.sampling_rate
+            return np.array([], dtype=np.float32)

-        if not silence_non_speech:
-            processed_audio = np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])
-        else:
-            processed_audio = np.zeros_like(audio)
-            for chunk in chunks:
-                start, end = chunk['start'], chunk['end']
-                processed_audio[start:end] = audio[start:end]
-
-        return processed_audio, non_speech_duration
+        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

     @staticmethod
     def format_timestamp(
@@ -282,3 +240,4 @@ class SileroVAD:
         return (
             f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
         )
+
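Note: this refactor drops the silence_non_speech mode, which zeroed out non-speech samples in place while preserving the audio's length, in favor of faster-whisper's simpler behavior of concatenating only the speech chunks, so the returned audio is shorter than the input. A toy illustration (hypothetical data, not part of the PR) of what the new collect_chunks does:

```python
# Toy example: only samples inside each speech chunk survive, concatenated
# in order; non-speech audio is removed rather than silenced in place.
import numpy as np

audio = np.arange(10, dtype=np.float32)  # stand-in for 10 audio samples
chunks = [{"start": 2, "end": 4}, {"start": 7, "end": 9}]

speech_only = np.concatenate([audio[c["start"]:c["end"]] for c in chunks])
print(speech_only)  # [2. 3. 7. 8.] -- 6 of the 10 samples were dropped
```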
modules/whisper/faster_whisper_inference.py CHANGED
@@ -71,6 +71,20 @@ class FasterWhisperInference(WhisperBase):
         if not params.hotwords:
             params.hotwords = None

+        vad_options = None
+        if params.vad_filter:
+            # Explicit value set for float('inf') from gr.Number()
+            if params.max_speech_duration_s >= 9999:
+                params.max_speech_duration_s = float('inf')
+
+            vad_options = VadOptions(
+                threshold=params.threshold,
+                min_speech_duration_ms=params.min_speech_duration_ms,
+                max_speech_duration_s=params.max_speech_duration_s,
+                min_silence_duration_ms=params.min_silence_duration_ms,
+                speech_pad_ms=params.speech_pad_ms
+            )
+
         params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)

         segments, info = self.model.transcribe(
@@ -100,7 +114,9 @@ class FasterWhisperInference(WhisperBase):
             hotwords=params.hotwords,
             language_detection_threshold=params.language_detection_threshold,
             language_detection_segments=params.language_detection_segments,
-            prompt_reset_on_temperature=params.prompt_reset_on_temperature
+            prompt_reset_on_temperature=params.prompt_reset_on_temperature,
+            vad_filter=params.vad_filter,
+            vad_parameters=vad_options
         )
         progress(0, desc="Loading audio..")
 
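Note: together with the removal in whisper_base.py below, this moves VAD out of the shared pipeline and into faster-whisper itself: the UI values are packed into a VadOptions and handed to model.transcribe() via vad_filter / vad_parameters, and the gr.Number sentinel 9999 is mapped back to float('inf') since the UI cannot express infinity. A minimal sketch of the resulting call pattern (input file and option values are hypothetical):

```python
# Sketch only: faster-whisper runs Silero VAD internally when vad_filter=True,
# so the caller no longer pre-filters the audio.
from faster_whisper import WhisperModel
from faster_whisper.vad import VadOptions

model = WhisperModel("base")
vad_options = VadOptions(
    threshold=0.5,                       # speech probability threshold
    min_speech_duration_ms=250,
    max_speech_duration_s=float("inf"),  # 9999 from the UI maps to inf
    min_silence_duration_ms=2000,
    speech_pad_ms=400,
)
segments, info = model.transcribe(
    "sample.wav",                        # hypothetical input file
    vad_filter=True,
    vad_parameters=vad_options,
)
for segment in segments:
    print(segment.start, segment.end, segment.text)
```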
modules/whisper/whisper_base.py CHANGED
@@ -85,21 +85,6 @@ class WhisperBase(ABC):
         """
         params = WhisperParameters.as_value(*whisper_params)

-        if params.vad_filter:
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
-            audio = self.vad.run(
-                audio=audio,
-                vad_parameters=vad_options,
-                silence_non_speech=True,
-                progress=progress
-            )
-
         if params.lang == "Automatic Detection":
             params.lang = None
         else: