jhj0517 committed
Commit 824b9ef · 1 Parent(s): 20c2916

migrate faster-whisper to 1.0.3

Files changed (1)
1. modules/vad/silero_vad.py +13 -15
modules/vad/silero_vad.py CHANGED
@@ -1,4 +1,4 @@
-from faster_whisper.vad import VadOptions
+from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional
 import warnings
@@ -9,6 +9,8 @@ import gradio as gr
 class SileroVAD:
     def __init__(self):
         self.sampling_rate = 16000
+        self.window_size_samples = 512
+        self.model = None
 
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
@@ -54,8 +56,8 @@ class SileroVAD:
 
         return audio
 
-    @staticmethod
     def get_speech_timestamps(
+            self,
             audio: np.ndarray,
             vad_options: Optional[VadOptions] = None,
             progress: gr.Progress = gr.Progress(),
@@ -72,22 +74,16 @@ class SileroVAD:
         Returns:
             List of dicts containing begin and end samples of each speech chunk.
         """
-        if vad_options is None:
-            vad_options = VadOptions(**kwargs)
+
+        if self.model is None:
+            self.update_model()
 
         threshold = vad_options.threshold
         min_speech_duration_ms = vad_options.min_speech_duration_ms
         max_speech_duration_s = vad_options.max_speech_duration_s
         min_silence_duration_ms = vad_options.min_silence_duration_ms
-        window_size_samples = vad_options.window_size_samples
+        window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-
-        if window_size_samples not in [512, 1024, 1536]:
-            warnings.warn(
-                "Unusual window_size_samples! Supported window_size_samples:\n"
-                " - [512, 1024, 1536] for 16000 sampling_rate"
-            )
-
         sampling_rate = 16000
         min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
         speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -101,8 +97,7 @@ class SileroVAD:
 
         audio_length_samples = len(audio)
 
-        model = faster_whisper.vad.get_vad_model()
-        state = model.get_initial_state(batch_size=1)
+        state, context = self.model.get_initial_states(batch_size=1)
 
         speech_probs = []
         for current_start_sample in range(0, audio_length_samples, window_size_samples):
@@ -111,7 +106,7 @@ class SileroVAD:
             chunk = audio[current_start_sample: current_start_sample + window_size_samples]
             if len(chunk) < window_size_samples:
                 chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state = model(chunk, state, sampling_rate)
+            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
             speech_probs.append(speech_prob)
 
         triggered = False
@@ -207,6 +202,9 @@ class SileroVAD:
 
         return speeches
 
+    def update_model(self):
+        self.model = get_vad_model()
+
     @staticmethod
     def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
         """Collects and concatenates audio chunks."""