jhj0517 committed · Commit 824b9ef · Parent(s): 20c2916

migrate faster-whisper to 1.0.3
modules/vad/silero_vad.py CHANGED (+13 -15)
@@ -1,4 +1,4 @@
-from faster_whisper.vad import VadOptions
+from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional
 import warnings
@@ -9,6 +9,8 @@ import gradio as gr
 class SileroVAD:
     def __init__(self):
         self.sampling_rate = 16000
+        self.window_size_samples = 512
+        self.model = None
 
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
@@ -54,8 +56,8 @@ class SileroVAD:
 
         return audio
 
-    @staticmethod
     def get_speech_timestamps(
+            self,
             audio: np.ndarray,
             vad_options: Optional[VadOptions] = None,
             progress: gr.Progress = gr.Progress(),
@@ -72,22 +74,16 @@
         Returns:
            List of dicts containing begin and end samples of each speech chunk.
         """
-
-
+
+        if self.model is None:
+            self.update_model()
 
         threshold = vad_options.threshold
         min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
         min_silence_duration_ms = vad_options.min_silence_duration_ms
-        window_size_samples = vad_options.window_size_samples
+        window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-
-        if window_size_samples not in [512, 1024, 1536]:
-            warnings.warn(
-                "Unusual window_size_samples! Supported window_size_samples:\n"
-                " - [512, 1024, 1536] for 16000 sampling_rate"
-            )
-
         sampling_rate = 16000
         min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
         speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -101,8 +97,7 @@
 
         audio_length_samples = len(audio)
 
-
-        state = model.get_initial_state(batch_size=1)
+        state, context = self.model.get_initial_states(batch_size=1)
 
         speech_probs = []
         for current_start_sample in range(0, audio_length_samples, window_size_samples):
@@ -111,7 +106,7 @@
             chunk = audio[current_start_sample: current_start_sample + window_size_samples]
             if len(chunk) < window_size_samples:
                 chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state = model(chunk, state, sampling_rate)
+            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
             speech_probs.append(speech_prob)
 
         triggered = False
@@ -207,6 +202,9 @@
 
         return speeches
 
+    def update_model(self):
+        self.model = get_vad_model()
+
     @staticmethod
     def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
         """Collects and concatenates audio chunks."""
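For context, a minimal usage sketch of the migrated class. It assumes faster-whisper 1.0.3 is installed (where get_vad_model() returns the bundled Silero ONNX model and get_initial_states() also yields a context tensor, as the diff above relies on) and that the module is importable as modules.vad.silero_vad; the audio buffer and option values are illustrative, not part of the commit, and the progress callback defaults to a gr.Progress tracker that is only meaningful inside the Gradio app:

import numpy as np
from faster_whisper.vad import VadOptions

from modules.vad.silero_vad import SileroVAD

vad = SileroVAD()

# One second of silence at the 16 kHz rate the class expects;
# substitute a real mono float32 waveform here.
audio = np.zeros(16000, dtype=np.float32)

# VadOptions in faster-whisper 1.0.3 no longer carries window_size_samples,
# so the class pins it to 512 via self.window_size_samples instead.
options = VadOptions(threshold=0.5, min_silence_duration_ms=2000, speech_pad_ms=400)

# get_speech_timestamps is now an instance method rather than a @staticmethod,
# so the ONNX model can be lazily loaded into self.model on first use.
speech_chunks = vad.get_speech_timestamps(audio, vad_options=options)

# collect_chunks is still a @staticmethod and concatenates the voiced regions.
voiced_audio = SileroVAD.collect_chunks(audio, speech_chunks)

The switch from @staticmethod to an instance method is what ties the migration together: get_initial_states(batch_size=1) returns a (state, context) pair, and the context must be threaded through every model call, so the class now owns the model and its fixed 512-sample window rather than reading a window size from VadOptions.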