jhj0517 committed on
Commit
16a0393
·
1 Parent(s): 7386da0

add `restore_speech_timestamps()`

Browse files
Files changed (1) hide show
  1. modules/vad/silero_vad.py +25 -4
modules/vad/silero_vad.py CHANGED
@@ -2,9 +2,10 @@
2
 
3
  from faster_whisper.vad import VadOptions, get_vad_model
4
  import numpy as np
5
- from typing import BinaryIO, Union, List, Optional
6
  import warnings
7
  import faster_whisper
 
8
  import gradio as gr
9
 
10
 
@@ -17,7 +18,8 @@ class SileroVAD:
17
  def run(self,
18
  audio: Union[str, BinaryIO, np.ndarray],
19
  vad_parameters: VadOptions,
20
- progress: gr.Progress = gr.Progress()):
 
21
  """
22
  Run VAD
23
 
@@ -32,8 +34,10 @@ class SileroVAD:
32
 
33
  Returns
34
  ----------
35
- audio: np.ndarray
36
  Pre-processed audio with VAD
 
 
37
  """
38
 
39
  sampling_rate = self.sampling_rate
@@ -56,7 +60,7 @@ class SileroVAD:
56
  audio = self.collect_chunks(audio, speech_chunks)
57
  duration_after_vad = audio.shape[0] / sampling_rate
58
 
59
- return audio
60
 
61
  def get_speech_timestamps(
62
  self,
@@ -241,3 +245,20 @@ class SileroVAD:
241
  f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
242
  )
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  from faster_whisper.vad import VadOptions, get_vad_model
4
  import numpy as np
5
+ from typing import BinaryIO, Union, List, Optional, Tuple
6
  import warnings
7
  import faster_whisper
8
+ from faster_whisper.transcribe import SpeechTimestampsMap, Segment
9
  import gradio as gr
10
 
11
 
 
18
  def run(self,
19
  audio: Union[str, BinaryIO, np.ndarray],
20
  vad_parameters: VadOptions,
21
+ progress: gr.Progress = gr.Progress()
22
+ ) -> Tuple[np.ndarray, List[dict]]:
23
  """
24
  Run VAD
25
 
 
34
 
35
  Returns
36
  ----------
37
+ np.ndarray
38
  Pre-processed audio with VAD
39
+ List[dict]
40
+ Chunks of speeches to be used to restore the timestamps later
41
  """
42
 
43
  sampling_rate = self.sampling_rate
 
60
  audio = self.collect_chunks(audio, speech_chunks)
61
  duration_after_vad = audio.shape[0] / sampling_rate
62
 
63
+ return audio, speech_chunks
64
 
65
  def get_speech_timestamps(
66
  self,
 
245
  f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
246
  )
247
 
248
+ def restore_speech_timestamps(
249
+ self,
250
+ segments: List[dict],
251
+ speech_chunks: List[dict],
252
+ sampling_rate: Optional[int] = None,
253
+ ) -> List[dict]:
254
+ if sampling_rate is None:
255
+ sampling_rate = self.sampling_rate
256
+
257
+ ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
258
+
259
+ for segment in segments:
260
+ segment["start"] = ts_map.get_original_time(segment["start"])
261
+ segment["end"] = ts_map.get_original_time(segment["end"])
262
+
263
+ return segments
264
+