aadnk committed on
Commit 7f502b4
1 Parent(s): 6cbe554

Make speech padding configurable.


Unfortunately, it seems that zero padding is not entirely
desirable either, as it increases the probability of a transcription
mistake at the boundaries of each speech section passed to Whisper.

Thus, we set the default to 1 second, but leave it up to the user
to configure it to zero or some other value, depending
on their preference:

0: Better timestamps, but more transcription mistakes.
1: Worse timestamps, but more accurate transcription
at the boundaries of each speech section.
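
To make the tradeoff concrete, here is a minimal sketch (illustrative only; the helper below is not the actual code in src/vad.py) of how a padding of 0 versus 1 second changes the boundaries of a detected speech section before it is handed to Whisper:

    # Illustrative sketch only - not the implementation in src/vad.py.
    def pad_section(start: float, end: float, padding: float, audio_duration: float):
        # Widen the detected speech section by `padding` seconds on each side,
        # clamped to the bounds of the audio file.
        return max(0.0, start - padding), min(audio_duration, end + padding)

    print(pad_section(10.0, 14.5, 0.0, 60.0))  # (10.0, 14.5) - exact boundaries, better timestamps
    print(pad_section(10.0, 14.5, 1.0, 60.0))  # (9.0, 15.5)  - extra context, fewer mistakes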

Files changed (3)
  1. app.py +7 -4
  2. docs/options.md +7 -1
  3. src/vad.py +5 -5
app.py CHANGED
@@ -52,7 +52,7 @@ class UI:
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
 
-    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize):
+    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding):
         try:
             source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
 
@@ -76,7 +76,8 @@ class UI:
                     self.vad_model = VadSileroTranscription()
 
                 process_gaps = VadSileroTranscription(transcribe_non_speech = True,
-                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
+                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
+                    segment_padding_left=vadPadding, segment_padding_right=vadPadding, copy=self.vad_model)
                 result = process_gaps.transcribe(source, whisperCallable)
             elif (vad == 'silero-vad-skip-gaps'):
                 # Use Silero VAD
@@ -84,7 +85,8 @@ class UI:
                     self.vad_model = VadSileroTranscription()
 
                 skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
-                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
+                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
+                    segment_padding_left=vadPadding, segment_padding_right=vadPadding, copy=self.vad_model)
                 result = skip_gaps.transcribe(source, whisperCallable)
             elif (vad == 'periodic-vad'):
                 # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
@@ -197,7 +199,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
-        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150)
+        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150),
+        gr.Number(label="VAD - Padding (s)", precision=None, value=1)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
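
For reference, the new "VAD - Padding (s)" field maps to symmetric left/right padding on the VAD wrapper. Below is a hedged sketch of the equivalent programmatic call, using only the keyword arguments visible in the diff above; `source` and `whisperCallable` are assumed to be prepared as elsewhere in app.py, and the `copy` argument is omitted for brevity:

    # Sketch based on the constructor arguments shown in this commit.
    vad = VadSileroTranscription(transcribe_non_speech = True,
                                 max_silent_period = 5,     # VAD - Merge Window (s)
                                 max_merge_size = 150,      # VAD - Max Merge Size (s)
                                 segment_padding_left = 1,  # VAD - Padding (s)
                                 segment_padding_right = 1)
    result = vad.transcribe(source, whisperCallable)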
docs/options.md CHANGED
@@ -49,4 +49,10 @@ Select the task - either "transcribe" to transcribe the audio to text, or "trans
 If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
 
 ## VAD - Max Merge Size (s)
-Disables merging of adjacent speech sections if they are this number of seconds long.
+Disables merging of adjacent speech sections if they are this number of seconds long.
+
+## VAD - Padding (s)
+The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
+larger than zero ensures that Whisper is more likely to correctly transcribe a sentence in the beginning of
+a speech section. However, this also increases the probability of Whisper assigning the wrong timestamp
+to each transcribed line. The default value is 1 second.
src/vad.py CHANGED
@@ -25,9 +25,9 @@ SPEECH_TRESHOLD = 0.3
 MAX_SILENT_PERIOD = 10 # seconds
 MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
 
-# Segment padding is disabled for now
-SEGMENT_PADDING_LEFT = 0 # Start detected text segment early
-SEGMENT_PADDING_RIGHT = 0 # End detected segments late
+# Default segment padding
+SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
+SEGMENT_PADDING_RIGHT = 1 # End detected segments late
 
 # Whether to attempt to transcribe non-speech
 TRANSCRIBE_NON_SPEECH = False
@@ -38,7 +38,7 @@ MIN_SEGMENT_DURATION = 1
 VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
 
 class AbstractTranscription(ABC):
-    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
+    def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None, max_merge_size: float = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
         self.segment_padding_left = segment_padding_left
         self.segment_padding_right = segment_padding_right
@@ -334,7 +334,7 @@ class VadSileroTranscription(AbstractTranscription):
 
 # A very simple VAD that just marks every N seconds as speech
 class VadPeriodicTranscription(AbstractTranscription):
-    def __init__(self, periodic_duration: int):
+    def __init__(self, periodic_duration: float):
         super().__init__()
         self.periodic_duration = periodic_duration
 
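
The constructor change above only adjusts type hints and default values; how the module-level constants reach the transcriber is not part of this diff. One plausible wiring, shown purely as an assumption, is that SEGMENT_PADDING_LEFT and SEGMENT_PADDING_RIGHT act as the default padding of the Silero-based transcriber, so the change from 0 to 1 becomes the effective default unless the caller overrides it:

    # Assumed sketch only - the actual VadSileroTranscription constructor is not
    # shown in this diff. Other keyword arguments used in app.py
    # (max_silent_period, max_merge_size, copy) are omitted for brevity.
    class VadSileroTranscription(AbstractTranscription):
        def __init__(self, transcribe_non_speech: bool = False,
                     segment_padding_left: float = SEGMENT_PADDING_LEFT,
                     segment_padding_right: float = SEGMENT_PADDING_RIGHT):
            super().__init__(segment_padding_left=segment_padding_left,
                             segment_padding_right=segment_padding_right,
                             transcribe_non_speech=transcribe_non_speech)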