aadnk committed on
Commit 7f502b4
1 Parent(s): 6cbe554

Make speech padding configurable.


Unfortunately, it seems that zero padding is not entirely
desirable either, as it increases the probability of a transcription
mistake at the boundaries of each speech section passed to Whisper.

Thus, we set the default to 1 second, but leave it up to the user
to configure it to zero or some other value, depending
on their preference:

0: Better timestamps, but more transcription mistakes.
1: Worse timestamps, but more accurate transcription
at the boundaries of each speech section.
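
To make the tradeoff concrete, here is a minimal sketch (illustrative only; the helper below is not the actual code in src/vad.py) of how a padding of 0 versus 1 second changes the boundaries of a detected speech section before it is handed to Whisper:

    # Illustrative sketch only - not the implementation in src/vad.py.
    def pad_section(start: float, end: float, padding: float, audio_duration: float):
        # Widen the detected speech section by `padding` seconds on each side,
        # clamped to the bounds of the audio file.
        return max(0.0, start - padding), min(audio_duration, end + padding)

    print(pad_section(10.0, 14.5, 0.0, 60.0))  # (10.0, 14.5) - exact boundaries, better timestamps
    print(pad_section(10.0, 14.5, 1.0, 60.0))  # (9.0, 15.5)  - extra context, fewer mistakes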

Files changed (3)
  1. app.py +7 -4
  2. docs/options.md +7 -1
  3. src/vad.py +5 -5
app.py CHANGED
@@ -52,7 +52,7 @@ class UI:
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
 
-    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize):
+    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding):
         try:
             source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
 
@@ -76,7 +76,8 @@ class UI:
                     self.vad_model = VadSileroTranscription()
 
                 process_gaps = VadSileroTranscription(transcribe_non_speech = True,
-                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
+                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
+                    segment_padding_left=vadPadding, segment_padding_right=vadPadding, copy=self.vad_model)
                 result = process_gaps.transcribe(source, whisperCallable)
             elif (vad == 'silero-vad-skip-gaps'):
                 # Use Silero VAD
@@ -84,7 +85,8 @@ class UI:
                     self.vad_model = VadSileroTranscription()
 
                 skip_gaps = VadSileroTranscription(transcribe_non_speech = False,
-                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
+                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize,
+                    segment_padding_left=vadPadding, segment_padding_right=vadPadding, copy=self.vad_model)
                 result = skip_gaps.transcribe(source, whisperCallable)
             elif (vad == 'periodic-vad'):
                 # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
@@ -197,7 +199,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
         gr.Number(label="VAD - Merge Window (s)", precision=0, value=5),
-        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150)
+        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150),
+        gr.Number(label="VAD - Padding (s)", precision=None, value=1)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
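
For reference, the new "VAD - Padding (s)" field maps to symmetric left/right padding on the VAD wrapper. Below is a hedged sketch of the equivalent programmatic call, using only the keyword arguments visible in the diff above; `source` and `whisperCallable` are assumed to be prepared as elsewhere in app.py, and the `copy` argument is omitted for brevity:

    # Sketch based on the constructor arguments shown in this commit.
    vad = VadSileroTranscription(transcribe_non_speech = True,
                                 max_silent_period = 5,     # VAD - Merge Window (s)
                                 max_merge_size = 150,      # VAD - Max Merge Size (s)
                                 segment_padding_left = 1,  # VAD - Padding (s)
                                 segment_padding_right = 1)
    result = vad.transcribe(source, whisperCallable)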
docs/options.md CHANGED
@@ -49,4 +49,10 @@ Select the task - either "transcribe" to transcribe the audio to text, or "trans
 If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
 
 ## VAD - Max Merge Size (s)
-Disables merging of adjacent speech sections if they are this number of seconds long.
+Disables merging of adjacent speech sections if they are this number of seconds long.
+
+## VAD - Padding (s)
+The number of seconds (floating point) to add to the beginning and end of each speech section. Setting this to a number
+larger than zero ensures that Whisper is more likely to correctly transcribe a sentence in the beginning of
+a speech section. However, this also increases the probability of Whisper assigning the wrong timestamp
+to each transcribed line. The default value is 1 second.
src/vad.py CHANGED
@@ -25,9 +25,9 @@ SPEECH_TRESHOLD = 0.3
 MAX_SILENT_PERIOD = 10 # seconds
 MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
 
-# Segment padding is disabled for now
-SEGMENT_PADDING_LEFT = 0 # Start detected text segment early
-SEGMENT_PADDING_RIGHT = 0 # End detected segments late
+# Default segment padding
+SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
+SEGMENT_PADDING_RIGHT = 1 # End detected segments late
 
 # Whether to attempt to transcribe non-speech
 TRANSCRIBE_NON_SPEECH = False
@@ -38,7 +38,7 @@ MIN_SEGMENT_DURATION = 1
 VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
 
 class AbstractTranscription(ABC):
-    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
+    def __init__(self, segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None, max_merge_size: float = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
         self.segment_padding_left = segment_padding_left
         self.segment_padding_right = segment_padding_right
@@ -334,7 +334,7 @@ class VadSileroTranscription(AbstractTranscription):
 
 # A very simple VAD that just marks every N seconds as speech
 class VadPeriodicTranscription(AbstractTranscription):
-    def __init__(self, periodic_duration: int):
+    def __init__(self, periodic_duration: float):
         super().__init__()
         self.periodic_duration = periodic_duration
 
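
The constructor change above only adjusts type hints and default values; how the module-level constants reach the transcriber is not part of this diff. One plausible wiring, shown purely as an assumption, is that SEGMENT_PADDING_LEFT and SEGMENT_PADDING_RIGHT act as the default padding of the Silero-based transcriber, so the change from 0 to 1 becomes the effective default unless the caller overrides it:

    # Assumed sketch only - the actual VadSileroTranscription constructor is not
    # shown in this diff. Other keyword arguments used in app.py
    # (max_silent_period, max_merge_size, copy) are omitted for brevity.
    class VadSileroTranscription(AbstractTranscription):
        def __init__(self, transcribe_non_speech: bool = False,
                     segment_padding_left: float = SEGMENT_PADDING_LEFT,
                     segment_padding_right: float = SEGMENT_PADDING_RIGHT):
            super().__init__(segment_padding_left=segment_padding_left,
                             segment_padding_right=segment_padding_right,
                             transcribe_non_speech=transcribe_non_speech)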