aadnk committed
Commit 084aa80
1 Parent(s): ea7f8cc

Add max merge size in VAD


Also make this and the VAD merge window configurable.

Files changed (3)
  1. app.py +16 -9
  2. docs/options.md +45 -0
  3. vad.py +16 -7
app.py CHANGED
@@ -53,7 +53,7 @@ class UI:
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
 
-    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad):
+    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize):
         try:
             source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
 
@@ -74,19 +74,23 @@ class UI:
             if (vad == 'silero-vad'):
                 # Use Silero VAD and include gaps
                 if (self.vad_model is None):
-                    self.vad_model = VadSileroTranscription(transcribe_non_speech=True)
-                result = self.vad_model.transcribe(source, whisperCallable)
+                    self.vad_model = VadSileroTranscription()
+
+                process_gaps = VadSileroTranscription(transcribe_non_speech=True,
+                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
+                result = process_gaps.transcribe(source, whisperCallable)
             elif (vad == 'silero-vad-skip-gaps'):
                 # Use Silero VAD
                 if (self.vad_model is None):
-                    self.vad_model = VadSileroTranscription(transcribe_non_speech=True)
+                    self.vad_model = VadSileroTranscription()
 
-                skip_gaps = VadSileroTranscription(transcribe_non_speech=False, copy=self.vad_model)
+                skip_gaps = VadSileroTranscription(transcribe_non_speech=False,
+                    max_silent_period=vadMergeWindow, max_merge_size=vadMaxMergeSize, copy=self.vad_model)
                 result = skip_gaps.transcribe(source, whisperCallable)
             elif (vad == 'periodic-vad'):
                 # Very simple VAD - mark every 5 minutes as speech. This makes it less likely that Whisper enters an infinite loop, but
                 # it may create a break in the middle of a sentence, causing some artifacts.
-                periodic_vad = VadPeriodicTranscription(periodic_duration=60 * 5)
+                periodic_vad = VadPeriodicTranscription(periodic_duration=vadMaxMergeSize)
                 result = periodic_vad.transcribe(source, whisperCallable)
             else:
                 # Default VAD
@@ -178,13 +182,14 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
     ui_description += " as well as speech translation and language identification. "
 
-    ui_description += "\n\n" + "Note: You can upload more audio (and even video) types by changing to All Files (*.*) in the file selector. For longer audio files (>10 minutes), "
-    ui_description += "it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
+    ui_description += "\n\n\n\nFor longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
 
     if inputAudioMaxDuration > 0:
         ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
 
-    demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, inputs=[
+    ui_article = "Read the [documentation here](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
+
+    demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, article=ui_article, inputs=[
         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
@@ -192,6 +197,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
         gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
         gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
         gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "periodic-vad"], label="VAD"),
+        gr.Number(label="VAD - Merge Window (s)", precision=0, value=10),
+        gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=150)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),
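
To make the model-reuse pattern in the diff above concrete, here is a minimal sketch (not part of the commit) of combining a cached `VadSileroTranscription` with per-request merge settings via the `copy` parameter; `audio_path` and `whisper_callable` are hypothetical stand-ins for the values app.py passes in:

```python
from vad import VadSileroTranscription

# Load the Silero model once and cache it (mirrors self.vad_model in app.py).
base_vad = VadSileroTranscription()

# Create a lightweight instance with per-request settings; copy=base_vad
# reuses the already-loaded Silero model instead of loading it again.
skip_gaps = VadSileroTranscription(transcribe_non_speech=False,
                                   max_silent_period=10,   # "VAD - Merge Window (s)"
                                   max_merge_size=150,     # "VAD - Max Merge Size (s)"
                                   copy=base_vad)

# result = skip_gaps.transcribe(audio_path, whisper_callable)  # hypothetical call
```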
docs/options.md ADDED
@@ -0,0 +1,45 @@
+# Options
+To transcribe or translate an audio file, you can copy a URL from a website (all [websites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md) supported by YT-DLP will work, including YouTube), upload an audio file (choose "All Files (*.*)" in the file selector to select any file type, including video files), or use the microphone.
+
+For longer audio files (>10 minutes), it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option.
+
+## Model
+Select the model that Whisper will use to transcribe the audio:
+
+| Size   | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
+|--------|------------|--------------------|--------------------|---------------|----------------|
+| tiny   | 39 M       | tiny.en            | tiny               | ~1 GB         | ~32x           |
+| base   | 74 M       | base.en            | base               | ~1 GB         | ~16x           |
+| small  | 244 M      | small.en           | small              | ~2 GB         | ~6x            |
+| medium | 769 M      | medium.en          | medium             | ~5 GB         | ~2x            |
+| large  | 1550 M     | N/A                | large              | ~10 GB        | 1x             |
+
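
As an editor's illustration (not part of the committed document), selecting one of these sizes with the openai-whisper package looks like this; the file name is a placeholder:

```python
import whisper

model = whisper.load_model("medium")    # ~5 GB VRAM, ~2x relative speed
result = model.transcribe("audio.mp3")  # placeholder file name
print(result["text"])
```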
+## Language
+
+Select the language, or leave it empty for Whisper to automatically detect it.
+
+Note that if the selected language and the language in the audio differ, Whisper may start to translate the audio to the selected language. For instance, if the audio is in English but you select Japanese, the model may translate the audio to Japanese.
+
+## Inputs
+The options "URL (YouTube, etc.)", "Upload Audio" and "Microphone Input" allow you to send an audio input to the model.
+
+Note that the UI will only process the first valid input - i.e. if you enter both a URL and upload an audio file, it will only process the URL.
+
+## Task
+Select the task - either "transcribe" to transcribe the audio to text, or "translate" to translate it to English.
+
+## VAD
+* none
+  * Run Whisper on the entire audio input.
+* silero-vad
+  * Use Silero VAD to detect sections that contain speech, and run Whisper independently on each section. Whisper is also run on the gaps between the speech sections.
+* silero-vad-skip-gaps
+  * As above, but sections that don't contain speech according to Silero will be skipped. This is slightly faster, but may cause dialogue to be skipped.
+* periodic-vad
+  * Create sections of speech every "VAD - Max Merge Size" seconds. This is very fast and simple, but may potentially break a sentence or word in two.
+
+## VAD - Merge Window (s)
+If set, any adjacent speech sections that are at most this number of seconds apart will be automatically merged.
+
+## VAD - Max Merge Size (s)
+Disables merging of adjacent speech sections once the merged section is longer than this number of seconds.
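
Taken together, the two settings behave as in this small sketch (an editor's illustration of the rule described above, not code from the repository):

```python
# Join neighbouring speech sections when the gap between them is at most
# merge_window seconds, but stop growing a section once its length already
# exceeds max_merge_size seconds.
def merge(sections, merge_window=10, max_merge_size=150):
    result = []
    current = None
    for start, end in sections:
        if current is None:
            current = [start, end]
        elif start - current[1] <= merge_window and current[1] - current[0] <= max_merge_size:
            current[1] = end  # close enough and still small enough: merge
        else:
            result.append(tuple(current))
            current = [start, end]
    if current is not None:
        result.append(tuple(current))
    return result

# Sections 5 s apart are merged; the one 60 s away starts a new section.
print(merge([(0, 30), (35, 60), (120, 130)]))  # [(0, 60), (120, 130)]
```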
vad.py CHANGED
@@ -20,8 +20,12 @@ import numpy as np
 from utils import format_timestamp
 
 # Defaults for Silero
+# TODO: Make these configurable?
+
 SPEECH_TRESHOLD = 0.3
 MAX_SILENT_PERIOD = 10 # seconds
+MAX_MERGE_SIZE = 150 # Do not create segments larger than 2.5 minutes
+
 SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
 SEGMENT_PADDING_RIGHT = 3 # End detected segments late
 
@@ -29,11 +33,12 @@ SEGMENT_PADDING_RIGHT = 3 # End detected segments late
 TRANSCRIBE_NON_SPEECH = False
 
 class AbstractTranscription(ABC):
-    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, transcribe_non_speech: bool = False):
+    def __init__(self, segment_padding_left: int = None, segment_padding_right = None, max_silent_period: int = None, max_merge_size: int = None, transcribe_non_speech: bool = False):
         self.sampling_rate = 16000
         self.segment_padding_left = segment_padding_left
         self.segment_padding_right = segment_padding_right
         self.max_silent_period = max_silent_period
+        self.max_merge_size = max_merge_size
         self.transcribe_non_speech = transcribe_non_speech
 
     def get_audio_segment(self, str, start_time: str = None, duration: str = None):
@@ -76,7 +81,7 @@ class AbstractTranscription(ABC):
         seconds_timestamps = self.get_transcribe_timestamps(audio)
 
         padded = self.pad_timestamps(seconds_timestamps, self.segment_padding_left, self.segment_padding_right)
-        merged = self.merge_timestamps(padded, self.max_silent_period)
+        merged = self.merge_timestamps(padded, self.max_silent_period, self.max_merge_size)
 
         print("Timestamps:")
         pprint(merged)
@@ -188,8 +193,8 @@ class AbstractTranscription(ABC):
 
         return result
 
-    def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_distance: float):
-        if max_distance is None:
+    def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_merge_gap: float, max_merge_size: float):
+        if max_merge_gap is None:
             return timestamps
 
         result = []
@@ -202,8 +207,9 @@ class AbstractTranscription(ABC):
 
             # Get distance to the previous entry
             distance = entry['start'] - current_entry['end']
+            current_entry_size = current_entry['end'] - current_entry['start']
 
-            if distance <= max_distance:
+            if distance <= max_merge_gap and (max_merge_size is None or current_entry_size <= max_merge_size):
                 # Merge
                 current_entry['end'] = entry['end']
             else:
@@ -231,8 +237,11 @@ class AbstractTranscription(ABC):
         return result
 
 class VadSileroTranscription(AbstractTranscription):
-    def __init__(self, transcribe_non_speech: bool = False, copy = None):
-        super().__init__(SEGMENT_PADDING_LEFT, SEGMENT_PADDING_RIGHT, MAX_SILENT_PERIOD, transcribe_non_speech)
+    def __init__(self, segment_padding_left=SEGMENT_PADDING_LEFT, segment_padding_right=SEGMENT_PADDING_RIGHT,
+                 max_silent_period=MAX_SILENT_PERIOD, max_merge_size=MAX_MERGE_SIZE, transcribe_non_speech: bool = False,
+                 copy = None):
+        super().__init__(segment_padding_left=segment_padding_left, segment_padding_right=segment_padding_right,
+                         max_silent_period=max_silent_period, max_merge_size=max_merge_size, transcribe_non_speech=transcribe_non_speech)
 
         if copy:
             self.model = copy.model
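
A quick way to exercise the new merge logic in isolation is a trivial subclass (a sketch assuming get_transcribe_timestamps is the only abstract method of AbstractTranscription; adjust if the actual class defines more):

```python
from vad import AbstractTranscription

class DummyTranscription(AbstractTranscription):
    # Minimal concrete subclass; contributes no speech sections of its own.
    def get_transcribe_timestamps(self, audio: str):
        return []

vad = DummyTranscription(max_silent_period=10, max_merge_size=150)

segments = [
    {'start': 0.0,   'end': 100.0},
    {'start': 105.0, 'end': 200.0},  # 5 s gap; previous section is 100 s long
    {'start': 205.0, 'end': 215.0},  # 5 s gap; previous section is now 200 s long
]

merged = vad.merge_timestamps(segments, max_merge_gap=10, max_merge_size=150)

# The first merge is applied (gap 5 <= 10, size 100 <= 150); the second is
# rejected because the merged section has grown past 150 s.
print(merged)  # [{'start': 0.0, 'end': 200.0}, {'start': 205.0, 'end': 215.0}]
```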