aadnk committed
Commit f288ceb
1 Parent(s): aa22372

Add support for selecting a VAD


The VAD (Voice Activity Detector) is used to detect the time segments that contain speech (more than 250 ms), and Whisper is only run on these segments. This prevents Whisper from getting stuck in a loop, producing the same sentence over and over (which usually happens after a long continuous stretch of no speech).

A secondary benefit is that any time-synchronization issues that are sometimes present will not carry over from one detected time segment to the next.

One slight issue, however, is that these time segments may be very short, which prevents Whisper from using the previous text as a prompt for context. To mitigate this somewhat, the detected speech segments are padded by 1 second before and 4 seconds after, and then merged if they overlap. Finally, the VAD model's speech threshold is set to 30% instead of the default 50%.
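
As a rough sketch of the padding-and-merge step described above (the segment values below are invented for illustration; the actual logic is in vad.py further down):

# Hypothetical VAD output, in seconds.
segments = [{'start': 3.0, 'end': 3.6}, {'start': 5.2, 'end': 9.8}, {'start': 40.0, 'end': 41.5}]

PADDING_LEFT = 1   # start each detected segment 1 second early
PADDING_RIGHT = 4  # end each detected segment 4 seconds late

# Pad every segment, clamping the start at zero.
padded = [{'start': max(0, s['start'] - PADDING_LEFT), 'end': s['end'] + PADDING_RIGHT}
          for s in segments]

# Merge segments that overlap after padding.
merged = []
for entry in padded:
    if merged and entry['start'] <= merged[-1]['end']:
        merged[-1]['end'] = max(merged[-1]['end'], entry['end'])
    else:
        merged.append(dict(entry))

print(merged)  # [{'start': 2.0, 'end': 13.8}, {'start': 39.0, 'end': 45.5}]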

Files changed (2)
  1. app.py +14 -3
  2. vad.py +185 -0
app.py CHANGED
@@ -15,6 +15,7 @@ import gradio as gr
 from download import ExceededMaximumDuration, downloadUrl
 
 from utils import slugify, write_srt, write_vtt
+from vad import VadTranscription
 
 # Limitations (set to -1 to disable)
 DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
@@ -49,9 +50,10 @@ model_cache = dict()
 
 class UI:
     def __init__(self, inputAudioMaxDuration):
+        self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
 
-    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task):
+    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad):
         try:
             source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
 
@@ -66,7 +68,14 @@ class UI:
                model_cache[selectedModel] = model
 
            # The results
-           result = model.transcribe(source, language=selectedLanguage, task=task)
+           if (vad == 'silero-vad'):
+               # Use Silero VAD
+               if (self.vad_model is None):
+                   self.vad_model = VadTranscription()
+               result = self.vad_model.transcribe(source, lambda audio : model.transcribe(audio, language=selectedLanguage, task=task))
+           else:
+               # Default VAD
+               result = model.transcribe(source, language=selectedLanguage, task=task)
 
            text = result["text"]
 
@@ -154,7 +163,8 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
    ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
    ui_description += " as well as speech translation and language identification. "
 
-   ui_description += "\n\n" + "Note: You can upload more audio (and even video) types by changing to All Files (*.*) in the file selector."
+   ui_description += "\n\n" + "Note: You can upload more audio (and even video) types by changing to All Files (*.*) in the file selector. For longer audio files (>10 minutes), "
+   ui_description += "it is recommended that you select Silero VAD (Voice Activity Detector) in the VAD option."
 
    if inputAudioMaxDuration > 0:
        ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
@@ -166,6 +176,7 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
        gr.Audio(source="upload", type="filepath", label="Upload Audio"),
        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
+       gr.Dropdown(choices=["none", "silero-vad"], label="VAD"),
    ], outputs=[
        gr.File(label="Download"),
        gr.Text(label="Transcription"),
vad.py ADDED
@@ -0,0 +1,185 @@
+from collections import Counter
+from dis import dis
+from typing import Any, Iterator, List, Dict
+
+from pprint import pprint
+import torch
+
+import ffmpeg
+import numpy as np
+
+SPEECH_TRESHOLD = 0.3
+MAX_SILENT_PERIOD = 10 # seconds
+
+SEGMENT_PADDING_LEFT = 1 # Start detected text segment early
+SEGMENT_PADDING_RIGHT = 4 # End detected segments late
+
+def load_audio(file: str, sample_rate: int = 16000,
+               start_time: str = None, duration: str = None):
+    """
+    Open an audio file and read as mono waveform, resampling as necessary
+
+    Parameters
+    ----------
+    file: str
+        The audio file to open
+
+    sample_rate: int
+        The sample rate to resample the audio if necessary
+
+    start_time: str
+        The start time, using the standard FFMPEG time duration syntax, or None to disable.
+
+    duration: str
+        The duration, using the standard FFMPEG time duration syntax, or None to disable.
+
+    Returns
+    -------
+    A NumPy array containing the audio waveform, in float32 dtype.
+    """
+    try:
+        inputArgs = {'threads': 0}
+
+        if (start_time is not None):
+            inputArgs['ss'] = start_time
+        if (duration is not None):
+            inputArgs['t'] = duration
+
+        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        out, _ = (
+            ffmpeg.input(file, **inputArgs)
+            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
+            .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
+        )
+    except ffmpeg.Error as e:
+        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}")
+
+    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+class VadTranscription:
+    def __init__(self):
+        self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+
+        (self.get_speech_timestamps, _, _, _, _) = utils
+
+    def transcribe(self, audio: str, whisperCallable):
+        SAMPLING_RATE = 16000
+        wav = load_audio(audio, sample_rate=SAMPLING_RATE)
+
+        # Get speech timestamps from the full audio file
+        sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=SAMPLING_RATE, threshold=SPEECH_TRESHOLD)
+        seconds_timestamps = self.convert_seconds(sample_timestamps, sampling_rate=SAMPLING_RATE)
+
+        padded = self.pad_timestamps(seconds_timestamps, SEGMENT_PADDING_LEFT, SEGMENT_PADDING_RIGHT)
+        merged = self.merge_timestamps(padded, MAX_SILENT_PERIOD)
+
+        print("Timestamps:")
+        pprint(merged)
+
+        result = {
+            'text': "",
+            'segments': [],
+            'language': ""
+        }
+        languageCounter = Counter()
+
+        # For each time segment, run whisper
+        for segment in merged:
+            segment_start = segment['start']
+            segment_duration = segment['end'] - segment_start
+
+            segment_audio = load_audio(audio, sample_rate=SAMPLING_RATE, start_time = str(segment_start) + "s", duration = str(segment_duration) + "s")
+
+            print("Running whisper on " + str(segment_start) + ", duration: " + str(segment_duration))
+            segment_result = whisperCallable(segment_audio)
+            adjusted_segments = self.adjust_whisper_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
+
+            # Append to output
+            result['text'] += segment_result['text']
+            result['segments'].extend(adjusted_segments)
+
+            # Increment detected language
+            languageCounter[segment_result['language']] += 1
+
+        if len(languageCounter) > 0:
+            result['language'] = languageCounter.most_common(1)[0][0]
+
+        return result
+
+    def adjust_whisper_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
+        result = []
+
+        for segment in segments:
+            segment_start = float(segment['start'])
+            segment_end = float(segment['end'])
+
+            # Filter segments?
+            if (max_source_time is not None):
+                if (segment_start > max_source_time):
+                    continue
+                segment_end = min(max_source_time, segment_end)
+
+            new_segment = segment.copy()
+
+            # Add to start and end
+            new_segment['start'] = segment_start + adjust_seconds
+            new_segment['end'] = segment_end + adjust_seconds
+            result.append(new_segment)
+        return result
+
+    def pad_timestamps(self, timestamps: List[Dict[str, Any]], padding_left: float, padding_right: float):
+        result = []
+
+        for entry in timestamps:
+            segment_start = entry['start']
+            segment_end = entry['end']
+
+            if padding_left is not None:
+                segment_start = max(0, segment_start - padding_left)
+            if padding_right is not None:
+                segment_end = segment_end + padding_right
+
+            result.append({ 'start': segment_start, 'end': segment_end })
+
+        return result
+
+    def merge_timestamps(self, timestamps: List[Dict[str, Any]], max_distance: float):
+        result = []
+        current_entry = None
+
+        for entry in timestamps:
+            if current_entry is None:
+                current_entry = entry
+                continue
+
+            # Get distance to the previous entry
+            distance = entry['start'] - current_entry['end']
+
+            if distance <= max_distance:
+                # Merge
+                current_entry['end'] = entry['end']
+            else:
+                # Output current entry
+                result.append(current_entry)
+                current_entry = entry
+
+        # Add final entry
+        if current_entry is not None:
+            result.append(current_entry)
+
+        return result
+
+    def convert_seconds(self, timestamps: List[Dict[str, Any]], sampling_rate: int):
+        result = []
+
+        for entry in timestamps:
+            start = entry['start']
+            end = entry['end']
+
+            result.append({
+                'start': start / sampling_rate,
+                'end': end / sampling_rate
+            })
+        return result
+
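
For reference, a minimal usage sketch of the new VadTranscription class (the file name and Whisper model size here are just examples, not part of the commit):

import whisper
from vad import VadTranscription

# Load a Whisper model and the Silero VAD wrapper added in this commit.
model = whisper.load_model("medium")   # example model size
vad = VadTranscription()

# The VAD splits the file into speech segments, runs the callable on each one,
# and stitches the per-segment results back together with adjusted timestamps.
result = vad.transcribe("example.mp3",
                        lambda audio: model.transcribe(audio, language=None, task="transcribe"))

print(result["language"])
print(result["text"])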